In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load dataset 
train = pd.read_csv('remove_duplicates_transform_label_address_train.csv')
test = pd.read_csv('remove_duplicates_transform_label_address_test.csv')
sample = pd.read_csv('input/sampleSubmission.csv')
train['Zipcode']=train['Zipcode'].astype(str)
test['Zipcode']=test['Zipcode'].astype(str)

weather = pd.read_csv('input/weather.csv')
# spray = pd.read_csv('input/spray.csv')
# weather station coordinates
station1 = [-87.933 , 41.995]
station2 = [-87.752 , 41.786]

In [3]:
def generate_date_features(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].apply(lambda x: x.year)
    df['Month'] = df['Date'].apply(lambda x: x.month)
    df['DayOfMonth'] = df['Date'].apply(lambda x: x.day)
    # df['DayOfWeekName'] = df['Date'].apply(lambda x: x.day_name())
    df['DayOfWeek'] = df['Date'].apply(lambda x: x.dayofweek)
    df['DayOfYear'] = df['Date'].apply(lambda x: x.dayofyear)
#     df['WeekOfYear'] = df['Date'].apply(lambda x: x.weekofyear)
#     df['IsLeapYear'] = df['Date'].apply(lambda x: x.is_leap_year)
#     df['IsLeapYear'] = df['IsLeapYear'].astype(str)
#     df['Quarter'] = df['Date'].apply(lambda x: x.quarter)

def transform_df(df):
    df_ = df.copy()
    df_['Lat_int'] = df_.Latitude.astype(int)
    df_['Long_int'] = df_.Longitude.astype(int)
    return df_

In [4]:
train2 = transform_df(train)
test2 = transform_df(test)

In [5]:
train2.dtypes 

Date                object
Species             object
Block                int64
Street              object
Trap                object
Latitude           float64
Longitude          float64
AddressAccuracy      int64
WnvPresent           int64
City                object
State               object
Zipcode             object
Lat_int              int64
Long_int             int64
dtype: object

# featurize weather, remove missing values

In [6]:
# Not using codesum for this benchmark
weather = weather.drop('CodeSum', axis=1)

# Split station 1 and 2 and join horizontally
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')

# replace some missing values and T with -1
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)

In [7]:
# Merge with weather data
train3 = train2.merge(weather, on='Date')
test3 = test2.merge(weather, on='Date')

In [8]:
train3.dtypes

Date                object
Species             object
Block                int64
Street              object
Trap                object
Latitude           float64
Longitude          float64
AddressAccuracy      int64
WnvPresent           int64
City                object
State               object
Zipcode             object
Lat_int              int64
Long_int             int64
Tmax_x               int64
Tmin_x               int64
Tavg_x              object
Depart_x            object
DewPoint_x           int64
WetBulb_x           object
Heat_x              object
Cool_x              object
Sunrise_x           object
Sunset_x            object
Depth_x             object
Water1_x             int64
SnowFall_x          object
PrecipTotal_x       object
StnPressure_x       object
SeaLevel_x          object
ResultSpeed_x      float64
ResultDir_x          int64
AvgSpeed_x          object
Tmax_y               int64
Tmin_y               int64
Tavg_y              object
Depart_y             int64
D

In [9]:
# drop address columns
train3 = train3.drop(['Date','WnvPresent'],  axis = 1)
test3 = test3.drop(['Date','Id'],axis = 1)

In [10]:
train3.shape

(8475, 50)

In [11]:
test3.shape

(116293, 50)

# label encoder

In [12]:
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [105]:
def get_label_encoder(train, test, cols):
    lbl = preprocessing.LabelEncoder()
    for col in cols:
        lbl.fit(list(train[col].values) + list(test[col].values))
        train[col] = lbl.transform(train[col].values)
        test[col] = lbl.transform(test[col].values)
def convert_string_to_num(df, str_columns):
    for col in str_columns:
        df[col] = df[col].astype(float)
def get_metrics(y_true, y_pred, y_pred_prob):
    print("Precision: %1.3f" % precision_score(y_true, y_pred))
    print("Recall: %1.3f" % recall_score(y_true, y_pred))
    print("F1: %1.3f" % f1_score(y_true, y_pred))
    print("AUC: %1.3f" % roc_auc_score(y_true, y_pred))
    print("AUC with probability: %1.3f" % roc_auc_score(y_true, y_pred_prob))

In [14]:
all_obj_cols = train3.select_dtypes(include=['object']).columns
all_obj_cols

Index(['Species', 'Street', 'Trap', 'City', 'State', 'Zipcode', 'Tavg_x',
       'Depart_x', 'WetBulb_x', 'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x',
       'Depth_x', 'SnowFall_x', 'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x',
       'AvgSpeed_x', 'Tavg_y', 'WetBulb_y', 'Heat_y', 'Cool_y',
       'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y', 'AvgSpeed_y'],
      dtype='object')

In [15]:
str_columns = [u'Tavg_x', u'Depart_x', u'WetBulb_x',
       u'Heat_x', u'Cool_x', u'Sunrise_x', u'Sunset_x', u'Depth_x',
       u'SnowFall_x', u'PrecipTotal_x', u'StnPressure_x', u'SeaLevel_x',
       u'AvgSpeed_x', u'Tavg_y', u'WetBulb_y', u'Heat_y', u'Cool_y',
       u'PrecipTotal_y', u'StnPressure_y', u'SeaLevel_y', u'AvgSpeed_y']
convert_string_to_num(train3,str_columns )
convert_string_to_num(test3,str_columns )

In [16]:
obj_cols = [u'Species', u'Street', u'Trap','State','City','Zipcode']
get_label_encoder(train3, test3, obj_cols)

In [17]:
# remove missing data
train4 = train3.ix[:,(train3 != -1).any(axis=0)]
test4 = test3.ix[:,(test3 != -1).any(axis=0)]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
train4.shape

(8475, 43)

In [32]:
test4.shape

(116293, 43)

In [20]:
X = train4
y = train.WnvPresent.values
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3,
                                                    random_state=42) # need to convert to np array

In [113]:
gbt = ensemble.GradientBoostingClassifier()

learning_rate = [0.08,0.12]
max_depth = [6, 8]
n_estimators = [150, 200]
#min_samples_split = [2,10,50]

tuned_parameters = [{'max_depth': max_depth,
                     'n_estimators':n_estimators,
                     'learning_rate':learning_rate ,
                     #'min_samples_split':min_samples_split
                    }]
n_folds = 3

In [114]:
# cv with aucroc
clf = GridSearchCV(gbt, 
                   tuned_parameters, 
                   cv=n_folds, 
                   refit=True, 
                   scoring='recall') # what if i set as recall

In [115]:
%%time
clf.fit(X_train, y_train)

CPU times: user 1min 34s, sys: 260 ms, total: 1min 34s
Wall time: 1min 35s


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'max_depth': [6, 8], 'n_estimators': [150, 200], 'learning_rate': [0.08, 0.12]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [116]:
clf.best_params_

{'learning_rate': 0.08, 'max_depth': 6, 'n_estimators': 200}

In [117]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
y_train_pred_prob = clf.predict_proba(X_train)
y_test_pred_prob = clf.predict_proba(X_test)

In [118]:
def count_label(y):
    unique, counts = np.unique(y, return_counts=True)
    print (unique, counts)

In [119]:
count_label(y_train)

[0 1] [5619  313]


In [120]:
count_label(y_train_pred)

[0 1] [5666  266]


In [121]:
count_label(y_test)

[0 1] [2399  144]


In [122]:
count_label(y_test_pred)

[0 1] [2479   64]


In [123]:
unique, counts = np.unique(y_test, return_counts=True)
unique, counts

(array([0, 1]), array([2399,  144]))

In [124]:
get_metrics(y_train, y_train_pred, y_train_pred_prob)

Precision: 0.996
Recall: 0.847
F1: 0.915
AUC: 0.923


ValueError: bad input shape (5932, 2)

In [None]:
get_metrics(y_test, y_test_pred, y_test_pred_prob)

In [34]:
def create_prediction(clf, test, sample):
    predictions = clf.predict(test)
    sample['WnvPresent'] = predictions 
    # wait why is it not binary prediction, all zeros????
    # why it is still all 0
    print (sample.WnvPresent.value_counts())

In [35]:
create_prediction(clf, test4, sample)

0    115936
1       357
Name: WnvPresent, dtype: int64


In [42]:
sample.to_csv('exp4_2.csv', index=False)
# change naming system

In [None]:
# use f1

# use different scoring metrics
F-score is ill-defined and being set to 0.0 due to no predicted samples.
recall = TP/(TP+FN), in case if predictor doesn't predict positive class - TP is 0 - recall is 0.
now you are dividing 0/0.
TP is 0 ==> set precision as 0
precision = TP / (TP + FP) # no positive at all
recall = TP / (TP + FN) # there is always false negative
==> no positive precision at all???

In [85]:
# The scorers can be either be one of the predefined metric strings or a scorer
# callable, like the one returned by make_scorer
scoring = {'AUC': 'roc_auc', 
           'f1': 'f1',
           'recall':'recall',
           'precision':'precision'}

In [86]:
clf2 = GridSearchCV(gbt, 
                   tuned_parameters, 
                   cv=n_folds, 
                   refit=False, 
                   scoring=scoring)  
# what if i set as recall
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
clf2.fit(X_train, y_train)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'max_depth': [3, 6, 8], 'n_estimators': [100, 150, 200], 'learning_rate': [0.03, 0.08, 0.12], 'min_samples_split': [2, 10, 50]}],
       pre_dispatch='2*n_jobs', refit=False, return_train_score='warn',
       scoring={'AUC': 'roc_auc', 'f1': 'f1', 'recall': 'recall', 'precision': 'precision'},
       verbose=0)

In [92]:
results = clf2.cv_results_
results_df = pd.DataFrame.from_dict(results )
results_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_learning_rate', 'param_max_depth', 'param_min_samples_split',
       'param_n_estimators', 'params', 'split0_test_AUC', 'split1_test_AUC',
       'split2_test_AUC', 'mean_test_AUC', 'std_test_AUC', 'rank_test_AUC',
       'split0_train_AUC', 'split1_train_AUC', 'split2_train_AUC',
       'mean_train_AUC', 'std_train_AUC', 'split0_test_f1', 'split1_test_f1',
       'split2_test_f1', 'mean_test_f1', 'std_test_f1', 'rank_test_f1',
       'split0_train_f1', 'split1_train_f1', 'split2_train_f1',
       'mean_train_f1', 'std_train_f1', 'split0_test_recall',
       'split1_test_recall', 'split2_test_recall', 'mean_test_recall',
       'std_test_recall', 'rank_test_recall', 'split0_train_recall',
       'split1_train_recall', 'split2_train_recall', 'mean_train_recall',
       'std_train_recall', 'split0_test_precision', 'split1_test_precision',
       'split2_test_precision', 'mean_test_precision', 'std_

In [100]:
results_df.sort_values('mean_test_f1',ascending=False)[['params','mean_test_f1']].head(3)

Unnamed: 0,params,mean_test_f1
71,"{'learning_rate': 0.12, 'max_depth': 6, 'min_s...",0.217917
70,"{'learning_rate': 0.12, 'max_depth': 6, 'min_s...",0.213217
67,"{'learning_rate': 0.12, 'max_depth': 6, 'min_s...",0.211681


In [101]:
results_df.sort_values('mean_test_AUC',ascending=False)[['params','mean_test_AUC']].head(3)
# valid score
# why is valid AUC so different from test AUC

Unnamed: 0,params,mean_test_AUC
60,"{'learning_rate': 0.12, 'max_depth': 3, 'min_s...",0.843076
31,"{'learning_rate': 0.08, 'max_depth': 3, 'min_s...",0.842779
32,"{'learning_rate': 0.08, 'max_depth': 3, 'min_s...",0.842671


In [103]:
results_df.sort_values('mean_test_precision',ascending=False)[['params','mean_test_precision']].head(3)

Unnamed: 0,params,mean_test_precision
1,"{'learning_rate': 0.03, 'max_depth': 3, 'min_s...",0.688885
8,"{'learning_rate': 0.03, 'max_depth': 3, 'min_s...",0.684514
4,"{'learning_rate': 0.03, 'max_depth': 3, 'min_s...",0.657128


In [104]:
results_df.sort_values('mean_test_recall',ascending=False)[['params','mean_test_recall']].head(3)

Unnamed: 0,params,mean_test_recall
71,"{'learning_rate': 0.12, 'max_depth': 6, 'min_s...",0.169323
68,"{'learning_rate': 0.12, 'max_depth': 6, 'min_s...",0.166205
67,"{'learning_rate': 0.12, 'max_depth': 6, 'min_s...",0.163


In [None]:
clf2.best_params_

In [None]:
# use the best params to refit the model
gbt2 = 

In [None]:
y_train_pred = clf2.predict(X_train)
y_test_pred = clf2.predict(X_test)

In [None]:
get_metrics(y_train, y_train_pred)

In [None]:
get_metrics(y_test, y_test_pred)

using recall

using precision
- train
- Precision: 1.000
- Recall: 0.006
- F1: 0.013
- AUC: 0.503
- test
- Precision: 0.000
- Recall: 0.000
- F1: 0.000
- AUC: 0.500
using f1
- train
- Precision: 1.000
- Recall: 0.997
- F1: 0.998
- AUC: 0.998
- test
- Precision: 0.338
- Recall: 0.153
- F1: 0.211
- AUC: 0.567
using balanced_accuracy
- train
- Precision: 1.000
- Recall: 0.997
- F1: 0.998
- AUC: 0.998
- test
- Precision: 0.323
- Recall: 0.146
- F1: 0.201
- AUC: 0.564
using average_precision
- train
- Precision: 0.769
- Recall: 0.064
- F1: 0.118
- AUC: 0.531
- test
- Precision: 0.250
- Recall: 0.014
- F1: 0.026
- AUC: 0.506

In [None]:
# with refit=False
def get_top_features(clf,X):
    important_features = pd.Series(data=clf.feature_importances_,index=X.columns)
    important_features.sort_values(ascending=False,inplace=True)
    return important_features

In [None]:
get_top_features(gbt2,X)

In [None]:
create_prediction(clf2, test4, sample)

In [None]:
sample.to_csv('exp4_1.csv', index=False)