In [3]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [33]:
# Load dataset 
train = pd.read_csv('remove_duplicates_transform_label_address_train.csv')
test = pd.read_csv('remove_duplicates_transform_label_address_test.csv')
sample = pd.read_csv('input/sampleSubmission.csv')
train['Zipcode']=train['Zipcode'].astype(str)
test['Zipcode']=test['Zipcode'].astype(str)

In [26]:
train.dtypes

Date                object
Species             object
Block                int64
Street              object
Trap                object
Latitude           float64
Longitude          float64
AddressAccuracy      int64
WnvPresent           int64
City                object
State               object
Zipcode             object
dtype: object

In [28]:
def generate_date_features(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].apply(lambda x: x.year)
    df['Month'] = df['Date'].apply(lambda x: x.month)
    df['DayOfMonth'] = df['Date'].apply(lambda x: x.day)
    # df['DayOfWeekName'] = df['Date'].apply(lambda x: x.day_name())
    df['DayOfWeek'] = df['Date'].apply(lambda x: x.dayofweek)
    df['DayOfYear'] = df['Date'].apply(lambda x: x.dayofyear)
    # df['WeekOfYear'] = df['Date'].apply(lambda x: x.weekofyear)
    #df['IsLeapYear'] = df['Date'].apply(lambda x: x.is_leap_year)
    #df['IsLeapYear'] = df['IsLeapYear'].astype(str)
    # df['Quarter'] = df['Date'].apply(lambda x: x.quarter)

def transform_df(df):
    df_ = df.copy()
    df_['Lat_int'] = df_.Latitude.astype(int)
    df_['Long_int'] = df_.Longitude.astype(int)
    generate_date_features(df_)
    return df_

In [29]:
train2 = transform_df(train)
test2 = transform_df(test)

In [30]:
# drop address columns
train2 = train2.drop(['Date'],  axis = 1)
test2 = test2.drop(['Date'],  axis = 1)

In [32]:
test2.dtypes 

Id                   int64
Species             object
Block                int64
Street              object
Trap                object
Latitude           float64
Longitude          float64
AddressAccuracy      int64
City                object
State               object
Zipcode             object
Lat_int              int64
Long_int             int64
Year                 int64
Month                int64
DayOfMonth           int64
DayOfWeek            int64
DayOfYear            int64
dtype: object

# label encoder

In [34]:
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [35]:
def get_label_encoder(train, test, cols):
    lbl = preprocessing.LabelEncoder()
    for col in cols:
        lbl.fit(list(train[col].values) + list(test[col].values))
        train[col] = lbl.transform(train[col].values)
        test[col] = lbl.transform(test[col].values)
def convert_string_to_num(df, str_columns):
    for col in str_columns:
        df[col] = df[col].astype(float)
def get_metrics(y_true, y_pred):
    print("Precision: %1.3f" % precision_score(y_true, y_pred))
    print("Recall: %1.3f" % recall_score(y_true, y_pred))
    print("F1: %1.3f" % f1_score(y_true, y_pred))
    print("AUC: %1.3f" % roc_auc_score(y_true, y_pred))

In [38]:
obj_cols = train2.select_dtypes(include=['object']).columns
get_label_encoder(train2,test2,obj_cols)

In [57]:
X = train2.drop(['WnvPresent'],axis=1)
y = train2.WnvPresent.values

In [62]:
train2.WnvPresent.value_counts()

0    8018
1     457
Name: WnvPresent, dtype: int64

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3,
                                                    random_state=42)

In [60]:
gbt = ensemble.GradientBoostingClassifier()

learning_rate = [0.02, 0.04, 0.05, 0.08]
max_depth = [2, 3, 4, 6]
n_estimators = [50, 100, 150]

tuned_parameters = [{'max_depth': max_depth,
                     'n_estimators':n_estimators,
                     'learning_rate':learning_rate 
                    }]
n_folds = 3

# cv with aucroc
clf = GridSearchCV(gbt, 
                   tuned_parameters, 
                   cv=n_folds, 
                   refit=True, 
                   scoring='roc_auc')

In [61]:
%%time
clf.fit(X_train, y_train)

CPU times: user 56.2 s, sys: 208 ms, total: 56.4 s
Wall time: 56.7 s


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'max_depth': [2, 3, 4, 6], 'n_estimators': [50, 100, 150], 'learning_rate': [0.02, 0.04, 0.05, 0.08]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [63]:
clf.best_params_

{'learning_rate': 0.08, 'max_depth': 3, 'n_estimators': 150}

In [64]:
scores = clf.cv_results_['mean_test_score']
# cross validation on the training data
scores

array([0.80926137, 0.82097079, 0.82740496, 0.82068647, 0.82867489,
       0.8333709 , 0.82791587, 0.83482678, 0.83611128, 0.8184082 ,
       0.82875862, 0.8342846 , 0.82216616, 0.83230667, 0.83738335,
       0.82841956, 0.83545523, 0.8384746 , 0.8354971 , 0.83865499,
       0.84246441, 0.83087376, 0.83828014, 0.83990986, 0.82441309,
       0.83637056, 0.83944123, 0.8298054 , 0.83616487, 0.84006818,
       0.83427795, 0.84002938, 0.84104861, 0.83419144, 0.83920108,
       0.83939485, 0.83145634, 0.8390331 , 0.84385666, 0.83509879,
       0.83979866, 0.8439994 , 0.8375881 , 0.84050566, 0.83936624,
       0.83334312, 0.83570961, 0.83495715])

In [65]:
def get_metrics(y_true, y_pred):
    print("Precision: %1.3f" % precision_score(y_true, y_pred))
    print("Recall: %1.3f" % recall_score(y_true, y_pred))
    print("F1: %1.3f" % f1_score(y_true, y_pred))
    print("AUC: %1.3f" % roc_auc_score(y_true, y_pred))

In [66]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [67]:
get_metrics(y_train, y_train_pred)
# at most is only 0.618 precision TP/ TP + FP
# recall TP / TP + FN
# predict very few positives in generate
# why very low recall???

Precision: 0.800
Recall: 0.102
F1: 0.181
AUC: 0.550


In [68]:
get_metrics(y_test, y_test_pred)
# very low recall on test dataset
# 0.5002 auc

Precision: 0.190
Recall: 0.028
F1: 0.048
AUC: 0.510


In [73]:
# create predictions and submission file
# not a good training by itself
test3 = test2.drop(['Id'],axis=1)
predictions = clf.predict(test3)
predictions 

array([0, 0, 0, ..., 0, 0, 0])

In [75]:
sample['WnvPresent'] = predictions
# wait why is it not binary prediction, all zeros!
sample.WnvPresent.value_counts()

0    116079
1       214
Name: WnvPresent, dtype: int64

In [76]:
sample.to_csv('exp2.csv', index=False)

In [79]:
clf2 = GridSearchCV(gbt, 
                   tuned_parameters, 
                   cv=n_folds, 
                   refit=True, 
                   scoring='recall')
clf2.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'max_depth': [2, 3, 4, 6], 'n_estimators': [50, 100, 150], 'learning_rate': [0.02, 0.04, 0.05, 0.08]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [80]:
clf2.best_params_

{'learning_rate': 0.08, 'max_depth': 6, 'n_estimators': 150}

In [81]:
y_train_pred = clf2.predict(X_train)
y_test_pred = clf2.predict(X_test)

In [85]:
get_metrics(y_train, y_train_pred)

Precision: 0.985
Recall: 0.610
F1: 0.753
AUC: 0.805


In [86]:
get_metrics(y_test, y_test_pred)

Precision: 0.264
Recall: 0.097
F1: 0.142
AUC: 0.540


In [87]:
predictions2 = clf2.predict(test3)
predictions2

array([0, 0, 0, ..., 0, 0, 0])

In [91]:
clf2.best_estimator_.feature_importances_

array([0.10766408, 0.03962548, 0.06042617, 0.09463617, 0.07395133,
       0.09794303, 0.01023595, 0.00339452, 0.        , 0.04098795,
       0.00122074, 0.        , 0.09506588, 0.01033639, 0.08959246,
       0.03911752, 0.23580233])

In [94]:
def get_top_features(clf,X):
    important_features = pd.Series(data=clf.best_estimator_.feature_importances_,index=X.columns)
    important_features.sort_values(ascending=False,inplace=True)
    return important_features

In [95]:
get_top_features(clf2,X)

DayOfYear          0.235802
Species            0.107664
Longitude          0.097943
Year               0.095066
Trap               0.094636
DayOfMonth         0.089592
Latitude           0.073951
Street             0.060426
Zipcode            0.040988
Block              0.039625
DayOfWeek          0.039118
Month              0.010336
AddressAccuracy    0.010236
City               0.003395
Lat_int            0.001221
Long_int           0.000000
State              0.000000
dtype: float64

In [96]:
get_top_features(clf,X)

DayOfYear          0.357529
Longitude          0.128575
Year               0.121866
Trap               0.103310
Species            0.079814
Latitude           0.075773
Street             0.043263
DayOfMonth         0.028922
Block              0.020200
Zipcode            0.015718
Month              0.008483
DayOfWeek          0.007994
City               0.006987
AddressAccuracy    0.000880
Lat_int            0.000687
Long_int           0.000000
State              0.000000
dtype: float64

In [88]:
sample['WnvPresent'] = predictions2
# wait why is it not binary prediction, all zeros!
sample.WnvPresent.value_counts()

0    115652
1       641
Name: WnvPresent, dtype: int64

In [89]:
sample.to_csv('exp2_1.csv', index=False)