In [119]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [120]:
# Read the three input CSVs: training rows, test rows and the sample-submission frame.
train, test, sample = map(
    pd.read_csv,
    ('input/train.csv', 'input/test.csv', 'input/sampleSubmission.csv'),
)

# Auxiliary tables, currently not loaded:
# weather = pd.read_csv('input/weather.csv')
# spray = pd.read_csv('input/spray.csv')

In [121]:
# Inspect the column dtypes of the raw training frame.
train.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [122]:
# Inspect the column dtypes of the raw test frame (compare with the training frame).
test.dtypes

Id                          int64
Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
dtype: object

In [123]:
def generate_date_features(df):
    """Add calendar features derived from the ``Date`` column, in place.

    ``df['Date']`` is parsed with ``pd.to_datetime`` and overwritten with the
    parsed values. The columns ``Year``, ``Month``, ``DayOfMonth``,
    ``DayOfWeek`` (Monday=0 ... Sunday=6) and ``DayOfYear`` are then added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column that ``pd.to_datetime`` can parse.
        The frame is modified in place; nothing is returned.
    """
    df['Date'] = pd.to_datetime(df['Date'])

    # Vectorised ``.dt`` accessors instead of a Python-level lambda per row.
    dates = df['Date'].dt
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['DayOfMonth'] = dates.day
    df['DayOfWeek'] = dates.dayofweek
    df['DayOfYear'] = dates.dayofyear
    # Candidates considered but not generated: day name, week of year,
    # leap-year flag, quarter.

def transform_df(df):
    """Return a feature-augmented copy of ``df``; the input frame is not modified.

    The copy gains ``Lat_int`` / ``Long_int`` (``Latitude`` / ``Longitude``
    cast with ``astype(int)``) followed by the calendar columns added by
    ``generate_date_features``.
    """
    augmented = df.copy()
    for source_col, int_col in (('Latitude', 'Lat_int'), ('Longitude', 'Long_int')):
        augmented[int_col] = augmented[source_col].astype(int)
    generate_date_features(augmented)
    return augmented

In [124]:
# Build the feature-augmented frames from the raw train/test frames.
train2, test2 = (transform_df(frame) for frame in (train, test))

In [125]:
# Drop everything that is not fed to the model: the raw date, the free-text
# address columns, the target (WnvPresent) and NumMosquitos from train, and
# the Id column from test.
train2 = train2.drop(
    columns=['Date', 'Address', 'AddressNumberAndStreet', 'WnvPresent', 'NumMosquitos']
)
test2 = test2.drop(columns=['Date', 'Id', 'Address', 'AddressNumberAndStreet'])

In [126]:
# Check which columns remain after the drop and which are still strings (object).
train2.dtypes 

Species             object
Block                int64
Street              object
Trap                object
Latitude           float64
Longitude          float64
AddressAccuracy      int64
Lat_int              int64
Long_int             int64
Year                 int64
Month                int64
DayOfMonth           int64
DayOfWeek            int64
DayOfYear            int64
dtype: object

# label encoder

In [127]:
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [128]:
# Convert categorical (string) columns to integer codes.
# Each encoder is fitted on the union of the train and test values so that a
# category present in only one of the two frames still receives a code.
# The fit always reads from train2/test2 (the frames being transformed), so
# the three columns are handled identically.
for col in ['Species', 'Street', 'Trap']:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train2[col].values) + list(test2[col].values))
    train2[col] = lbl.transform(train2[col].values)
    test2[col] = lbl.transform(test2[col].values)

In [129]:
# Target vector: the WnvPresent column of the raw training frame.
labels = train['WnvPresent'].values

In [130]:
# Confirm that every remaining feature column is numeric after label encoding.
train2.dtypes

Species              int64
Block                int64
Street               int64
Trap                 int64
Latitude           float64
Longitude          float64
AddressAccuracy      int64
Lat_int              int64
Long_int             int64
Year                 int64
Month                int64
DayOfMonth           int64
DayOfWeek            int64
DayOfYear            int64
dtype: object

In [131]:
# Hold out 30% of the labelled rows for evaluation; the seed fixes the split.
X, y = train2, labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [132]:
# Gradient-boosted trees. The seed is fixed so that repeated runs of the
# notebook build the same models.
gbt = ensemble.GradientBoostingClassifier(random_state=42)

# Hyper-parameter grid: 4 x 4 x 3 = 48 combinations.
learning_rate = [0.02, 0.04, 0.05, 0.08]
max_depth = [2, 3, 4, 6]
n_estimators = [50, 100, 150]

tuned_parameters = [{'max_depth': max_depth,
                     'n_estimators': n_estimators,
                     'learning_rate': learning_rate}]
n_folds = 3

# Two searches over the same grid that differ only in the selection metric:
# clf ranks candidates by ROC AUC, clf2 by recall.
# With refit=True the best candidate is retrained on all the data passed to
# .fit(), so the fitted search object can be used directly for
# predict / predict_proba. With refit=False the chosen parameters would have
# to be fitted again by hand.
clf = GridSearchCV(gbt,
                   tuned_parameters,
                   cv=n_folds,
                   refit=True,
                   scoring='roc_auc')
clf2 = GridSearchCV(gbt,
                    tuned_parameters,
                    cv=n_folds,
                    refit=True,
                    scoring='recall')

In [146]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): this rebinds `clf`, replacing the ROC-AUC GridSearchCV object
# created earlier in the notebook. The cells below that use `clf` therefore
# evaluate this default logistic regression, not the grid search.
clf = LogisticRegression()

In [153]:
%%time
# Fit whatever estimator is currently bound to `clf` on the training split.
clf.fit(X_train, y_train)

CPU times: user 74.2 ms, sys: 2.72 ms, total: 76.9 ms
Wall time: 75.6 ms




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [160]:
def get_metrics(y_true, y_pred, y_pred_proba=None):
    """Print precision, recall, F1 and ROC AUC for binary predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard class labels.
    y_pred_proba : array-like, optional
        Predicted scores for the positive class. When given, an additional
        "AUC proba" line is printed. When omitted, that line is skipped, so
        the function can also be called with labels only.
    """
    print("Precision: %1.3f" % precision_score(y_true, y_pred))
    print("Recall: %1.3f" % recall_score(y_true, y_pred))
    print("F1: %1.3f" % f1_score(y_true, y_pred))
    # AUC computed from hard labels, i.e. from a single decision threshold.
    print("AUC: %1.3f" % roc_auc_score(y_true, y_pred))
    if y_pred_proba is not None:
        # AUC computed from the continuous scores (threshold-free ranking).
        print("AUC proba: %1.3f" % roc_auc_score(y_true, y_pred_proba))

In [163]:
# Hard class labels for both splits ...
y_train_pred, y_test_pred = clf.predict(X_train), clf.predict(X_test)
# ... and the positive-class probability (second predict_proba column).
y_train_pred_proba = clf.predict_proba(X_train)[:, 1]
y_test_pred_proba = clf.predict_proba(X_test)[:, 1]


In [164]:
get_metrics(y_train, y_train_pred, y_train_pred_proba)
# Precision = TP / (TP + FP); at most only 0.618 precision was observed.
# Recall = TP / (TP + FN).
# The model predicts very few positives in general.
# Open question: why is recall so low?

Precision: 0.000
Recall: 0.000
F1: 0.000
AUC: 0.500
AUC proba: 0.705


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [165]:
get_metrics(y_test, y_test_pred,y_test_pred_proba)
# Recall is very low on the test split as well.
# AUC on the hard labels is about 0.5 (noted as 0.5002).

Precision: 0.000
Recall: 0.000
F1: 0.000
AUC: 0.500
AUC proba: 0.693


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [167]:
# Create predictions for the submission file: positive-class probability
# (second predict_proba column) for every row of the encoded test frame.
# Note: this model is not a good fit on its own (see the metrics above).
predictions = clf.predict_proba(test2)[:,1]
predictions 

array([0.01862009, 0.01032872, 0.03334312, ..., 0.01687283, 0.00516595,
       0.25418738])

In [168]:
sample['WnvPresent'] = predictions 
# Why this is not a binary (all-zero) column: `predictions` comes from
# predict_proba, so it holds probabilities rather than 0/1 class labels.
# sample.WnvPresent.value_counts()

In [169]:
# Write the filled-in submission frame to CSV without the index column.
sample.to_csv('exp1_2.csv', index=False)

In [134]:
# Fit the recall-scored grid search before using it. clf2 is only constructed
# earlier in the notebook, never fitted, so without this call a fresh
# Restart & Run All fails on the first predict.
clf2.fit(X_train, y_train)
y_train_pred = clf2.predict(X_train)
y_test_pred = clf2.predict(X_test)

In [135]:
# Pass the positive-class probabilities as well, matching the three-argument
# get_metrics signature used for the earlier model.
get_metrics(y_train, y_train_pred, clf2.predict_proba(X_train)[:, 1])

Precision: 0.939
Recall: 0.400
F1: 0.561
AUC: 0.699


In [136]:
# Pass the positive-class probabilities as well, matching the three-argument
# get_metrics signature used for the earlier model.
get_metrics(y_test, y_test_pred, clf2.predict_proba(X_test)[:, 1])

Precision: 0.436
Recall: 0.102
F1: 0.166
AUC: 0.548


In [142]:
# Positive-class probability from the recall-scored grid search (clf2) for
# every row of the encoded test frame.
predictions = clf2.predict_proba(test2)[:,1]
predictions 

array([0.00130257, 0.00044301, 0.00059825, ..., 0.01398145, 0.01398145,
       0.00640116])

In [144]:
sample['WnvPresent'] = predictions 
# Why this is not a binary (all-zero) column: `predictions` comes from
# predict_proba, so it holds probabilities rather than 0/1 class labels.

In [145]:
# NOTE(review): this writes to the same path ('exp1_2.csv') as the earlier
# export cell in this notebook, so whichever cell runs last overwrites the
# other model's submission. Confirm which file is intended.
sample.to_csv('exp1_2.csv', index=False)