In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load dataset 
train = pd.read_csv('remove_duplicates_transform_label_address_train.csv')
test = pd.read_csv('remove_duplicates_transform_label_address_test.csv')
sample = pd.read_csv('input/sampleSubmission.csv')
train['Zipcode']=train['Zipcode'].astype(str)
test['Zipcode']=test['Zipcode'].astype(str)

weather = pd.read_csv('input/weather.csv')
# spray = pd.read_csv('input/spray.csv')
# weather station coordinates
station1 = [-87.933 , 41.995]
station2 = [-87.752 , 41.786]

In [3]:
def generate_date_features(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].apply(lambda x: x.year)
    df['Month'] = df['Date'].apply(lambda x: x.month)
    df['DayOfMonth'] = df['Date'].apply(lambda x: x.day)
    # df['DayOfWeekName'] = df['Date'].apply(lambda x: x.day_name())
    df['DayOfWeek'] = df['Date'].apply(lambda x: x.dayofweek)
    df['DayOfYear'] = df['Date'].apply(lambda x: x.dayofyear)
#     df['WeekOfYear'] = df['Date'].apply(lambda x: x.weekofyear)
#     df['IsLeapYear'] = df['Date'].apply(lambda x: x.is_leap_year)
#     df['IsLeapYear'] = df['IsLeapYear'].astype(str)
#     df['Quarter'] = df['Date'].apply(lambda x: x.quarter)

def transform_df(df):
    df_ = df.copy()
    df_['Lat_int'] = df_.Latitude.astype(int)
    df_['Long_int'] = df_.Longitude.astype(int)
    return df_

In [4]:
train2 = transform_df(train)
test2 = transform_df(test)

In [5]:
train2.dtypes 

Date                object
Species             object
Block                int64
Street              object
Trap                object
Latitude           float64
Longitude          float64
AddressAccuracy      int64
WnvPresent           int64
City                object
State               object
Zipcode             object
Lat_int              int64
Long_int             int64
dtype: object

# featurize weather, remove missing values

In [6]:
# Not using codesum for this benchmark
weather = weather.drop('CodeSum', axis=1)

# Split station 1 and 2 and join horizontally
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')

# replace some missing values and T with -1
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)

In [7]:
# Merge with weather data
train3 = train2.merge(weather, on='Date')
test3 = test2.merge(weather, on='Date')

In [8]:
train3.dtypes

Date                object
Species             object
Block                int64
Street              object
Trap                object
Latitude           float64
Longitude          float64
AddressAccuracy      int64
WnvPresent           int64
City                object
State               object
Zipcode             object
Lat_int              int64
Long_int             int64
Tmax_x               int64
Tmin_x               int64
Tavg_x              object
Depart_x            object
DewPoint_x           int64
WetBulb_x           object
Heat_x              object
Cool_x              object
Sunrise_x           object
Sunset_x            object
Depth_x             object
Water1_x             int64
SnowFall_x          object
PrecipTotal_x       object
StnPressure_x       object
SeaLevel_x          object
ResultSpeed_x      float64
ResultDir_x          int64
AvgSpeed_x          object
Tmax_y               int64
Tmin_y               int64
Tavg_y              object
Depart_y             int64
D

In [9]:
# drop address columns
train3 = train3.drop(['Date','WnvPresent'],  axis = 1)
test3 = test3.drop(['Date','Id'],axis = 1)

In [10]:
train3.shape

(8475, 50)

In [11]:
test3.shape

(116293, 50)

# label encoder

In [12]:
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [135]:
def get_label_encoder(train, test, cols):
    lbl = preprocessing.LabelEncoder()
    for col in cols:
        lbl.fit(list(train[col].values) + list(test[col].values))
        train[col] = lbl.transform(train[col].values)
        test[col] = lbl.transform(test[col].values)
def convert_string_to_num(df, str_columns):
    for col in str_columns:
        df[col] = df[col].astype(float)
def get_metrics(y_true, y_pred):
    print("Precision: %1.3f" % precision_score(y_true, y_pred))
    print("Recall: %1.3f" % recall_score(y_true, y_pred))
    print("F1: %1.3f" % f1_score(y_true, y_pred))
    print("AUC: %1.3f" % roc_auc_score(y_true, y_pred))

In [14]:
all_obj_cols = train3.select_dtypes(include=['object']).columns
all_obj_cols

Index(['Species', 'Street', 'Trap', 'City', 'State', 'Zipcode', 'Tavg_x',
       'Depart_x', 'WetBulb_x', 'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x',
       'Depth_x', 'SnowFall_x', 'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x',
       'AvgSpeed_x', 'Tavg_y', 'WetBulb_y', 'Heat_y', 'Cool_y',
       'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y', 'AvgSpeed_y'],
      dtype='object')

In [15]:
str_columns = [u'Tavg_x', u'Depart_x', u'WetBulb_x',
       u'Heat_x', u'Cool_x', u'Sunrise_x', u'Sunset_x', u'Depth_x',
       u'SnowFall_x', u'PrecipTotal_x', u'StnPressure_x', u'SeaLevel_x',
       u'AvgSpeed_x', u'Tavg_y', u'WetBulb_y', u'Heat_y', u'Cool_y',
       u'PrecipTotal_y', u'StnPressure_y', u'SeaLevel_y', u'AvgSpeed_y']
convert_string_to_num(train3,str_columns )
convert_string_to_num(test3,str_columns )

In [16]:
obj_cols = [u'Species', u'Street', u'Trap','State','City','Zipcode']
get_label_encoder(train3, test3, obj_cols)

In [17]:
# remove missing data
train4 = train3.ix[:,(train3 != -1).any(axis=0)]
test4 = test3.ix[:,(test3 != -1).any(axis=0)]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
train4.shape

(8475, 43)

In [32]:
test4.shape

(116293, 43)

In [34]:
def create_prediction(clf, test, sample):
    predictions = clf.predict(test)
    sample['WnvPresent'] = predictions 
    # wait why is it not binary prediction, all zeros????
    # why it is still all 0
    print (sample.WnvPresent.value_counts())

In [136]:
X = train4
y = train.WnvPresent.values
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3,
                                                    random_state=42) # need to convert to np array

In [140]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

penalty = ["l1", "l2"]
class_weight  = [None,"balanced"]
max_iter = [100,150]

tuned_parameters = [{'penalty':penalty,
                     'class_weight':class_weight,
                     'max_iter':max_iter
                    }]
n_folds = 3

scoring = {'AUC': 'roc_auc', 
           'f1': 'f1',
           'recall':'recall',
           'precision':'precision'}

In [141]:
cv = GridSearchCV(clf, 
                   tuned_parameters, 
                   cv=n_folds, 
                   refit=False, 
                   scoring=scoring)  
# what if i set as recall
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
cv.fit(X_train, y_train)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced'], 'max_iter': [100, 150]}],
       pre_dispatch='2*n_jobs', refit=False, return_train_score='warn',
       scoring={'AUC': 'roc_auc', 'f1': 'f1', 'recall': 'recall', 'precision': 'precision'},
       verbose=0)

In [143]:
results = cv.cv_results_
results_df = pd.DataFrame.from_dict(results )



In [144]:
results_df.sort_values('mean_test_f1',ascending=False)[['params','mean_test_f1']].head(3)

Unnamed: 0,params,mean_test_f1
5,"{'class_weight': 'balanced', 'max_iter': 100, ...",0.20975
7,"{'class_weight': 'balanced', 'max_iter': 150, ...",0.20975
6,"{'class_weight': 'balanced', 'max_iter': 150, ...",0.201032


In [153]:
results_df['params'].iloc[5]

{'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2'}

In [145]:
results_df.sort_values('mean_test_AUC',ascending=False)[['params','mean_test_AUC']].head(3)
# valid score
# why is valid AUC so different from test AUC

Unnamed: 0,params,mean_test_AUC
5,"{'class_weight': 'balanced', 'max_iter': 100, ...",0.794552
7,"{'class_weight': 'balanced', 'max_iter': 150, ...",0.794529
1,"{'class_weight': None, 'max_iter': 100, 'penal...",0.789016


In [146]:
results_df.sort_values('mean_test_precision',ascending=False)[['params','mean_test_precision']].head(3)

Unnamed: 0,params,mean_test_precision
5,"{'class_weight': 'balanced', 'max_iter': 100, ...",0.122043
7,"{'class_weight': 'balanced', 'max_iter': 150, ...",0.122043
6,"{'class_weight': 'balanced', 'max_iter': 150, ...",0.116448


In [147]:
results_df.sort_values('mean_test_recall',ascending=False)[['params','mean_test_recall']].head(3)

Unnamed: 0,params,mean_test_recall
5,"{'class_weight': 'balanced', 'max_iter': 100, ...",0.751054
7,"{'class_weight': 'balanced', 'max_iter': 150, ...",0.751054
6,"{'class_weight': 'balanced', 'max_iter': 150, ...",0.741469


In [154]:
# use the best params to refit the model
# choose the one with highest f1 score
best = LogisticRegression(penalty="l2", class_weight="balanced", max_iter=100)
best.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [161]:
y_train_pred = best.predict(X_train)
y_test_pred = best.predict(X_test)

In [162]:
get_metrics(y_train, y_train_pred)

Precision: 0.126
Recall: 0.783
F1: 0.217
AUC: 0.740


In [163]:
get_metrics(y_test, y_test_pred)

Precision: 0.151
Recall: 0.840
F1: 0.256
AUC: 0.778


In [164]:
# The estimated coefficients will all be around 1:
print(best.coef_)

[[-4.60334807e-01 -4.33876561e-03 -1.75289845e-03  3.11487929e-03
  -4.30417884e-01 -4.25621994e+00 -9.59674157e-02  2.73953162e-01
   0.00000000e+00  1.18041217e-02  7.53540900e-02  2.96898253e+00
  -2.02723528e-01 -1.70402615e-01 -1.80819165e-01 -4.17193151e-01
   5.33158267e-02  5.61934217e-01 -9.54027706e-01  1.08335846e+00
   2.93953023e-02 -1.15340977e-02  0.00000000e+00  1.05725750e+00
  -2.72673381e-01 -9.55437577e-01 -6.61084209e-01  4.40424459e-01
   1.01273665e-01 -5.81611673e-01  1.69329144e-01  2.15752535e-01
  -1.14400108e+00 -4.89968949e-01  1.22557590e-01 -5.53032535e-01
   5.21171716e-01  5.25029396e-01  5.33943758e-01  2.31341173e-01
  -4.96259761e-01 -1.30768138e-01  6.89174606e-01]]


In [165]:
create_prediction(best, test4, sample)

0    90135
1    26158
Name: WnvPresent, dtype: int64


In [171]:
pred_prob = best.predict_proba(test4)[:,1]
pred_prob.shape

(116293,)

In [172]:
sample['WnvPresent'] = pred_prob

In [173]:
sample.to_csv('exp5_prob.csv', index=False)

In [174]:
sample.head()

Unnamed: 0,Id,WnvPresent
0,1,0.056458
1,2,0.036387
2,3,0.086606
3,4,0.023275
4,5,0.009401
