# Project 4 (Part 2)

Continuation with different features to test against Part 1.

In [82]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

train = pd.read_csv("train.csv")
test = pd.read_csv('test.csv')

**Find Distances from the location to the weather stations**

Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level<br>
Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

In [83]:
# pip install geopy

import geopy.distance as gp

coord1 = (41.995,-87.933)  #coordinates for Chicago O'hare Airport (station 1)
coord2 = (41.786,-87.752)  #coordinates for S Doty Avenue (station 2)

dist_fr_1 = []
dist_fr_2 = []

for i in range(len(train.Latitude)):
    coord = (train.Latitude[i],train.Longitude[i])
    dist_fr_1.append(-(gp.distance(coord1, coord).km))
    dist_fr_2.append(-(gp.distance(coord2, coord).km))

# dist_fr_1 = [i for i in dist_fr_1]
# dist_fr_2 = [i for i in dist_fr_2]
    
train['dist_fr_1'] = dist_fr_1
train['dist_fr_2'] = dist_fr_2

In [84]:
coord1 = (41.995,-87.933)  #coordinates for Chicago O'hare Airport Hotstpot (station 1)
coord2 = (41.786,-87.752)  #coordinates for S Doty Avenue Hotspot (station 2)

dist_fr_1 = []
dist_fr_2 = []

for i in range(len(test.Latitude)):
    coord = (test.Latitude[i],test.Longitude[i])
    dist_fr_1.append(-(gp.distance(coord1, coord).km))
    dist_fr_2.append(-(gp.distance(coord2, coord).km))

# dist_fr_1 = [-i for i in dist_fr_1]
# dist_fr_2 = [-i for i in dist_fr_2]
    
test['dist_fr_1'] = dist_fr_1
test['dist_fr_2'] = dist_fr_2

In [85]:
train = train.drop(['Address','Block','AddressNumberAndStreet', 'AddressAccuracy', 'Latitude', 'Longitude', 'NumMosquitos', 'Trap', 'Street'],axis=1)
test = test.drop(['Address','Block','AddressNumberAndStreet', 'AddressAccuracy', 'Latitude', 'Longitude', 'Trap', 'Street'],axis=1)

train.head()

Unnamed: 0,Date,Species,WnvPresent,dist_fr_1,dist_fr_2
0,2007-05-29,CULEX PIPIENS/RESTUANS,0,-11.822004,-19.172879
1,2007-05-29,CULEX RESTUANS,0,-11.822004,-19.172879
2,2007-05-29,CULEX RESTUANS,0,-13.56547,-23.257125
3,2007-05-29,CULEX PIPIENS/RESTUANS,0,-9.261595,-21.747902
4,2007-05-29,CULEX RESTUANS,0,-9.261595,-21.747902


In [86]:
train = train.join(pd.get_dummies(train['Species'], drop_first=True))
train.drop('Species', axis=1, inplace=True)

In [87]:
train['UNSPECIFIED CULEX'] = 0

In [88]:
train.head()

Unnamed: 0,Date,WnvPresent,dist_fr_1,dist_fr_2,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX
0,2007-05-29,0,-11.822004,-19.172879,0,1,0,0,0,0,0
1,2007-05-29,0,-11.822004,-19.172879,0,0,1,0,0,0,0
2,2007-05-29,0,-13.56547,-23.257125,0,0,1,0,0,0,0
3,2007-05-29,0,-9.261595,-21.747902,0,1,0,0,0,0,0
4,2007-05-29,0,-9.261595,-21.747902,0,0,1,0,0,0,0


In [89]:
test = test.join(pd.get_dummies(test['Species'], drop_first=True))
test.drop('Species', axis=1, inplace=True)

In [90]:
test.head()

Unnamed: 0,Id,Date,dist_fr_1,dist_fr_2,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX
0,1,2008-06-11,-11.822004,-19.172879,0,1,0,0,0,0,0
1,2,2008-06-11,-11.822004,-19.172879,0,0,1,0,0,0,0
2,3,2008-06-11,-11.822004,-19.172879,1,0,0,0,0,0,0
3,4,2008-06-11,-11.822004,-19.172879,0,0,0,1,0,0,0
4,5,2008-06-11,-11.822004,-19.172879,0,0,0,0,0,1,0


## Reading and Data Cleaning for Weather Data

In [91]:
weather = pd.read_csv('weather_original.csv')
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,0448,1849,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,-,-,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,0447,1850,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,-,-,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,0446,1851,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


### Replacing missing values with average of values surrounding the missing data

In [92]:
weather.iloc[[87, 1745, 2067], 21] = (7.48, 7.61, 6.37) # AvgSpeed
weather.iloc[[87, 848, 2410, 2411], 17] = (29.37, 29.09, 29.30, 29.30) # StnPressure
weather.iloc[[848, 2410, 2412, 2415], 7] = (71.89, 64.67, 64.67, 59.44) # WetBulb

In [93]:
for i in range(len(weather.Tavg)):
    if weather.Tavg[i]=='M':
        weather.Tavg[i]= round((weather.Tmax[i]+weather.Tmin[i])/2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [94]:
for index, row in weather[weather['SeaLevel']=='M'].iterrows():
    weather.SeaLevel[index] = np.nanmean(pd.to_numeric(weather[weather['Date']==row['Date']]['SeaLevel'], 
                                                          errors='coerce'))        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [95]:
weather['PrecipTotal'] = weather['PrecipTotal'].replace('T',0.001, regex=True)
weather['PrecipTotal'] = weather['PrecipTotal'].replace('M',0, regex=True)

Heating/Cooling are building temperature days with a base of 65F, with degrees above/below it indicated. We're more interested in the external temperature where mosquitoes can breed, hence we'll be using Tmin, Tmax, Tavg instead. The same applies to Temperature Departure where it indicates a difference from the 30-year normal. 

Depth has either 0 or missing data, so that can be dropped, same goes for SnowFall (Missing, Trace, 0 and 0.1). Water1 has all M values.

CodeSum indicates particular reported weather occurrences that, through subsequent feature testing, was negligible when testing, even with munging into groups eg precipitation related grouping, which applies to StnPressure and SeaLevel as well (small differences throughout the range).

StnPressure and wind-related features were also negligible during subsequent testing, hence these were dropped.

In [96]:
weather = weather.drop(['Depart','Heat','Cool','Depth','Water1','SnowFall','Sunrise','Sunset','CodeSum',
                       'ResultSpeed', 'ResultDir', 'AvgSpeed', 'StnPressure', 'SeaLevel'],axis=1)

In [97]:
for col in ['Tavg', 'DewPoint', 'WetBulb','PrecipTotal']:
    weather[col] = pd.to_numeric(weather[col])
    
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,PrecipTotal
0,1,2007-05-01,83,50,67.0,51,56.0,0.0
1,2,2007-05-01,84,52,68.0,51,57.0,0.0
2,1,2007-05-02,59,42,51.0,42,47.0,0.0
3,2,2007-05-02,60,43,52.0,42,47.0,0.0
4,1,2007-05-03,66,46,56.0,40,48.0,0.0


In [98]:
stn1 = weather[weather['Station']==1]
stn2 = weather[weather['Station']==2]
stn1 = stn1.drop('Station', axis=1)
stn2 = stn2.drop('Station', axis=1)
weather = stn1.merge(stn2, on='Date')

In [99]:
weather['Month'] = weather['Date'].apply(lambda x: x.split("-")[1])
weather = weather.join(pd.get_dummies(weather['Month']))
weather.drop('Month', axis=1, inplace=True)

In [100]:
weather['Day'] = weather['Date'].apply(lambda x: 'day'+x.split("-")[2])
weather = weather.join(pd.get_dummies(weather['Day']))
weather.drop('Day', axis=1, inplace=True)

In [101]:
weather.to_csv('weather1.csv') # Write out csv for safekeeping

## Combine Train and Weather Data

In [102]:
weather = pd.read_csv('weather1.csv') # To run if needed to load back data without running weather processing steps.
weather.drop('Unnamed: 0', axis=1, inplace=True)

In [103]:
weather.head()

Unnamed: 0,Date,Tmax_x,Tmin_x,Tavg_x,DewPoint_x,WetBulb_x,PrecipTotal_x,Tmax_y,Tmin_y,Tavg_y,DewPoint_y,WetBulb_y,PrecipTotal_y,05,06,07,08,09,10,day01,day02,day03,day04,day05,day06,day07,day08,day09,day10,day11,day12,day13,day14,day15,day16,day17,day18,day19,day20,day21,day22,day23,day24,day25,day26,day27,day28,day29,day30,day31
0,2007-05-01,83,50,67.0,51,56.0,0.0,84,52,68.0,51,57.0,0.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2007-05-02,59,42,51.0,42,47.0,0.0,60,43,52.0,42,47.0,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2007-05-03,66,46,56.0,40,48.0,0.0,67,48,58.0,40,50.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2007-05-04,66,49,58.0,41,50.0,0.001,78,51,64.0,42,50.0,0.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2007-05-05,66,53,60.0,38,49.0,0.001,66,54,60.0,39,50.0,0.001,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [104]:
train = train.merge(weather, on='Date')
test = test.merge(weather, on='Date')
train = train.drop(['Date'], axis = 1)
test = test.drop(['Id', 'Date'], axis = 1)

In [105]:
train.to_csv('final_train.csv')
test.to_csv('final_test.csv')

## Combine Train and Test Data to Perform Encoding

In [106]:
final_train = pd.read_csv('final_train.csv')
final_test = pd.read_csv('final_test.csv')

In [107]:
y = train['WnvPresent']

In [108]:
final_train.drop('Unnamed: 0', axis=1, inplace=True)
final_test.drop('Unnamed: 0', axis=1, inplace=True)

In [109]:
train = train.drop('WnvPresent', axis=1)

In [110]:
train.shape

(10506, 58)

In [111]:
test.shape

(116293, 58)

In [112]:
train.head()

Unnamed: 0,dist_fr_1,dist_fr_2,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX,Tmax_x,Tmin_x,Tavg_x,DewPoint_x,WetBulb_x,PrecipTotal_x,Tmax_y,Tmin_y,Tavg_y,DewPoint_y,WetBulb_y,PrecipTotal_y,05,06,07,08,09,10,day01,day02,day03,day04,day05,day06,day07,day08,day09,day10,day11,day12,day13,day14,day15,day16,day17,day18,day19,day20,day21,day22,day23,day24,day25,day26,day27,day28,day29,day30,day31
0,-11.822004,-19.172879,0,1,0,0,0,0,0,88,60,74.0,58,65.0,0.0,88,65,77.0,59,66.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,-11.822004,-19.172879,0,0,1,0,0,0,0,88,60,74.0,58,65.0,0.0,88,65,77.0,59,66.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,-13.56547,-23.257125,0,0,1,0,0,0,0,88,60,74.0,58,65.0,0.0,88,65,77.0,59,66.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,-9.261595,-21.747902,0,1,0,0,0,0,0,88,60,74.0,58,65.0,0.0,88,65,77.0,59,66.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,-9.261595,-21.747902,0,0,1,0,0,0,0,88,60,74.0,58,65.0,0.0,88,65,77.0,59,66.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [113]:
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.3, shuffle=True)

In [114]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
final_test = scaler.transform(test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.
  """


In [38]:
pipeline = Pipeline([
    ('svc', SVC()),
])

parameters = {    
    'svc__gamma': (0, 1, 2, 3, 10, 20),
    'svc__C': (0.1, 0.5),
    'svc__kernel': ('linear', 'rbf')
}

# Running pipeline to find best settings in 3 folds

grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   22.1s finished


Best parameters set:
	svc__C: 0.5
	svc__gamma: 20
	svc__kernel: 'rbf'


In [39]:
svc = SVC(gamma=20, C=0.5, probability=True, kernel='rbf')

svc_model = svc.fit(X_train, y_train)
svc_score = cross_val_score(svc_model, X_train, y_train, cv=5)
print(svc_score)

svc_pred = pd.DataFrame(svc_model.predict(X_test))

tn, fp, fn, tp = confusion_matrix(y_test, svc_pred).ravel()
svc_final_auc = roc_auc_score(y_test, svc_pred)
svc_final_acc = (tn+tp)/(tn+fp+fn+tp)
svc_final_df = pd.DataFrame([[tn,fp,fn,tp,svc_final_acc,svc_final_auc]], index=['SVC'], columns=['TN','FP','FN','TP','ACC','AUC'])

print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print("Accuracy: {}".format(svc_final_acc))
print("AUC ROC: {}".format(svc_final_auc))

[0.94836957 0.94633152 0.94965986 0.94693878 0.94829932]
True Negatives: 2980
False Positives: 3
False Negatives: 165
True Positives: 4
Accuracy: 0.9467005076142132
AUC ROC: 0.5113314700462384


In [415]:
pipeline = Pipeline([
    ('forest_model', RandomForestClassifier()),
])

parameters = {    
    'forest_model__n_estimators': (10, 100, 1000, 10000),
    'forest_model__max_depth': (5, 10, 20, 30)
}

# Running pipeline to find best settings in 3 folds

grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Best parameters set:
	forest_model__max_depth: 5
	forest_model__n_estimators: 10


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.5min finished


In [81]:
# max_depth via Pipeline proved to be ineffective even though it was indicated as so. This was removed here.

forest_model = RandomForestClassifier(n_jobs=-1, n_estimators=1000)
forest_model.fit(X_train, y_train)
forest_score = cross_val_score(forest_model, X_train, y_train, cv=5)
forest_final = forest_model.score(X_test, y_test)
print(forest_score)
print(forest_final)
forest_pred = pd.DataFrame(forest_model.predict(X_test))
forest_pred_train = pd.DataFrame(forest_model.predict(X_train))

tn, fp, fn, tp = confusion_matrix(y_test, forest_pred).ravel()
forest_final_auc = roc_auc_score(y_test, forest_pred)
forest_final_acc = (tn+tp)/(tn+fp+fn+tp)
forest_final_df = pd.DataFrame([[tn,fp,fn,tp,forest_final_acc,forest_final_auc]], index=['Forest'], columns=['TN','FP','FN','TP','ACC','AUC'])

print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print("Accuracy: {}".format(forest_final_acc))
print("AUC ROC: {}".format(forest_final_auc))

[0.9375     0.92997961 0.93813732 0.94013605 0.93265306]
0.9368654822335025
True Negatives: 2939
False Positives: 48
False Negatives: 151
True Positives: 14
Accuracy: 0.9368654822335025
AUC ROC: 0.5343894248815575


In [43]:
lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train, y_train)
lr_score = cross_val_score(lr_model, X_train, y_train, cv=5)
lr_final = lr_model.score(X_test, y_test)
print(lr_score)
print(lr_final)
lr_pred = pd.DataFrame(lr_model.predict(X_test))
lr_pred_train = pd.DataFrame(lr_model.predict(X_train))

tn, fp, fn, tp = confusion_matrix(y_test, lr_pred).ravel()
lr_final_auc = roc_auc_score(y_test, lr_pred)
lr_final_acc = (tn+tp)/(tn+fp+fn+tp)
lr_final_df = pd.DataFrame([[tn,fp,fn,tp,lr_final_acc,lr_final_auc]], index=['Logistic Regression'], columns=['TN','FP','FN','TP','ACC','AUC'])

print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print("Accuracy: {}".format(lr_final_acc))
print("AUC ROC: {}".format(lr_final_auc))

[0.94769022 0.94769022 0.94829932 0.94829932 0.94829932]
0.9463832487309645
True Negatives: 2983
False Positives: 0
False Negatives: 169
True Positives: 0
Accuracy: 0.9463832487309645
AUC ROC: 0.5


In [493]:
pipeline = Pipeline([
    ('xgb', XGBClassifier()),
])

parameters = {    
    'xgb__eta': (0.1, 0.5, 1),
    'xgb__max_depth': (2, 3, 4, 5),
    'xgb__gamma': (0, 1, 5, 8),
    'xgb__scale_pos_weight': (0,5, 1)
}

# Running pipeline to find best settings in 3 folds

grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  1.4min finished


Best parameters set:
	xgb__eta: 0.1
	xgb__gamma: 5
	xgb__max_depth: 5
	xgb__scale_pos_weight: 1


In [494]:
xgb = XGBClassifier(eta=0.1, gamma=5, max_depth=5, booster='gbtree', scale_pos_weight=1, n_estimators=1000)

xgb_model = xgb.fit(X_train, y_train)
xgb_score = cross_val_score(xgb_model, X_train, y_train, cv=5)
xgb_pred = pd.DataFrame(xgb_model.predict(X_test))

tn, fp, fn, tp = confusion_matrix(y_test, xgb_pred).ravel()
xgb_final_auc = roc_auc_score(y_test, xgb_pred)
xgb_final_acc = (tn+tp)/(tn+fp+fn+tp)
xgb_final_df = pd.DataFrame([[tn,fp,fn,tp,xgb_final_acc,xgb_final_auc]], index=['XGB'], columns=['TN','FP','FN','TP','ACC','AUC'])

print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print("Accuracy: {}".format(xgb_final_acc))
print("AUC ROC: {}".format(xgb_final_auc))

True Negatives: 2983
False Positives: 0
False Negatives: 169
True Positives: 0
Accuracy: 0.9463832487309645
AUC ROC: 0.5


In [495]:
pipeline = Pipeline([
    ('lr', LogisticRegression(max_iter=10000, n_jobs=-1)),
])

parameters = {    
    'lr__solver': ('liblinear', 'saga'),
    'lr__penalty': ('l1', 'l2'),
    'lr__class_weight': (None, 'balanced')
}

# Running pipeline to find best settings in 3 folds

grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   13.5s finished
  " = {}.".format(effective_n_jobs(self.n_jobs)))


Best parameters set:
	lr__class_weight: None
	lr__penalty: 'l1'
	lr__solver: 'liblinear'


In [46]:
lr = LogisticRegression(solver='liblinear', penalty='l1', class_weight=None)

lr_model = lr.fit(X_train, y_train)
lr_score = cross_val_score(lr_model, X_train, y_train, cv=5)
lr_pred = pd.DataFrame(lr_model.predict(X_test))

tn, fp, fn, tp = confusion_matrix(y_test, lr_pred).ravel()
lr_final_auc = roc_auc_score(y_test, lr_pred)
lr_final_acc = (tn+tp)/(tn+fp+fn+tp)
lr_final_df = pd.DataFrame([[tn,fp,fn,tp,lr_final_acc,lr_final_auc]], index=['LR'], columns=['TN','FP','FN','TP','ACC','AUC'])

print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print("Accuracy: {}".format(lr_final_acc))
print("AUC ROC: {}".format(lr_final_auc))

True Negatives: 2998
False Positives: 0
False Negatives: 154
True Positives: 0
Accuracy: 0.9511421319796954
AUC ROC: 0.5


In [103]:
def run_forest():
    forest_probs = pd.DataFrame(forest_model.predict_proba(final_test))
    forest_probs.reset_index()
    forest_probs.index = forest_probs.index + 1
    forest_probs.drop(0, axis=1, inplace=True)
    forest_probs.columns = ['WnvPresent']
    forest_probs.index.name='Id'
    forest_probs.to_csv('forest_submission.csv')
    forest_probs.head()

In [417]:
run_forest()