In [1]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle as dist
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

#spray = pd.read_csv('data/spray.csv')
#train = pd.read_csv('data/train.csv')
#test = pd.read_csv('data/test.csv')
#weather = pd.read_csv('data/weather.csv')



## Data Clean Up

In [2]:
def closer_station(point):
    from geopy.distance import great_circle as dist
    Stat1 = (41.995,-87.933)
    Stat2 = (41.786,-87.752)
    if dist(point, Stat2).miles > dist(point, Stat1).miles:
        return 1
    else: return 2

In [3]:
def processtest(locstring):
    test = pd.read_csv(locstring, skip_blank_lines= True)
    test.set_index("Date", inplace = 1, drop =0)
    test.index = test.index.to_datetime()
    test["t_latlong"] = zip(test.Latitude, test.Longitude)
    test["Station"] = [closer_station(_) for _ in test.t_latlong]
    Stat1 = (41.995,-87.933)
    Stat2 = (41.786,-87.752)
    test["dist1"] = [dist(_, Stat1).miles for _ in test.t_latlong]
    test["dist2"] = [dist(_, Stat2).miles for _ in test.t_latlong]
    return test

#processtest('data/train.csv').tail(1)

In [4]:
def processweather(locstring):
    
    weather = pd.read_csv(locstring, na_values=["M", "T", "  T", "-"], skip_blank_lines= True)
    weather.set_index("Date", inplace = 1, drop = 0)
    weather.index = weather.index.to_datetime()
    return weather

#processweather('data/weather.csv').head(1).drop("ResultDir", axis = 1)

In [5]:
weather = processweather('data/weather.csv').drop(["Water1", "CodeSum", 
                                                   "Depth", "SnowFall", "SeaLevel", "ResultDir"], axis = 1)
#weather.ResultDir = weather.ResultDir.astype("object")

w1 = weather[weather.Station==1]
w2 = weather[weather.Station==2]

#nums = [1]
nums = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]

for col in w1.columns.drop("Date"):
    for n in nums:
        w1[u"prev_{}_".format(n)+col] = w1[col].shift(n)
for col in w2.columns.drop("Date"):
    for n in nums:
        w2[u"prev_{}_".format(n)+col] = w2[col].shift(n)

#weather = w1.append(w2)

In [6]:
def set_up_train(test = 0):
    from sklearn.preprocessing import StandardScaler, Imputer
    """ returns the y,X"""
    
    if test == 1:
        temp = processtest('data/test.csv')
    else:
        temp = processtest('data/train.csv')

    weather = processweather('data/weather.csv').drop(["Water1", "CodeSum", 
                                                       "Depth", "SnowFall", "SeaLevel", "ResultDir"], axis = 1)
    #weather.ResultDir = weather.ResultDir.astype("object")

    w1 = weather[weather.Station==1]
    w2 = weather[weather.Station==2]

    
    nums = [1,2,3,4,6,8,10,12]

    for col in w1.columns.drop("Date"):
        for n in nums:
            w1[u"prev_{}_".format(n)+col] = w1[col].shift(n)
    for col in w2.columns.drop("Date"):
        for n in nums:
            w2[u"prev_{}_".format(n)+col] = w2[col].shift(n)

    weather = w1.append(w2)        


    data = pd.merge(temp, weather, how = "left", on= ["Station", "Date"], left_index=0, right_index=0)
    
    if test != 1:
        weights = data.pivot_table(values = ["NumMosquitos"], index = ["Trap","Date"], aggfunc = "sum")/\
        data.pivot_table(values = ["NumMosquitos"], index = ["Trap"], aggfunc = "sum")

        weights = np.array(data.merge(weights.reset_index(), how = "left", on = ["Trap", "Date"]).NumMosquitos_y)
    else:
        pass
    
    
    data.set_index("Date", inplace = 1)
    data.index = data.index.to_datetime()
    data["month"] = [_ for _ in data.index.month]
    data["week"] = [_ for _ in data.index.weekday]
    data["year"] = [_ for _ in data.index.year]

    if test != 1:
        y = data.WnvPresent
    else: 
        pass

    X = data.drop(["t_latlong", "AddressNumberAndStreet", "AddressAccuracy", 
                   "Trap", "Block", "Street", "Address", "Station"], axis = 1)
    if test != 1:
        X.drop(["WnvPresent", "NumMosquitos"], axis = 1, inplace = 1)
    else:
        pass
    
    numcols = X.columns[(X.dtypes == "float64") |(X.dtypes == "int64")]
    #X = X.fillna(method = "bfill", axis = "columns").fillna(method = "ffill", axis = "columns")
    
    X[numcols] = Imputer(axis = 1).fit_transform(X[numcols])
    
    #for col in numcols:
    #    X[col] = X[col].interpolate() 
    X[numcols] = StandardScaler().fit_transform(X[numcols])

    X.loc[X.Species == "UNSPECIFIED CULEX", "Species"] = "CULEX PIPIENS"
    catcols = X.columns[(X.dtypes == "object")]
    X = pd.get_dummies(X, columns = catcols, drop_first=True)

    if test == 1:
        return X.iloc[:,1:]
    else:
        return y , X , weights


## Model Selection

In [45]:
y , X , w = set_up_train()

In [46]:
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report 

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42, stratify=y)

y.value_counts()/len(y)

0    0.947554
1    0.052446
Name: WnvPresent, dtype: float64

In [47]:

def print_cm_cr(y_true, y_pred, names):
    """prints the confusion matrix and the classification report"""
    cm = confusion_matrix(y_true, y_pred)
    cols = ['pred_' + c for c in names]
    dfcm = pd.DataFrame(cm, columns = cols, index = names)
    print dfcm
    print
    print classification_report(y_true, y_pred)

In [48]:


def evaluate_model(model, weights):
    model.fit(X, y, sample_weight = weights)
    y_pred = model.predict(X)
    
    a = accuracy_score(y, y_pred)
    
    print_cm_cr(y, y_pred, ["no_WN", "WN"])
    
    return a



### Random Forest / Extra Tree

In [49]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [50]:
rf = RandomForestClassifier(n_estimators=500, n_jobs = -1, criterion = "gini", min_samples_leaf=1)


evaluate_model(rf, w)



       pred_no_WN  pred_WN
no_WN        9938       17
WN            182      369

             precision    recall  f1-score   support

          0       0.98      1.00      0.99      9955
          1       0.96      0.67      0.79       551

avg / total       0.98      0.98      0.98     10506



0.98105844279459353

In [51]:
et = ExtraTreesClassifier(n_estimators=500, n_jobs = -1)


evaluate_model(et, w)

       pred_no_WN  pred_WN
no_WN        9945       10
WN            189      362

             precision    recall  f1-score   support

          0       0.98      1.00      0.99      9955
          1       0.97      0.66      0.78       551

avg / total       0.98      0.98      0.98     10506



0.98105844279459353

### SVM

In [52]:
from sklearn.svm import SVC



In [53]:
svm = SVC(C=5000)


evaluate_model(svm, w)

       pred_no_WN  pred_WN
no_WN        9888       67
WN            468       83

             precision    recall  f1-score   support

          0       0.95      0.99      0.97      9955
          1       0.55      0.15      0.24       551

avg / total       0.93      0.95      0.94     10506



0.94907671806586713

In [54]:
from sklearn.ensemble import BaggingClassifier

baggingsvc = BaggingClassifier(svm, n_estimators = 50, n_jobs=-1)


evaluate_model(baggingsvc, w)

       pred_no_WN  pred_WN
no_WN        9896       59
WN            472       79

             precision    recall  f1-score   support

          0       0.95      0.99      0.97      9955
          1       0.57      0.14      0.23       551

avg / total       0.93      0.95      0.93     10506



0.94945745288406624

### Decision Trees

In [55]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

baggingdt = BaggingClassifier(dt, n_estimators= 100, n_jobs=-1)


evaluate_model(baggingdt, w)




       pred_no_WN  pred_WN
no_WN        9937       18
WN            182      369

             precision    recall  f1-score   support

          0       0.98      1.00      0.99      9955
          1       0.95      0.67      0.79       551

avg / total       0.98      0.98      0.98     10506



0.98096325909004378

### Logistic Regression

In [56]:
from sklearn.linear_model import LogisticRegressionCV

In [57]:
lr = LogisticRegressionCV(n_jobs = -1, class_weight="balanced")


evaluate_model(lr, w)

       pred_no_WN  pred_WN
no_WN        6864     3091
WN             76      475

             precision    recall  f1-score   support

          0       0.99      0.69      0.81      9955
          1       0.13      0.86      0.23       551

avg / total       0.94      0.70      0.78     10506



0.69855320769084328

### SGD

In [58]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(n_iter=100, class_weight= "balanced")

evaluate_model(sgd, w)


       pred_no_WN  pred_WN
no_WN        5378     4577
WN             57      494

             precision    recall  f1-score   support

          0       0.99      0.54      0.70      9955
          1       0.10      0.90      0.18       551

avg / total       0.94      0.56      0.67     10506



0.55891871311631447

### Boost

In [59]:
from sklearn.ensemble import AdaBoostClassifier

lr2 = LogisticRegressionCV()
boostlr = AdaBoostClassifier(lr2)

evaluate_model(boostlr, w)

       pred_no_WN  pred_WN
no_WN        9928       27
WN            524       27

             precision    recall  f1-score   support

          0       0.95      1.00      0.97      9955
          1       0.50      0.05      0.09       551

avg / total       0.93      0.95      0.93     10506



0.94755377879307068

# Predict

In [7]:
y,X, w = set_up_train()

In [8]:
Xtest = set_up_train(test = 1)

In [9]:
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier(n_estimators=5000, n_jobs = -1, class_weight="balanced")


et.fit(X,y, w)


ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [10]:
test_pred = et.predict_proba(Xtest)[:,1]

In [11]:
pd.DataFrame({"Id" : range(1,len(test_pred)+1), "WnvPresent" : test_pred}).to_csv("ek_submit.csv", index = False)