In [1]:
# loading packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from math import sin, cos, sqrt, atan2, radians

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

from xgboost import XGBClassifier


### functions

In [2]:
def distance_coordinate(lat1, lon1, lat2, lon2, radius_km=6373.0):
    """Great-circle distance between two points, via the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in radians.
    lat2, lon2 : float
        Latitude and longitude of the second point, in radians.
    radius_km : float, optional
        Radius of the sphere in kilometres. Defaults to 6373.0, the value
        this function has always used, so existing callers are unaffected.

    Returns
    -------
    float
        Distance along the sphere's surface, in kilometres. NaN inputs
        propagate to a NaN result.
    """
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Rounding can leave `a` a hair outside [0, 1] (e.g. for antipodal
    # points), which would make sqrt(1 - a) raise ValueError. Plain
    # comparisons are used instead of min()/max() so NaN is left untouched.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius_km * c

In [3]:
# relative humidity calculator (in percent)
def relative_humidity(T_dry, T_wet, SP):
    """Relative humidity, in percent, from dry-bulb and wet-bulb readings.

    Parameters
    ----------
    T_dry : float or array-like
        Dry-bulb temperature in degrees Fahrenheit.
    T_wet : float or array-like
        Wet-bulb temperature in degrees Fahrenheit.
    SP : float or array-like
        Station pressure in inches of mercury.

    Returns
    -------
    float or array-like
        Relative humidity as a percentage. The value is not clipped, so
        inconsistent readings can fall outside the 0-100 range.
    """
    def to_celsius(temp_f):
        # Fahrenheit -> Celsius
        return (temp_f - 32) * 5/9

    def saturation_pressure(temp_c):
        # saturation vapour pressure for a temperature given in Celsius
        return 6.112*np.exp(17.502*temp_c/(240.97+temp_c))

    dry_c = to_celsius(T_dry)
    wet_c = to_celsius(T_wet)

    # 1 inHg = 33.8639 hPa
    pressure_hpa = 33.8639*SP
    # psychrometer coefficient, evaluated at the wet-bulb temperature
    coefficient = 0.00066*(1+0.00115*wet_c)
    depression = dry_c - wet_c

    actual_pressure = saturation_pressure(wet_c) - coefficient*pressure_hpa*depression
    return actual_pressure / saturation_pressure(dry_c) * 100


In [4]:
def weather_delay(weather_in, N_delay):
    '''Return a copy of the weather table lagged by ``N_delay`` periods.

    The result is meant to be merged on ``['Year', 'Month']``: the row keyed
    ``(Year, Month)`` carries the measurements that ``weather_in`` holds for
    ``(Year, Month - N_delay)``. In this notebook the ``Month`` column stores
    a week number, so the lag is in weeks.

    Parameters
    ----------
    weather_in : pandas.DataFrame
        One row per period with ``Year`` and ``Month`` key columns and any
        number of measurement columns with string names. Not modified.
    N_delay : int
        Number of periods to lag by.

    Returns
    -------
    pandas.DataFrame
        Same shape and index as ``weather_in``. Key columns keep their
        names; every other column gets the suffix ``_<N_delay>w``. The first
        ``N_delay`` rows are filled with the measurements of the first input
        row and keyed to the first ``N_delay`` periods of the input, so the
        start of the table has no missing rows. Only the start of the whole
        table is padded, not the start of each year.

    Raises
    ------
    IndexError
        If ``weather_in`` has fewer than ``N_delay + 1`` rows.
    '''
    key_columns = ('Year', 'Month')
    suffix = '_' + str(N_delay) + 'w'

    weather_tmp = weather_in.copy(deep=True)

    # moving the key forward makes period p line up with data from p - N_delay
    weather_tmp['Month'] = weather_tmp['Month'] + N_delay

    # rename through the public API; the key columns are deliberately skipped
    weather_tmp = weather_tmp.rename(
        columns={name: name + suffix
                 for name in weather_tmp.columns
                 if name not in key_columns})

    # handle missing rows: shifting keeps the row count and leaves the first
    # N_delay rows empty; they are then back-filled from the first real row
    weather_tmp = weather_tmp.shift(N_delay)
    month_pos = weather_tmp.columns.get_loc('Month')
    for i in range(N_delay):
        weather_tmp.iloc[i, :] = weather_tmp.iloc[N_delay, :]
        # single-step .iloc write; chained `.Month.iloc[i] = ...` may only
        # modify a temporary copy
        weather_tmp.iloc[i, month_pos] = weather_tmp.iloc[N_delay, month_pos] - (N_delay - i)

    return weather_tmp
       

In [5]:
def adjust_species(X):
    """Collapse the ``Species`` labels of ``X`` into four short codes.

    The three main labels become ``'P/R'``, ``'R'`` and ``'P'``; the five
    remaining known labels are pooled into ``'Others'``. Values that match
    none of the known labels are left as they are.

    The ``Species`` column of ``X`` is overwritten in place and the same
    frame is returned.
    """
    short_labels = {
        'CULEX PIPIENS/RESTUANS': 'P/R',
        'CULEX RESTUANS': 'R',
        'CULEX PIPIENS': 'P',
    }
    pooled = (
        'CULEX TERRITANS',
        'CULEX SALINARIUS',
        'CULEX TARSALIS',
        'CULEX ERRATICUS',
        'UNSPECIFIED CULEX',
    )
    for label in pooled:
        short_labels[label] = 'Others'

    # whole-value matching: none of the new codes is itself a key, so a
    # single pass gives the same result as replacing one label at a time
    X['Species'] = X['Species'].replace(short_labels)
    return X

In [6]:
# pre-processing function
def preprocess(train, test, weather_data, n_delays=4):
    """Build the feature table for the observations in ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Only used to define the trap locations that become the similarity
        columns (see ``distance_matrix``).
    test : pandas.DataFrame
        Observations to featurise; needs ``Date`` (datetime), ``Trap``,
        ``Latitude``, ``Longitude`` and ``Species``. The caller's frame is
        not modified.
    weather_data : pandas.DataFrame
        Weather table keyed by ``Year`` and ``Month``.
    n_delays : int, optional
        How many lagged copies of the weather table to attach (lags
        1..n_delays, built by ``weather_delay``). Defaults to 4.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``test``, in the same order, provided the
        right-hand tables have unique keys. Columns, in order: ``Month``
        (week number taken from ``Timestamp.week``), ``Year``, ``Day`` (day
        of year), ``Latitude``, ``Longitude``, the trap-similarity columns,
        the species dummies, the current weather, then the lagged weather.
    """
    # Take a private copy with a clean 0..n-1 index. Column assignments and
    # the concat below align on the index, and adjust_species() overwrites
    # the Species column of whatever frame it is given.
    test = test.reset_index(drop=True)

    X = pd.DataFrame()

    # temporal features ('Month' holds the week number; it is the merge key
    # shared with the weather table)
    X['Month'] = [x.week for x in test.Date]
    X['Year'] = [x.year for x in test.Date]
    X['Day'] = [x.dayofyear for x in test.Date]

    # geographic coordinates
    X['Latitude'] = test.Latitude
    X['Longitude'] = test.Longitude

    # similarity of each test location to every train location
    dist_mat = distance_matrix(train, test)
    X['Trap_id'] = test.Trap.astype('category')
    X = X.merge(dist_mat, left_on=['Trap_id','Latitude','Longitude'], right_on=['Trap','Latitude','Longitude'], how='left')
    X = X.drop(['Trap','Trap_id'], axis = 1)

    # species dummies
    # NOTE(review): the dummy columns depend on which species occur in
    # `test`; presumably train and test share the same set -- confirm.
    test = adjust_species(test)
    X = pd.concat((X, pd.get_dummies(test['Species'], drop_first=True, prefix='Species')),axis=1)

    # weather of the same period
    X = X.merge(weather_data, on=['Year','Month'], how='left')

    # weather of the preceding periods
    for i in range(1, n_delays+1):
        X = X.merge(weather_delay(weather_data,i), on=['Year','Month'], how='left')

    return X

In [7]:
def distance_matrix(target_train, target_test, lamda=1.0/3.0):
    """Similarity of every test trap location to every train trap location.

    Locations are the distinct ``(Trap, Latitude, Longitude)`` combinations
    of each frame. For a pair at distance ``d`` km (from
    ``distance_coordinate``) the similarity is ``2 / (1 + exp(lamda * d))``:
    1 at zero distance, decaying towards 0 as the distance grows.

    Parameters
    ----------
    target_train : pandas.DataFrame
        Supplies the feature columns; needs ``Date``, ``Trap``,
        ``Latitude`` and ``Longitude`` (coordinates in degrees).
    target_test : pandas.DataFrame
        Supplies the rows; same required columns.
    lamda : float, optional
        Decay rate per kilometre. Defaults to 1/3.

    Returns
    -------
    pandas.DataFrame
        Columns ``Trap``, ``Latitude``, ``Longitude`` of each test location,
        followed by one similarity column per train location, labelled with
        the train trap id. Repeated labels are made unique with ``.1``,
        ``.2``, ... suffixes.
    """
    key_cols = ['Trap', 'Latitude', 'Longitude']

    # one row per distinct location; the Date count is only a by-product
    tmp_train = target_train[['Date'] + key_cols].groupby(key_cols).count().reset_index()
    tmp_test = target_test[['Date'] + key_cols].groupby(key_cols).count().reset_index()

    # convert to radians once instead of indexing the frames inside the loop
    train_coords = [(radians(lat), radians(lon))
                    for lat, lon in zip(tmp_train['Latitude'], tmp_train['Longitude'])]
    test_coords = [(radians(lat), radians(lon))
                   for lat, lon in zip(tmp_test['Latitude'], tmp_test['Longitude'])]

    dist_mat = np.zeros((tmp_test.shape[0], tmp_train.shape[0]))
    for i, (lat1, lon1) in enumerate(test_coords):
        for j, (lat2, lon2) in enumerate(train_coords):
            dist = distance_coordinate(lat1, lon1, lat2, lon2)
            dist_mat[i, j] = 2/(1+np.exp(lamda*dist))

    target_combine = pd.concat((tmp_test, pd.DataFrame(data=dist_mat, columns=tmp_train.Trap)), axis=1)
    target_combine = target_combine.drop('Date', axis=1)
    # the same trap id can occur at several coordinates, giving repeated labels
    target_combine.columns = _dedup_column_names(target_combine.columns)
    return target_combine


def _dedup_column_names(names):
    """Return ``names`` as a list with repeats renamed ``x.1``, ``x.2``, ...

    The first occurrence keeps its label. The suffix scheme is the one
    ``pandas.read_csv`` applies to duplicated header names, implemented here
    so that no private pandas parser class is needed.
    """
    seen = {}
    unique = []
    for name in names:
        count = seen.get(name, 0)
        while count > 0:
            seen[name] = count + 1
            name = '%s.%d' % (name, count)
            # the suffixed label may itself be taken already; keep going
            count = seen.get(name, 0)
        unique.append(name)
        seen[name] = count + 1
    return unique

### Main

In [8]:
# Read the four input tables; the files are opened in the order listed.
traps, weather, traps_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../SparkBeyond/data/train.csv',
        '../SparkBeyond/data/weather.csv',
        '../SparkBeyond/data/test.csv',
        '../SparkBeyond/data/sampleSubmission.csv',
    )
)

In [9]:
# A single catch is split over several rows once the mosquito count exceeds
# 50. Rows that agree on every key below are merged back into one row by
# summing their remaining columns.
traps = (
    traps
    .groupby(by=['Date', 'Trap', 'Species', 'Latitude', 'Longitude',
                 'AddressAccuracy', 'Block', 'Street'])
    .sum()
    .reset_index()
)

In [10]:
# pre-processing

# parse the date columns into datetimes
traps['Date'] = pd.to_datetime(traps['Date'])
traps_test['Date'] = pd.to_datetime(traps_test['Date'])

# Turn the weather file's non-numeric placeholders into missing values.
# Matching is on whole cell values, so one list-based replace is equivalent
# to replacing each marker in turn. `np.nan` is used because the `np.NaN`
# alias no longer exists in NumPy 2.
# NOTE(review): 'T' presumably marks trace amounts, which are treated as
# missing here rather than as a small positive value -- confirm intent.
weather = weather.replace(['M', '-', 'T', ' T', '  T'], np.nan)

In [11]:
# Keep station 1 only, drop the columns that are not used as features, add
# the Year / Month merge keys and make every remaining column numeric.
# 'Month' stores the week number of the date, not the calendar month.
weather_stn1 = (
    weather.loc[weather['Station'] == 1]
    .drop(['Station', 'Water1', 'Heat', 'Cool', 'CodeSum',
           'SnowFall', 'Depth', 'Sunrise', 'Sunset'], axis=1)
    .assign(Date=lambda frame: pd.to_datetime(frame.Date))
    .assign(Month=lambda frame: [stamp.week for stamp in frame.Date],
            Year=lambda frame: [stamp.year for stamp in frame.Date])
    .apply(pd.to_numeric)
    .drop(['Date'], axis=1)
    .reset_index(drop=True)
)

In [12]:
# refine features
# NOTE: this cell is not re-runnable on its own -- it drops 'Tmax' and
# overwrites 'ResultDir', so re-run the previous cell first.

# daily temperature range replaces the daily maximum
weather_stn1['Tdif'] = weather_stn1['Tmax'] - weather_stn1['Tmin']
weather_stn1 = weather_stn1.drop(['Tmax'], axis=1)

# relative humidity (percent); relative_humidity() only uses arithmetic and
# np.exp, so whole columns can be passed instead of looping row by row with
# label lookups that silently depend on a 0..n-1 index
weather_stn1['RH'] = relative_humidity(weather_stn1['Tavg'],
                                       weather_stn1['WetBulb'],
                                       weather_stn1['StnPressure'])

# sine of the resultant wind direction
# NOTE(review): this treats ResultDir as whole degrees; some weather
# summaries record direction in tens of degrees -- confirm the unit.
weather_stn1['ResultDir'] = np.sin(np.radians(weather_stn1['ResultDir']))

# average the daily rows into one row per (Year, Month) key
weather_stn1 = weather_stn1.groupby(['Year','Month']).mean().reset_index()

In [13]:
# Both feature tables take their trap-similarity columns from the training
# traps, so they share that set of columns.
X_train = preprocess(train=traps, test=traps, weather_data=weather_stn1)
X_test = preprocess(train=traps, test=traps_test, weather_data=weather_stn1)

In [14]:
# coerce every feature column to a numeric dtype, column by column
X_train, X_test = (
    features.apply(pd.to_numeric) for features in (X_train, X_test)
)

In [15]:
# Put the training columns in exactly the order of the test columns; a test
# column that is missing from the training table raises a KeyError.
X_train = X_train.loc[:, list(X_test.columns)]

In [16]:
# Binary target: 1 where WnvPresent is positive (the column was summed when
# split records were collapsed, so it can exceed 1), otherwise 0.
y_train = traps['WnvPresent'].gt(0).astype(int)

In [17]:
# Gradient-boosted trees: fit on the training features, then get the class
# probabilities for the test rows (XGBClassifier comes from the import cell
# at the top of the notebook).
xgb = XGBClassifier(learning_rate=0.125, n_estimators=100).fit(X_train, y_train)
yhat = xgb.predict_proba(X_test)

In [18]:
# Training with Logistic
# NOTE(review): this cell overwrites `yhat` from the XGB cell, so on a
# top-to-bottom run the submission holds these logistic predictions even
# though the output file name mentions xgb -- confirm which is intended.
from sklearn.linear_model import LogisticRegression

# The L1 penalty needs a solver that supports it. liblinear does, and
# naming it explicitly avoids depending on the library's default solver.
lor = LogisticRegression(penalty='l1', C=100, solver='liblinear')
lor.fit(X_train, y_train)
yhat = lor.predict_proba(X_test)

In [19]:
# Generating submission results
# column 1 of predict_proba is the probability of the positive class
submission['WnvPresent'] = yhat[:, 1]

# Rows whose species falls in the pooled 'Others' group get probability 0.
# The write goes through a single .loc call: the chained form
# `submission.WnvPresent[mask] = 0` may assign to a temporary copy.
is_other_species = adjust_species(traps_test).Species == 'Others'
submission.loc[is_other_species, 'WnvPresent'] = 0

# NOTE(review): the file name says xgb, but `yhat` comes from whichever
# model cell ran last -- confirm they match.
submission.to_csv('submission_xgb_weather_delay_spatial.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
