In [15]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, make_scorer
from common import get_train, get_test, get_parking, feat_eng, premodel_formating, mean_target_enc, get_XY
import pandas as pd
from sklearn.cross_validation import cross_val_score
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
%matplotlib inline
from sklearn.model_selection import KFold
from common import load_clean_train, load_clean_test, extract_dates, add_midpoint_loc_id, tt_join_city_stats2clean, load_clean_parking

In [8]:
def f_score(m, X, y):
    preds = m.predict(X)
    p = precision_score(y, preds)
    r = recall_score(y, preds)
    return (1.25*p*r/(0.25*p+r))

In [106]:
def writeToFile(preds, filename):
    result = "id,any_spot"
    for i in range(0, len(preds)):
        result = result + "\n" + str(i + 1) + "," + str(preds[i]) 
    # return result
    file = open(filename,"w")  
    file.write(result)
    file.close()

## XGboost

In [60]:
def cv_k(X, y, k, model):
    kf = KFold(n_splits = k)
    kf.get_n_splits(X)
    # f0.5 score
    f_score = 0
    # precision
    p_score = 0
    # recall
    r_score = 0
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.ix[train_index], X.ix[val_index]    
        y_train, y_val = y.ix[train_index], y.ix[val_index]
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        f_score += fbeta_score(y_val, preds, beta=0.5)
        p_score += precision_score(y_val, preds)
        r_score += recall_score(y_val, preds)
    return [f_score/(k*1.0), p_score/(k*1.0), r_score/(k*1.0)]

In [107]:
import warnings; warnings.simplefilter('ignore')
train_df = load_clean_train()
test_df = load_clean_test()
print(train_df.shape, test_df.shape)

train_df = extract_dates(train_df)
test_df = extract_dates(test_df)
print(train_df.shape, test_df.shape)

train_df = add_midpoint_loc_id(train_df)
test_df = add_midpoint_loc_id(test_df)
print(train_df.shape, test_df.shape)

train_df = tt_join_city_stats2clean(train_df)
test_df = tt_join_city_stats2clean(test_df)
print(train_df.shape, test_df.shape)

#train_df = premodel_formating(train_df, split=False)
#test_df = premodel_formating(test_df, split=False, test=True)

(1100, 18) (726, 16)
(1100, 26) (726, 24)
(1100, 27) (726, 25)
(1100, 31) (726, 29)


In [108]:
# drop nulls
for col in train_df:
    if train_df[col].isna().sum() != 0:
        print(col)
        train_df = train_df.drop(col, axis = 1)
        test_df = test_df.drop(col, axis = 1)

pop
area
med_age


In [109]:
# drop Date
train_df = train_df.drop('Date', axis = 1)
test_df = test_df.drop('Date', axis = 1)

## Map Street to unique ID

In [110]:
train_list = list(set().union(train_df["Street"], train_df["From"], train_df["To"]))
street_map = {}
for i in range(0, len(train_list)):
    street_map[train_list[i]] = i + 1
# Update train
train_df = train_df.replace({"Street": street_map})
train_df = train_df.replace({"From": street_map})
train_df = train_df.replace({"To": street_map})
# Update test
test_df = test_df.replace({"Street": street_map})
test_df = test_df.replace({"From": street_map})
test_df = test_df.replace({"To": street_map})

In [111]:
train_list = list(set().union(train_df["Clean_Street"], train_df["Clean_From"], train_df["Clean_To"]))
street_map = {}
for i in range(0, len(train_list)):
    street_map[train_list[i]] = i + 1
# Update train
train_df = train_df.replace({"Clean_Street": street_map})
train_df = train_df.replace({"Clean_From": street_map})
train_df = train_df.replace({"Clean_To": street_map})
# Update test
test_df = test_df.replace({"Clean_Street": street_map})
test_df = test_df.replace({"Clean_From": street_map})
test_df = test_df.replace({"Clean_To": street_map})

In [112]:
# Drop all non-numeric columns
for col in train_df:
    if train_df[col].dtype == "object":
        train_df = train_df.drop(col, axis = 1)
        test_df = test_df.drop(col, axis = 1)

In [113]:
X = train_df.drop(["Real.Spots", "any_spot"], axis=1)
y = train_df["any_spot"]

In [114]:
import xgboost as xgb

In [96]:
depths = np.arange(0, 10) + 1
rates = np.arange(0.1, 0.5, 0.1)
best_f_score = 0
for i in range(0, 50):
    if i % 10 == 0: print(i)
    for depth in depths:
        for rate in rates:
            model = xgb.XGBClassifier (max_depth=depth, n_estimators=i, n_jobs=-1,\
                                        learning_rate=rate)
            result = cv_k(X, y, 2, model)
            if result[1] > best_f_score:
                print(result[1], [depth, i, rate])
                best_f_score = result[1]

0
0.220180305132 [1, 1, 0.10000000000000001]
0.236185383244 [2, 1, 0.10000000000000001]
0.36344519762 [3, 1, 0.10000000000000001]
0.496516893069 [4, 1, 0.10000000000000001]
0.498649733658 [8, 1, 0.10000000000000001]
0.510563498035 [9, 1, 0.10000000000000001]
0.536532433084 [4, 2, 0.10000000000000001]
0.571056464321 [4, 2, 0.20000000000000001]
0.576715402773 [4, 2, 0.30000000000000004]
10
0.578186546539 [5, 15, 0.30000000000000004]
20
30
40


In [79]:
# leader board score : 0.62682 (parameter tuning using 3-cross validation)
model = xgb.XGBClassifier(max_depth=3, learning_rate=0.3, n_estimators=22, n_jobs=-1)
# model.fit(X, y)
cv_k(X, y, 3, model)

[0.38962197457995579, 0.51801884218466621, 0.25783097546982353]

In [115]:
from sklearn.model_selection import train_test_split

In [116]:
X_train, X_val = train_test_split(X, test_size=0.25)
y_train, y_val = train_test_split(y, test_size=0.25)
model = xgb.XGBClassifier(max_depth=3, learning_rate=0.3, n_estimators=22, n_jobs=-1)
model.fit(X_train, y_train)
preds = model.predict(X_val)
print("f0.5 is : ", fbeta_score(y_val, preds, beta=0.5))
print("precision is : ", precision_score(y_val, preds))
print("recall is : ", recall_score(y_val, preds))

f0.5 is :  0.201465201465
precision is :  0.239130434783
recall is :  0.123595505618


In [96]:
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(825, 22) (275, 22) (825,) (275,)


In [97]:
# leader board score : 0.61445 (parameter tuning using 2-cross validation)
model = xgb.XGBClassifier(max_depth=5, learning_rate=0.3, n_estimators=15, n_jobs=-1)
model.fit(X_train, y_train)
# cv_k(X, y, 3, model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=15,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [108]:
# preds = model.predict(test_df)
# writeToFile(preds, "submissions/submission_xg_no_enc_1.csv")

### RF without mean encoding

In [117]:
model = RandomForestClassifier(n_jobs=-1, n_estimators=4,min_samples_leaf=11,max_features=0.3, max_depth = 7)
model.fit(X_train, y_train)
preds = model.predict(X_val)
print("f0.5 is : ", fbeta_score(y_val, preds, beta=0.5))
print("precision is : ", precision_score(y_val, preds))
print("recall is : ", recall_score(y_val, preds))

f0.5 is :  0.204778156997
precision is :  0.235294117647
recall is :  0.134831460674


In [99]:
# 4, 11, 0.30000000000000004, 7,
# leaderboard score 0.57071
m = RandomForestClassifier(n_jobs=-1, n_estimators=4,min_samples_leaf=11,max_features=0.3, max_depth = 7)
# m.fit(X, y)
# cv_k(X, y, 3, m)

[0.41664432918302885, 0.54867724867724865, 0.28856249621016006]

In [100]:
preds3 = m.predict(test_df)

In [51]:
sum(preds3 == preds)

623

In [52]:
writeToFile(preds3, "submissions/submission_rf_no_enc.csv")

In [34]:
!ls submissions

all_ones.csv   mean_enc_sub.csv       submission_xg_no_enc.csv
all_zeros.csv  submission_xg_enc.csv
