In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

In [50]:
# Load the feature table from CSV; rows are indexed by the 'match_id' column.
# Presumably this is the training set, since the 'radiant_win' label is taken
# from it in the following cell -- TODO confirm.
data = pd.read_csv('data/new_features.csv', index_col='match_id')

In [51]:
# Split the label off the feature table: `pop` returns the 'radiant_win'
# column and removes it from `data` in place, leaving only feature columns.
# Like any in-place removal, this cell cannot be re-run without reloading `data`.
y = data.pop('radiant_win')

In [52]:
# Standardise the features with StandardScaler. The fitted scaler is kept in
# `scaler` instead of being thrown away, so the statistics learned here can be
# re-applied to other data with `scaler.transform(...)`.
# Note: `data` is rebound from the DataFrame to the array that fit_transform
# returns, so later cells index it positionally.
scaler = StandardScaler()
data = scaler.fit_transform(data)

In [53]:
# Sanity check: display the dimensions of the scaled feature matrix.
data.shape

(97216, 1929)

In [82]:
# Cross-validated ROC AUC of an L2-regularised logistic regression.
# NOTE(review): `data` was standardised on all rows before this split, so each
# held-out fold has already influenced the scaling statistics.
n_folds = 5
# A fixed seed makes the shuffled fold assignment, and therefore the printed
# scores, reproducible across kernel restarts.
cv = KFold(n=data.shape[0], n_folds=n_folds, shuffle=True, random_state=42)
fold_scores = []
for train_idx, test_idx in cv:
    clf = LogisticRegression(C=0.00025, penalty='l2')
    clf.fit(data[train_idx], y.iloc[train_idx])
    # Column 1 of predict_proba is the second class in clf.classes_
    # (presumably radiant_win == 1 -- TODO confirm the label encoding).
    y_pred = clf.predict_proba(data[test_idx])[:, 1]
    score = roc_auc_score(y.iloc[test_idx], y_pred)
    # The parenthesised form prints the same text under Python 2 and Python 3.
    print(score)
    fold_scores.append(score)
# Mean AUC over the folds; left as the last expression so the cell displays it.
auc_mean = sum(fold_scores) / float(n_folds)
auc_mean

0.758484969018
0.755210738528
0.760199819316
0.752890589923
0.751508113838


0.75565884612474277

In [83]:
# Final model: fit on every row of `data`, using the same C and penalty as the
# cross-validation loop. The bare `fit` call stays last so the cell still
# displays the fitted estimator's repr.
lr_params = {'penalty': 'l2', 'C': 0.00025}
clf = LogisticRegression(**lr_params)
clf.fit(data, y)

LogisticRegression(C=0.00025, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [84]:
# Load the test feature table from CSV, indexed by the 'match_id' column.
# NOTE(review): nothing here checks that its columns match the columns (and
# their order) that the model was fitted on -- verify before predicting.
data_test = pd.read_csv('data/new_features_test.csv', index_col='match_id')

In [85]:
# Score the test matches: standardise the test features, take column 1 of
# predict_proba as the prediction, and label each row with its match id.
# NOTE(review): a fresh StandardScaler is fitted on the test set here, so the
# test features are scaled with different statistics than the data `clf` was
# trained on -- consider reusing the scaler fitted on the training features.
test_scaled = StandardScaler().fit_transform(data_test)
win_proba = clf.predict_proba(test_scaled)[:, 1]
y_pred = pd.DataFrame(
    win_proba,
    index=data_test.index,
    columns=['radiant_win'],
)

In [86]:
# Write the predictions to the submission CSV, with a header row and the
# DataFrame index as the first column.
# The file name appears to encode model / feature count / C value -- TODO confirm.
file_name = 'data/submissions/lr-1930-0_00025.csv'
y_pred.to_csv(file_name, header=True, index=True)

In [87]:
# Append rows for the matches missing from the predictions, each with the
# same constant baseline probability.
# NOTE(review): the ids and the 0.5185 baseline are hard-coded; re-running this
# cell without rewriting the CSV first appends the same rows again.
missing_match_ids = (1811, 6336, 13048, 27418)
with open(file_name, 'a') as f:
    f.writelines("%d,0.5185\n" % match_id for match_id in missing_match_ids)