In [6]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

In [15]:
def del_future_features(data):
    try:
        del data['tower_status_radiant']
        del data['tower_status_dire']
        del data['barracks_status_radiant']
        del data['barracks_status_dire']
        del data['duration']
    except:
        pass
    
def get_nan_columns(data):
    data_size = data.shape[0]
    return [c for c in data.columns if data[c].count() < data_size]

def del_categorial(data):
    try:
        for x in range(1, 6):
            del data['r{0}_hero'.format(x)]
            del data['d{0}_hero'.format(x)]
    except:
        pass
        
def get_pick_bof(data):
    data_size = data.shape[0]
    X_pick = np.zeros(shape=(data_size, 112))
    for i, match_id in enumerate(data.index):
        for p in xrange(5):
            X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
    return X_pick

def scale_features(data):
    return StandardScaler().fit_transform(data)

In [50]:
def data_for_GBR(file_path):
    data = pd.read_csv(file_path, index_col="match_id")

    # Remove look-into-future features
    del_future_features(data)

    # Remove strange lower value for radiant_first_ward_time feature
    data = data[(data['radiant_first_ward_time'] >= -90) | (data['radiant_first_ward_time'].isnull())]

    # Bag-of-heroes
    X_pick = get_pick_bof(data)

    # Remove categorial features
    del_categorial(data)

    # Select target variable
    y = data['radiant_win']
    del data['radiant_win']

    # Convert NaNs to very small value
    for c in get_nan_columns(data):
        data.loc[(data[c].isnull()), c] = -1000.

    # Add converted heroes to train set
    data = np.hstack((data, X_pick))
    
    return data, y

In [47]:
def data_for_LR(file_path):
    data = pd.read_csv(file_path, index_col="match_id")

    # Remove look-into-future features
    del_future_features(data)

    # Remove strange lower value for radiant_first_ward_time feature
    data = data[(data['radiant_first_ward_time'] >= -90) | (data['radiant_first_ward_time'].isnull())]

    # Some deletion
    # del data['start_time']
    lt = data['lobby_type'].apply(lambda x: -1 if x == 7 else x)
    del data['lobby_type']

    # Bag-of-heroes
    X_pick = get_pick_bof(data)

    # Remove categorial features
    del_categorial(data)

    # Select target variable
    y = data['radiant_win']
    del data['radiant_win']

    # Convert NaNs to means
    for c in get_nan_columns(data):
        mean = data[c].mean()
        data.loc[(data[c].isnull()), c] = mean

    # Scale features for LR
    data = scale_features(data)

    # Add converted heroes to train set
    data = np.hstack((data, X_pick))

    # Add lobby
    # data = np.hstack((data, lt.reshape(lt.shape[0], 1)))
    
    return data, y

In [48]:
data, y = data_for_LR("data/features.csv")

In [36]:
n_folds = 5
cv = KFold(n=data.shape[0], n_folds=n_folds, shuffle=True)
grid_params = { 'C': np.logspace(-2, 2, 5) }
clf = LogisticRegression(penalty="l2")
grid = GridSearchCV(estimator=clf, param_grid=grid_params, scoring='roc_auc', cv=cv, verbose=True, n_jobs=1)
grid.fit(data, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.4min finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=97229, n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc',
       verbose=True)

In [37]:
grid.grid_scores_

[mean: 0.75167, std: 0.00149, params: {'C': 0.01},
 mean: 0.75192, std: 0.00166, params: {'C': 0.10000000000000001},
 mean: 0.75192, std: 0.00170, params: {'C': 1.0},
 mean: 0.75192, std: 0.00170, params: {'C': 10.0},
 mean: 0.75192, std: 0.00170, params: {'C': 100.0}]

In [49]:
n_folds = 5
cv = KFold(n=data.shape[0], n_folds=n_folds, shuffle=True)
auc_mean = 0.0
for train_idx, test_idx in cv:
    clf = LogisticRegression(C=0.1, penalty='l2')
    clf.fit(data[train_idx], y.iloc[train_idx])
    y_pred = clf.predict_proba(data[test_idx])[:, 1]
    score = roc_auc_score(y.iloc[test_idx], y_pred)
    print score
    auc_mean += score
auc_mean = auc_mean / float(n_folds)
auc_mean

0.752815128275
0.749586531267
0.749961803829
0.75179416175
0.756350523377


0.75210162969948358

In [51]:
data, y = data_for_GBR("data/features.csv")

In [55]:
n_folds = 5
cv = KFold(n=data.shape[0], n_folds=n_folds, shuffle=True)
auc_mean = 0.0
for train_idx, test_idx in cv:
    clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.2, max_depth=3, verbose=True)
    clf.fit(data[train_idx], y.iloc[train_idx])
    y_pred = clf.predict_proba(data[test_idx])[:, 1]
    score = roc_auc_score(y.iloc[test_idx], y_pred)
    print score
    auc_mean += score
auc_mean = auc_mean / float(n_folds)
auc_mean

      Iter       Train Loss   Remaining Time 
         1           1.3727            3.55m
         2           1.3631            3.54m
         3           1.3538            3.49m
         4           1.3456            3.45m
         5           1.3376            3.43m
         6           1.3309            3.43m
         7           1.3243            3.41m
         8           1.3182            3.39m
         9           1.3125            3.37m
        10           1.3080            3.35m
        20           1.2732            3.18m
        30           1.2522            3.00m
        40           1.2371            2.81m
        50           1.2252            2.62m
        60           1.2150            2.44m
        70           1.2060            2.26m
        80           1.1985            2.08m
        90           1.1911            1.91m
       100           1.1845            1.73m
       200           1.1373            0.00s
0.736437832312
      Iter       Train Loss   Remaining

0.73676812115402568

In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
n_folds = 5
cv = KFold(n=data.shape[0], n_folds=n_folds, shuffle=True)
auc_mean = 0.0
for train_idx, test_idx in cv:
    clf = ExtraTreesClassifier(n_estimators=1000, verbose=True, n_jobs=4)
    clf.fit(data[train_idx], y.iloc[train_idx])
    y_pred = clf.predict_proba(data[test_idx])[:, 1]
    score = roc_auc_score(y.iloc[test_idx], y_pred)
    print score
    auc_mean += score
auc_mean = auc_mean / float(n_folds)
auc_mean

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   25.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   59.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    3.8s finished


0.715528614343


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.2s


KeyboardInterrupt: 