In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
data = pd.read_csv("data/features.csv", index_col="match_id")

In [15]:
data_size = data.shape[0]
print data_size

97230


In [16]:
nan_columns = [c for c in data.columns if data[c].count() < data_size]
min_columns = zip(nan_columns, (data[c].min() for c in nan_columns))

In [17]:
nan_columns, min_columns

(['first_blood_time',
  'first_blood_team',
  'first_blood_player1',
  'first_blood_player2',
  'radiant_bottle_time',
  'radiant_courier_time',
  'radiant_flying_courier_time',
  'radiant_first_ward_time',
  'dire_bottle_time',
  'dire_courier_time',
  'dire_flying_courier_time',
  'dire_first_ward_time'],
 [('first_blood_time', -78.0),
  ('first_blood_team', 0.0),
  ('first_blood_player1', 0.0),
  ('first_blood_player2', 0.0),
  ('radiant_bottle_time', -37.0),
  ('radiant_courier_time', -90.0),
  ('radiant_flying_courier_time', 180.0),
  ('radiant_first_ward_time', -236.0),
  ('dire_bottle_time', -45.0),
  ('dire_courier_time', -90.0),
  ('dire_flying_courier_time', 180.0),
  ('dire_first_ward_time', -84.0)])

In [18]:
for c in nan_columns:
    mean_rw = data[data['radiant_win'] == 1][c].mean()
    mean_rl = data[data['radiant_win'] == 0][c].mean()
    data.loc[(data[c].isnull()) & (data['radiant_win'] == 1), c] = mean_rw
    data.loc[(data[c].isnull()) & (data['radiant_win'] == 0), c] = mean_rl

In [19]:
data = data[data['radiant_first_ward_time'] >= -90] # просто удалим запись с выбросом
data_size = data.shape[0]

In [26]:
X_pick = np.zeros(shape=(data_size, 112))
for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [28]:
del data['lobby_type']
for x in range(1, 6):
    del data['r{0}_hero'.format(x)]
    del data['d{0}_hero'.format(x)]

In [29]:
del data['duration']
del data['tower_status_radiant']
del data['tower_status_dire']
del data['barracks_status_radiant']
del data['barracks_status_dire']

In [30]:
y = data['radiant_win']
del data['radiant_win']

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

In [32]:
data = StandardScaler().fit_transform(data)

In [33]:
cv = KFold(n=data_size, n_folds=5, shuffle=True)

In [34]:
auc_mean = 0.0
for train_idx, test_idx in cv:
    clf = LogisticRegression(penalty="l2", C=0.01)
    clf.fit(data[train_idx], y.iloc[train_idx])
    y_pred = clf.predict_proba(data[test_idx])[:, 1]
    auc_mean += roc_auc_score(y.iloc[test_idx], y_pred)
auc_mean = auc_mean / 5.0    

In [35]:
print auc_mean

0.72059063058


In [40]:
data = np.hstack((data, X_pick))

In [45]:
auc_mean = 0.0
for train_idx, test_idx in cv:
    clf = LogisticRegression(penalty="l2", C=0.05)
    clf.fit(data[train_idx], y.iloc[train_idx])
    y_pred = clf.predict_proba(data[test_idx])[:, 1]
    auc_mean += roc_auc_score(y.iloc[test_idx], y_pred)
auc_mean = auc_mean / 5.0

In [46]:
print auc_mean

0.754522826975


In [48]:
from sklearn.ensemble import GradientBoostingRegressor

In [53]:
auc_mean = 0.0
for train_idx, test_idx in cv:
    clf = GradientBoostingRegressor(n_estimators=30, verbose=True)
    clf.fit(data[train_idx], y.iloc[train_idx])
    y_pred = clf.predict(data[test_idx])
    auc_mean += roc_auc_score(y.iloc[test_idx], y_pred)
auc_mean = auc_mean / 5.0

      Iter       Train Loss   Remaining Time 
         1           0.2246            1.01m
         2           0.2043           58.42s
         3           0.1879           56.60s
         4           0.1744           54.18s
         5           0.1633           52.15s
         6           0.1542           49.88s
         7           0.1467           47.83s
         8           0.1405           45.91s
         9           0.1354           43.91s
        10           0.1312           41.81s
        20           0.1004           21.95s
        30           0.0829            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.2246            1.02m
         2           0.2043           58.02s
         3           0.1878           55.83s
         4           0.1743           53.47s
         5           0.1633           50.98s
         6           0.1542           48.63s
         7           0.1467           46.58s
         8           0.1405           44.62s
        

In [54]:
auc_mean

0.96945287650104617