In [2]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.metrics import mean_squared_error, make_scorer, roc_auc_score, log_loss
from sklearn.cross_validation import train_test_split, cross_val_score, cross_val_predict, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, scale, StandardScaler, normalize, OneHotEncoder
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.utils import shuffle
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from scipy.sparse import csc_matrix, coo_matrix, csr_matrix, hstack
from scipy.stats.mstats import gmean, hmean
from scipy.stats import *
from collections import defaultdict, Counter
from itertools import product, combinations

import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline
sns.set(style="whitegrid", palette="muted")

In [6]:
train = pd.read_csv('train2.csv')
mir_train = train.rename(columns={'team1':'team2', 'team2':'team1', 'score1':'score2', 'score2':'score1'})
mir_train['target'] = 1 - mir_train.target
train = pd.concat([train, mir_train])
train = train.reset_index().sort('index').reset_index().drop(['level_0', 'index'], axis=1)

test = pd.read_csv('test2.csv')
mir_test = train.rename(columns={'team1':'team2', 'team2':'team1'})
mir_test['Id'] = np.nan
test = pd.concat([test, mir_test])

In [4]:
#train, test = train[train.year < 3019], train[train.year >= 3019]

In [8]:
yearday = sorted(set(zip(train.year, train.day)))
e = np.ones(train.shape[0])
S_train = csc_matrix((e, (train.index.values, train.team1))) - csc_matrix((e, (train.index.values, train.team2)))
e = np.ones(test.shape[0])
S_test = csc_matrix((e, (np.arange(test.shape[0]), test.team1))) - csc_matrix((e, (np.arange(test.shape[0]), test.team2)))

In [9]:
def get_slice(sparse_df, df, y, yearday1, yearday2):
    ind1 = min(np.where(np.logical_and(df.year >= yearday1[0], df.day >= yearday1[1]))[0])
    ind2 = max(np.where(np.logical_and(df.year <= yearday2[0], df.day <= yearday2[1]))[0])
    return sparse_df[ind1:ind2], y[ind1:ind2]

def get_ind(df, yearday):
    ind = np.logical_and(df.year == yearday[0], df.day == yearday[1])
    return ind.values

In [10]:
n = 1000
train['logreg'] = 0
lr = LogisticRegression(C=0.06)
for i in range(len(yearday) - n - 1):
    X_, y_ = get_slice(S_train, train, train.target.values, yearday[i], yearday[i + n])
    lr.fit(X_, y_)
    ind = get_ind(train, yearday[i + n + 1])
    pred = lr.predict_proba(S_train[ind])
    train.loc[ind, 'logreg'] = pred[:,1]
    if i % 100 == 0: print i,
X_, y_ = get_slice(S_train, train, train.target.values, yearday[len(yearday) - n - 1], yearday[len(yearday) - 1])
lr.fit(X_, y_)
pred = lr.predict_proba(S_test)
test['logreg'] = pred[:,1]

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600


In [11]:
n = 1000
train['linreg'] = 0
lr = LinearRegression(fit_intercept=0)
for i in range(len(yearday) - n - 1):
    X_, y_ = get_slice(S_train, train, train.score1.values - train.score2.values, yearday[i], yearday[i + n])
    lr.fit(X_, y_)
    ind = get_ind(train, yearday[i + n + 1])
    pred = lr.predict(S_train[ind])
    train.loc[ind, 'linreg'] = pred
    if i % 100 == 0: print i,
X_, y_ = get_slice(S_train, train, train.target.values, yearday[len(yearday) - n - 1], yearday[len(yearday) - 1])
lr.fit(X_, y_)
pred = lr.predict(S_test)
test['linreg'] = pred

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600


In [12]:
def history_rating(team1_train, team2_train, target, team1_test, team2_test, gamma=0.1):
    teams = list(set(zip(team1_train, team2_train)))
    d = dict(zip(teams, np.zeros(len(teams))))
    w1, w2 = [], []
    for t1, t2, y in zip(team1_train, team2_train, target):
        w1.append(0.5 + d[(t1, t2)])
        w2.append(0.5 + d[(t2, t1)])
        d[(t1, t2)] += d[(t1, t2)]*gamma + y
    w1, w2 = np.array(w1), np.array(w2)
    
    w1_, w2_ = [], []
    for t1, t2 in zip(test.team1, test.team2):
        if (t1, t2) in d:
            w1_.append(0.5 + d[(t1, t2)])
            w2_.append(0.5 + d[(t2, t1)])
        else:
            w1_.append(0.5)
            w2_.append(0.5)
    w1_, w2_ = np.array(w1_), np.array(w2_)
    return w1, w2, w1_, w2_

def win_mean(team_train, target, team_test, gamma=0.1):
    teams = list(set(team_train))
    d = dict(zip(teams, np.zeros(len(teams))))
    w, w_ = [], []
    for t, y in zip(team_train, target):
        w.append(d[t])
        d[t] = d[t]* gamma + y
    for t in team_test:
        if t in d:
            w_.append(d[t])
        else:
            w_.append(0)
    return w, w_

In [13]:
teams = list(set(train.team1))
for i, weight_func in enumerate([lambda x: x**2, lambda x: x**3, lambda x: x**4,
                                 lambda x: 1.01**x, lambda x: 1.05**x, lambda x: 1.1**x, lambda x: 1.2**x, 
                                 lambda x: np.log1p(x)**0.5, lambda x: np.log1p(x)**0.8, lambda x: np.log1p(x)**1, 
                                 lambda x: np.log1p(x)**1.2, lambda x: np.log1p(x)**1.5, lambda x: np.log1p(x)**2, 
                                 lambda x: np.log1p(x)**3, lambda x: np.log1p(x)**4, lambda x: 1.01**np.log1p(x), 
                                 lambda x: 1.05**np.log1p(x), lambda x: 1.1**np.log1p(x), lambda x: 1.2**np.log1p(x)]):
    train['team1_mean' + str(i)] = 0
    train['team2_mean' + str(i)] = 0
    test['team1_mean' + str(i)] = 0
    test['team2_mean' + str(i)] = 0
    for t in teams:
        y = train[train.team1 == t].target.values
        w = weight_func(np.arange(len(y), 0, -1))
        p = np.convolve(y, w)[:len(w)] * 1. / np.cumsum(w)
        train.loc[train.team1 == t, 'team1_mean' + str(i)] = np.concatenate([[0.5], p[:-1]])
        test.loc[test.team1 == t, 'team1_mean' + str(i)] = p[-1]
    for t in teams:
        y = 1 - train[train.team2 == t].target.values
        w = weight_func(np.arange(len(y), 0, -1))
        p = np.convolve(y, w)[:len(w)] * 1. / np.cumsum(w)
        train.loc[train.team2 == t, 'team2_mean' + str(i)] = np.concatenate([[0.5], p[:-1]])
        test.loc[test.team2 == t, 'team2_mean' + str(i)] = p[-1]
    train['team1_mean_dif' + str(i)] = train['team1_mean' + str(i)].values - train['team2_mean' + str(i)].values
    test['team1_mean_dif' + str(i)] = test['team1_mean' + str(i)].values - test['team2_mean' + str(i)].values
    train = train.drop(['team1_mean' + str(i), 'team2_mean' + str(i)], axis=1)
    test = test.drop(['team1_mean' + str(i), 'team2_mean' + str(i)], axis=1)

In [14]:
for g in [0.1, 0.05, 0.01, 0.005, 0.001]:
    w1, w2, w1_, w2_ = history_rating(train.team1, train.team2, train.target, test.team1, test.team2, gamma=g)
    g = str(g)
    train['rating1_' + g], train['rating2_' + g] = w1, w2
    test['rating1_' + g], test['rating2_' + g] = w1_, w2_
    train['rating_dif_' + g] = train['rating1_' + g] - train['rating2_' + g]
    #train['rating_rat_' + g] = train['rating1_' + g] / (train['rating1_' + g] + train['rating2_' + g])
    test['rating_dif_' + g] = test['rating1_' + g] - test['rating2_' + g]
    #test['rating_rat_' + g] = test['rating1_' + g] / (test['rating1_' + g] + test['rating2_' + g])

In [15]:
for g in [0.95]:
    w1, w2 = win_mean(train.team1, train.target, test.team1, gamma=g)
    train['win1' + str(g)], test['win1' + str(g)] = w1, w2
    w1, w2 = win_mean(train.team2, 1 - train.target, test.team2, gamma=g)
    train['win2' + str(g)], test['win2' + str(g)] = w1, w2
    train['win_dif' + str(g)] = train['win1' + str(g)] - train['win2' + str(g)]
    test['win_dif' + str(g)] = test['win1' + str(g)] - test['win2' + str(g)]
    train = train.drop(['win1' + str(g), 'win2' + str(g)], axis=1)
    test = test.drop(['win1' + str(g), 'win2' + str(g)], axis=1)

In [16]:
data = pd.concat([train, test])


d = data.groupby(['year', 'team1']).mean()[['score1', 'score2']]
d = d.reset_index().rename(columns={'score1': 'mean_score1_of_team1', 'score2': 'mean_score2_of_team1'})
d.year += 2
data = pd.merge(data, d, on=['year', 'team1'], how='left')

d = data.groupby(['year', 'team2']).mean()[['score1', 'score2']]
d = d.reset_index().rename(columns={'score1': 'mean_score1_of_team2', 'score2': 'mean_score2_of_team2'})
d.year += 2
data = pd.merge(data, d, on=['year', 'team2'], how='left')


d = data.groupby(['year', 'team1']).std()[['score1', 'score2']]
d = d.reset_index().rename(columns={'score1': 'std_score1_of_team1', 'score2': 'std_score2_of_team1'})
d.year += 2
data = pd.merge(data, d, on=['year', 'team1'], how='left')

d = data.groupby(['year', 'team2']).std()[['score1', 'score2']]
d = d.reset_index().rename(columns={'score1': 'std_score1_of_team2', 'score2': 'std_score2_of_team2'})
d.year += 2
data = pd.merge(data, d, on=['year', 'team2'], how='left')

data['m1'] = data.mean_score1_of_team1 - data.mean_score1_of_team2
data['m2'] = data.mean_score1_of_team1 - data.mean_score2_of_team1
data['m3'] = data.mean_score1_of_team1 - data.mean_score2_of_team2
data['m4'] = data.mean_score1_of_team2 - data.mean_score2_of_team1
data['m5'] = data.mean_score1_of_team2 - data.mean_score2_of_team2
data['m6'] = data.mean_score2_of_team1 - data.mean_score2_of_team2

data['s1'] = data.std_score1_of_team1 - data.std_score1_of_team2
data['s2'] = data.std_score1_of_team1 - data.std_score2_of_team1
data['s3'] = data.std_score1_of_team1 - data.std_score2_of_team2
data['s4'] = data.std_score1_of_team2 - data.std_score2_of_team1
data['s5'] = data.std_score1_of_team2 - data.std_score2_of_team2
data['s6'] = data.std_score2_of_team1 - data.std_score2_of_team2

n = train.shape[0]
train, test = data[:n], data[n:]

In [17]:
train.head()

Unnamed: 0,Id,day,linreg,logreg,rating1_0.001,rating1_0.005,rating1_0.01,rating1_0.05,rating1_0.1,rating2_0.001,...,m3,m4,m5,m6,s1,s2,s3,s4,s5,s6
0,,19,0,0,0.5,0.5,0.5,0.5,0.5,0.5,...,,,,,,,,,,
1,,19,0,0,0.5,0.5,0.5,0.5,0.5,1.5,...,,,,,,,,,,
2,,28,0,0,0.5,0.5,0.5,0.5,0.5,0.5,...,,,,,,,,,,
3,,28,0,0,0.5,0.5,0.5,0.5,0.5,1.5,...,,,,,,,,,,
4,,28,0,0,0.5,0.5,0.5,0.5,0.5,0.5,...,,,,,,,,,,


In [18]:
dropcols = ['Id', 'day', 'score1', 'score2', 'year', 'target', 'year']
rf = RandomForestClassifier(n_estimators=1000, max_depth=None, n_jobs=-1, random_state=0)
rf.fit(train.drop(dropcols, axis=1).fillna(-1), train.target)
pred_rf = rf.predict_proba(test.drop(dropcols, axis=1).fillna(-1))[:,1]

In [94]:
log_loss(test.target, pred_rf)

0.64481491371216493

In [19]:
sorted(zip(-rf.feature_importances_, train.drop(dropcols, axis=1).columns))

[(-0.051256292856207079, 'win_dif0.95'),
 (-0.048217419622430237, 'team1_mean_dif4'),
 (-0.046526303308039765, 'team1_mean_dif5'),
 (-0.038154221736638648, 'team1_mean_dif6'),
 (-0.028742102539181812, 'team1_mean_dif3'),
 (-0.026277867161106843, 'team1_mean_dif2'),
 (-0.024082113925763624, 'rating2_0.01'),
 (-0.023569635469600506, 'rating2_0.001'),
 (-0.023546194472670104, 'rating2_0.005'),
 (-0.02350592688059501, 'team1_mean_dif1'),
 (-0.023475268957938458, 'rating2_0.1'),
 (-0.02329973119487553, 'rating2_0.05'),
 (-0.023286077001380329, 'linreg'),
 (-0.021478781511281436, 'rating_dif_0.001'),
 (-0.021444331166161731, 'rating_dif_0.005'),
 (-0.020159603138981488, 'team1_mean_dif0'),
 (-0.01992113901106848, 'rating_dif_0.01'),
 (-0.019104703525147433, 'team2'),
 (-0.019043656048740528, 'team1'),
 (-0.018473730859930704, 'rating_dif_0.05'),
 (-0.016619888214484371, 'team1_mean_dif14'),
 (-0.016079829127219461, 'rating_dif_0.1'),
 (-0.015273245431479629, 'logreg'),
 (-0.01480452355505626

In [30]:
ohe = OneHotEncoder()
d1, d1_ = ohe.fit_transform(train[['team1']]), ohe.transform(test[['team1']])
d2, d2_ = ohe.fit_transform(train[['team2']]), ohe.transform(test[['team2']])
lr = LogisticRegression(C=0.24)
n = 70000
lr.fit((d1 - d2)[-n:], train.target[-n:])
pred_lr = lr.predict_proba(d1_ - d2_)[:,1]
#log_loss(test.target, pred_lr)

In [99]:
x = [log_loss(test.target, pred_lr**w * pred_rf**(1 - w)) for w in np.linspace(0, 1, 101)]
np.argmin(x), np.min(x)

(72, 0.6064370169725547)

In [71]:
192578 - 120000

72578

In [None]:
scores = []
for C in np.linspace(0.1, 0.5, 21):
    for n in [50000, 60000, 70000, 80000, 90000, 100000]:
        lr.fit((d1 - d2)[-n:], train.target[-n:])
        pred_lr = lr.predict_proba(d1_ - d2_)[:,1]
        score = log_loss(test.target, pred_lr)
        scores.append((score, n, C))

In [116]:
np.linspace(0.1, 0.5, 21)

array([ 0.1 ,  0.12,  0.14,  0.16,  0.18,  0.2 ,  0.22,  0.24,  0.26,
        0.28,  0.3 ,  0.32,  0.34,  0.36,  0.38,  0.4 ,  0.42,  0.44,
        0.46,  0.48,  0.5 ])

In [26]:
ind = test.Id.notnull().values

In [32]:
w = 0.7
pred = pred_lr**w * pred_rf**(1 - w)
pd.DataFrame({'Id': test[ind].Id.astype(int), 'target': pred[ind]}).to_csv('subm.csv', index=0)

In [24]:
test.Id.notnull().values

array([ True,  True,  True, ..., False, False, False], dtype=bool)