In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train2.csv')
test = pd.read_csv(filepath_or_buffer='test2.csv')

## Линейная модель

### Валидация

In [None]:
# Time-based split: rows with year < 3018 are used for fitting, rows with
# year >= 3018 for validation. Only the columns the linear model needs are kept.
s_train = train.loc[train['year'] < 3018, ['year', 'team1', 'team2', 'target']]
s_val = train.loc[train['year'] >= 3018, ['year', 'team1', 'team2', 'target']]

In [None]:
def balanceTrainVal(s_train, s_val):
    """Filter the train and validation frames so that they share team ids.

    The filtering is done in three consecutive steps, each working on the
    result of the previous one:

    1. keep validation rows whose ``team1`` and ``team2`` both occur in the
       ``team1`` column of ``s_train``;
    2. keep training rows whose ``team1`` and ``team2`` both occur in the
       ``team1`` column of the filtered validation frame;
    3. keep validation rows whose ``team2`` occurs in that same validation
       ``team1`` set.

    Parameters
    ----------
    s_train, s_val : pandas.DataFrame
        Frames with at least the columns ``team1`` and ``team2``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The filtered training and validation frames. The inputs are not
        modified.
    """
    def both_teams_in(frame, allowed):
        # ``isin`` always yields a boolean mask, even for an empty frame.
        # (``Series.map`` on an empty Series yields a non-boolean Series,
        # which pandas would interpret as a column selection.)
        return frame['team1'].isin(allowed) & frame['team2'].isin(allowed)

    train_teams = set(s_train['team1'])
    s_val = s_val[both_teams_in(s_val, train_teams)]

    val_teams = set(s_val['team1'])
    s_train = s_train[both_teams_in(s_train, val_teams)]
    s_val = s_val[s_val['team2'].isin(val_teams)]
    return s_train, s_val

In [None]:
# Restrict both parts of the split to the teams accepted by balanceTrainVal.
# Note: this rebinds s_train / s_val, so re-running the cell filters the
# already-filtered frames again.
s_train, s_val = balanceTrainVal(s_train, s_val)

In [None]:
def MyHotEncoder(df):
    """Build a signed one-hot team encoding and split off the target.

    For every team id a column ``team1_<id>`` is produced whose value is
    ``+1`` when the team is ``team1`` of the row, ``-1`` when it is ``team2``
    and ``0`` otherwise. All other columns of ``df`` (including the raw
    ``team1`` / ``team2`` ids) are passed through with their original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``team1``, ``team2`` and ``target``.
        Column names are expected to be strings (they are tested with
        ``'team1' in name``).

    Returns
    -------
    X : pandas.DataFrame
        Every column of the combined frame except ``target``.
    y : pandas.Series
        The ``target`` column.
    """
    team_1_enc = pd.get_dummies(data=df, columns=['team1'])
    # Same prefix as above so that both frames use identical dummy names.
    team_2_enc = pd.get_dummies(data=df, columns=['team2'], prefix='team1')
    # Zero every pass-through column of the second frame so that the
    # subtraction below leaves the first frame's values untouched.
    for col in team_2_enc.columns:
        if 'team1' not in col:
            team_2_enc[col] = 0
    # The raw 'team1' id column exists only in the second frame; negating it
    # here cancels the (-1) factor below, so the id survives unchanged.
    team_2_enc['team1'] = - team_2_enc['team1']
    combined_df = team_1_enc.add(team_2_enc * (-1), fill_value=0)
    y = combined_df['target']
    # Keyword ``axis``: the positional form was removed in pandas 2.0.
    X = combined_df.drop('target', axis=1)
    return X, y

In [None]:
# Encode the two parts of the split independently; each call derives its
# dummy columns from the teams present in the frame it is given.
xtrain, ytrain = MyHotEncoder(s_train)
xval, yval = MyHotEncoder(s_val)

In [None]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(C=0.1, random_state=0)
LR.fit(xtrain, ytrain)
# The train and validation frames were one-hot encoded independently, so
# their dummy columns are only identical when both contain exactly the same
# teams. Align the validation columns to the fitted ones: teams unseen in
# training are dropped, teams missing from validation become all-zero columns.
xval_aligned = xval.reindex(columns=xtrain.columns, fill_value=0)
# Column 1 of predict_proba corresponds to the second entry of LR.classes_.
prediction_lr = LR.predict_proba(xval_aligned)[:, 1]

In [None]:
from sklearn.metrics import log_loss

# Validation log-loss of the logistic-regression probabilities.
log_loss(y_true=yval, y_pred=prediction_lr)

### Тест

In [None]:
def makePrediction(train, test):
    """Fit the logistic-regression model on ``train`` and score ``test``.

    Both frames are concatenated and encoded together with ``MyHotEncoder``
    so that they share one set of dummy columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``year``, ``team1``, ``team2`` and ``target``.
    test : pandas.DataFrame
        Must contain an ``Id`` column; rows are recognised as test rows by
        their missing ``target``.

    Returns
    -------
    numpy.ndarray
        ``predict_proba(...)[:, 1]`` for the test rows, in their original order.
    """
    united = pd.concat([train[['year', 'team1', 'team2', 'target']], test], ignore_index=True)
    # Row roles are determined before encoding: training rows have no Id,
    # test rows have no target.
    itr = united[united['Id'].isnull()].index.values
    ite = united[united['target'].isnull()].index.values
    # The encoder is defined in this notebook as ``MyHotEncoder``.
    X_united, y_united = MyHotEncoder(united)
    # 'Id' is an identifier, not a feature.
    features = X_united.drop('Id', axis=1)
    LR = LogisticRegression(C=0.1, random_state=0)
    # ``.loc`` replaces the removed ``.ix``; the index is a fresh RangeIndex
    # (ignore_index=True), so the labels in itr / ite are row numbers.
    LR.fit(features.loc[itr], y_united.loc[itr].values.astype(int))
    prediction = LR.predict_proba(features.loc[ite])[:, 1]
    return prediction

In [None]:
# Logistic-regression probabilities for every row of the test table.
pred = makePrediction(train, test)

In [None]:
# Take the submission template, fill in the logistic-regression predictions
# and store the result; this file is read back later for blending.
subm = pd.read_csv('sample_submission.csv').assign(target=pred)
subm.to_csv('subm_LR.csv', index=False)

## Бустинг

### Валидация

In [None]:
# Same time-based split as for the linear model, but all columns are kept
# because the boosting features are derived from scores and days as well.
s_train = train.loc[train['year'] < 3018]
s_val = train.loc[train['year'] >= 3018]

In [None]:
# Apply the same team filtering as in the linear-model section.
s_train, s_val = balanceTrainVal(s_train, s_val)

In [None]:
from collections import defaultdict

def statsCount(train):
    """Compute per-team score and win statistics.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``team1``, ``team2``, ``score1``, ``score2`` and
        ``target``. A truthy ``target`` means ``team1`` won the game (the
        same convention ``myStat`` in this notebook relies on).

    Returns
    -------
    teams_means : dict
        team id -> average of the team's mean ``score1`` (games as team1)
        and mean ``score2`` (games as team2).
    teams_wins : collections.defaultdict(int)
        team id -> number of games won, in either role.
    teams_medians : dict
        Like ``teams_means`` but built from medians.

    When a team appears in only one role, the statistic of that role is used
    on its own instead of producing NaN.
    """
    def _average(first, second):
        # Average of the values that are not NaN; NaN only if both are NaN.
        known = [value for value in (first, second) if not pd.isnull(value)]
        return float(sum(known)) / len(known) if known else float('nan')

    teams_means = {}
    teams_medians = {}
    teams_wins = defaultdict(int)

    for team_id in set(train['team1'].unique()) | set(train['team2'].unique()):
        # Filter once per role instead of once per statistic.
        as_team1 = train[train['team1'] == team_id]
        as_team2 = train[train['team2'] == team_id]

        teams_means[team_id] = _average(as_team1['score1'].mean(),
                                        as_team2['score2'].mean())
        teams_medians[team_id] = _average(as_team1['score1'].median(),
                                          as_team2['score2'].median())

        # target counts team1's wins, so a game played as team2 is won
        # when target is 0.
        teams_wins[team_id] += as_team1['target'].sum()
        teams_wins[team_id] += (as_team2['target'] == 0).sum()

    return teams_means, teams_wins, teams_medians

In [None]:
def myStat(train):
    """Compute a day-based weighted score for every team.

    Rows are processed in frame order. Whenever the ``day`` value changes,
    the results collected for the previous day are scored:

    * if both teams of the first game of the new day were winners on the
      previous day, every winner of the previous day receives ``+10``;
    * otherwise every loser of the previous day receives ``-1``.

    Scoring happens only when a later day begins, so the results of the last
    day in the frame are never scored.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``day``, ``team1``, ``team2`` and ``target``; a truthy
        ``target`` marks ``team1`` as the winner of the row.

    Returns
    -------
    collections.defaultdict(int)
        team id -> accumulated score. Teams that never received a bonus or a
        penalty have no entry.
    """
    teams_weightedWins = defaultdict(int)
    day = -1
    winners_of_the_day = set()
    losers_of_the_day = set()
    # Iterate over the needed columns directly; this replaces the per-cell
    # ``.ix`` lookups (``.ix`` was removed in pandas 1.0).
    games = zip(train['day'], train['team1'], train['team2'], train['target'])
    for game_day, team1, team2, target in games:
        if day != game_day:
            if team1 in winners_of_the_day and team2 in winners_of_the_day:
                for team in winners_of_the_day:
                    teams_weightedWins[team] += 10
            else:
                for team in losers_of_the_day:
                    teams_weightedWins[team] -= 1
            day = game_day
            winners_of_the_day = set()
            losers_of_the_day = set()
        if target:
            winners_of_the_day.add(team1)
            losers_of_the_day.add(team2)
        else:
            winners_of_the_day.add(team2)
            losers_of_the_day.add(team1)
    return teams_weightedWins

In [None]:
# Team statistics are computed from the training part of the split only.
teams_means, teams_wins, teams_medians = statsCount(s_train)
teams_weightedWins = myStat(s_train)

In [None]:
def replaceTeams(source_df, teams_weightedWins, teams_means, teams_wins, teams_medians):
    """Return a copy of ``source_df`` extended with per-team feature columns.

    For each side ``N`` in (1, 2) the columns ``meanN``, ``winN``,
    ``medianN`` and ``wWinsN`` are added by looking up the id in ``teamN``
    in the corresponding table.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Must contain ``team1`` and ``team2``. It is not modified.
    teams_weightedWins : mapping
        team id -> weighted score; ids without an entry get ``0``.
    teams_means, teams_wins, teams_medians : dict
        team id -> statistic. These lookups use ``Series.replace``, so an id
        without an entry keeps the raw team id as its value; the tables are
        expected to cover every team in ``source_df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the eight feature columns appended in the order
        mean1, mean2, win1, win2, median1, median2, wWins1, wWins2.
    """
    # Work on a copy: the callers pass slices of other frames, and assigning
    # columns to the argument itself would mutate the caller's data.
    df = source_df.copy()
    sides = ('1', '2')
    for prefix, table in (('mean', teams_means), ('win', teams_wins), ('median', teams_medians)):
        for side in sides:
            df[prefix + side] = df['team' + side].replace(table)
    for side in sides:
        # ``.get`` applies the default of 0 without inserting keys into a
        # defaultdict; ``replace`` would leave the raw team id in place.
        df['wWins' + side] = df['team' + side].map(
            lambda team: teams_weightedWins.get(team, 0))
    return df

In [None]:
# Attach the team statistics to both parts of the split. Despite its name,
# new_test holds the validation rows here.
new_train = replaceTeams(s_train, teams_weightedWins, teams_means, teams_wins, teams_medians)
new_test = replaceTeams(s_val, teams_weightedWins, teams_means, teams_wins, teams_medians)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Label, match outcome and identifier columns that must not be used as features.
non_features = ['target', 'day', 'score1', 'score2', 'team1', 'team2']
# One explicit column list for fit and predict keeps both calls on the same
# columns in the same order (and avoids the positional ``axis`` argument of
# DataFrame.drop, removed in pandas 2.0).
feature_columns = [col for col in new_train.columns if col not in non_features]

# ``loss`` is left at its default: that is the binomial deviance / log-loss
# in every scikit-learn release, whereas the spelling 'deviance' was removed
# in scikit-learn 1.3.
gbc = GradientBoostingClassifier(random_state=0, n_estimators=70, max_depth=7)
gbc.fit(new_train[feature_columns], new_train['target'])
prediction_gb = gbc.predict_proba(new_test[feature_columns])[:, 1]

In [None]:
# Validation log-loss of the gradient-boosting probabilities.
log_loss(y_true=s_val['target'], y_pred=prediction_gb)

In [None]:
# Blend weight of the boosting prediction; the linear model gets 1 - w.
w = 0.32

In [None]:
# Validation log-loss of the weighted blend of both models.
log_loss(y_true=s_val.target, y_pred=(1 - w) * prediction_lr + w * prediction_gb)

### Тест

In [None]:
# Boolean mask of the test rows in which neither side is team id 1; only
# these rows get a boosting prediction.
# NOTE(review): presumably team 1 has no statistics in the training data - confirm.
inds = (test.team1 != 1) & (test.team2 != 1)
test_gb = test.loc[inds]

In [None]:
# The final model is fitted on the full training table, so the team
# statistics are recomputed from it. The tables built in the validation
# section only cover the balanced pre-3018 subset, and replaceTeams' dict
# lookups leave teams without an entry unconverted.
# Separate names keep the validation tables intact.
full_means, full_wins, full_medians = statsCount(train)
full_weightedWins = myStat(train)

new_train = replaceTeams(train, full_weightedWins, full_means, full_wins, full_medians)
new_test = replaceTeams(test_gb, full_weightedWins, full_means, full_wins, full_medians)

In [None]:
# Label, match outcome and identifier columns that must not be used as features.
non_features = ['target', 'day', 'score1', 'score2', 'team1', 'team2']
# Take the feature list from the training frame and use it for both fit and
# predict: the test table need not contain the dropped columns, and any extra
# test-only column is ignored. This also avoids the positional ``axis``
# argument of DataFrame.drop, removed in pandas 2.0.
feature_columns = [col for col in new_train.columns if col not in non_features]

# ``loss`` is left at its default: that is the binomial deviance / log-loss
# in every scikit-learn release, whereas the spelling 'deviance' was removed
# in scikit-learn 1.3.
gbc = GradientBoostingClassifier(random_state=0, n_estimators=70, max_depth=7)
gbc.fit(new_train[feature_columns], new_train['target'])
prediction_gb = gbc.predict_proba(new_test[feature_columns])[:, 1]

In [None]:
# Read the logistic-regression submission back from disk and split it into
# the id column and the predicted probabilities.
subm_LR = pd.read_csv('subm_LR.csv')
id_col, prediction_lr = subm_LR['Id'], subm_LR['target']

In [None]:
# Start from the logistic-regression prediction for every test row ...
pred = pd.DataFrame(id_col)
pred['target'] = prediction_lr
# ... and replace it with the weighted blend on the rows that also have a
# boosting prediction. ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
pred.loc[inds, 'target'] = (1 - w) * prediction_lr[inds] + w * prediction_gb

In [None]:
# Store the blended submission without the DataFrame index.
pred.to_csv('subm_comb.csv', index=False)