# Import libraries

In [1]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model, cross_validation, grid_search
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer as DV
import xgboost as xgboost
import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful



# Load data using pandas

In [2]:
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# First view on data

# Teams

In [3]:
train.head(2)

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True
1,2998,28,61,29,301,259,True


In [4]:
test.head(2)

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161
1,1,3021,286,2


In [5]:
team1 = sorted(train.team1.unique())
print team1

[2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 2

In [6]:
print len(team1)

353


In [7]:
team2 = sorted(train.team2.unique())
print team2

[2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 2

In [8]:
print len(team2)

353


In [9]:
print len(set(team1) & set(team2))

353


Соответственно и в team1, и в team2 из train.csv одни и те же команды

In [10]:
print sorted(test.team2.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 230, 

В test.csv есть команды, которых нет в train.csv. Ну а количество команд подталкивает к тому, что это индивидуальный спорт

# Scores

In [11]:
score1 = sorted(train.score1.unique())
print score1

[81, 85, 89, 92, 96, 100, 104, 108, 112, 116, 119, 123, 127, 131, 135, 139, 143, 147, 150, 154, 158, 162, 166, 170, 174, 178, 181, 185, 189, 193, 197, 201, 205, 208, 212, 216, 220, 224, 228, 232, 236, 239, 243, 247, 251, 255, 259, 263, 267, 270, 274, 278, 282, 286, 290, 294, 297, 301, 305, 309, 313, 317, 321, 325, 328, 332, 336, 340, 344, 348, 352, 356, 359, 363, 367, 371, 375, 379, 383, 387, 390, 394, 398, 402, 406, 410, 414, 417, 421, 425, 429, 433, 437, 441, 445, 448, 452, 456, 460, 464, 468, 472, 476, 479, 483, 487, 491, 495, 499, 503, 506, 510, 514, 526, 530, 534, 541, 545, 549, 553, 565, 572, 592, 599, 603]


Видим, что разница между очками в среднем 4 или 5. Это наводит на мысль о том, что за каждый 'гол' начисляется 4 или 5 очков.

# Make data for cross_validation

In [12]:
print train.year.unique()

[2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012
 3013 3014 3015 3016 3017 3018 3019]


In [13]:
print test.year.unique()

[3021 3020]


In [14]:
test.head(2)

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161
1,1,3021,286,2


В train.csv нет данных про игры позднее 3019 года, поэтому будем делать максимально похожую кросс-валидацию

In [15]:
STOP_YEAR = 3015
X_train_first_part = train[train['year']<STOP_YEAR]
X_train_first_part.year.unique()

array([2998, 2999, 3000, 3001, 3002, 3003, 3004, 3005, 3006, 3007, 3008,
       3009, 3010, 3011, 3012, 3013, 3014])

In [17]:
STOP_INDEX = X_train_first_part.shape[0]
print STOP_INDEX

75278


In [18]:
X_train_second_part = X_train_first_part

Посмотрим, в каком отношении мы разбили train.csv на обучающую и тестовую

In [19]:
print X_train_first_part.shape[0]/float(train.shape[0])

0.740859569526


In [20]:
y_train_first_part = train[['target']][: STOP_INDEX]

In [21]:
y_train_second_part = pd.DataFrame(list(map (lambda x: bool(1-x), y_train_first_part.values.T[0])), columns=['target'])

In [52]:
y_train_second_part.index = list(i for i in range(STOP_INDEX-1, 150555))

In [53]:
y_train_first_part.head(-1)

Unnamed: 0,target
0,True
1,True
2,True
3,False
4,True
5,False
6,False
7,True
8,True
9,False


In [54]:
y_train_first_part.shape

(75278, 1)

In [55]:
y_train_second_part.head(5)

Unnamed: 0,target
75277,False
75278,False
75279,False
75280,True
75281,False


In [56]:
y_train_second_part.shape

(75278, 1)

Сделаем итоговую таблицу X_year

In [57]:
X_year_I = X_train_first_part[['year']]
X_year_II = X_train_second_part[['year']]

In [58]:
X_year = pd.concat([X_year_I, X_year_II], axis=0)

In [59]:
X_year.shape

(150556, 1)

In [60]:
X_year.head(2)

Unnamed: 0,year
0,2998
1,2998


# Hot_Encode

In [61]:
enc = OneHotEncoder(sparse=False)

train_team1_all = pd.DataFrame(enc.fit_transform(train[['team1']]))
train_team2_all = pd.DataFrame(enc.transform(train[['team2']]))
train_teams_all = pd.concat([train_team1_all, train_team2_all], axis=1)

test_team1 = pd.DataFrame(enc.transform(test[['team1']]))
test_team2 = pd.DataFrame(enc.transform(test[['team2']]))
test_teams = pd.concat([test_team1, test_team2], axis=1)

In [62]:
reverse_train_teams_all = pd.concat([train_team2_all, train_team1_all], axis=1)

In [63]:
train_teams_all.shape

(101609, 706)

In [64]:
reverse_train_teams_all.shape

(101609, 706)

In [65]:
train_teams = train_teams_all[ :STOP_INDEX ]
train_teams.shape

(75278, 706)

In [66]:
reverse_train_teams = reverse_train_teams_all[ :STOP_INDEX ]
reverse_train_teams.shape

(75278, 706)

In [67]:
teams_to_add = pd.concat([train_teams, reverse_train_teams], axis=0)
teams_to_add.shape

(150556, 706)

In [68]:
train_data = pd.concat([X_year, teams_to_add], axis=1)

In [69]:
train_data.shape

(150556, 707)

In [70]:
train_data.head(2)

Unnamed: 0,year,0,1,2,3,4,5,6,7,8,...,343,344,345,346,347,348,349,350,351,352
0,2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
y_train = pd.concat([y_train_first_part, y_train_second_part], axis=0)
y_train.shape

(150556, 1)

In [73]:
y_test = train[['target']][STOP_INDEX : ]
y_test.shape

(26331, 1)

In [74]:
test_data = pd.concat([train[['year']][ STOP_INDEX: ], train_teams_all[ STOP_INDEX: ]], axis=1)
test_data.shape

(26331, 707)

In [75]:
test_data.head(2)

Unnamed: 0,year,0,1,2,3,4,5,6,7,8,...,343,344,345,346,347,348,349,350,351,352
75278,3015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75279,3015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
train_data.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            75268, 75269, 75270, 75271, 75272, 75273, 75274, 75275, 75276,
            75277],
           dtype='int64', length=150556)

Итак, train_data – данные, на которых обучаемся, y_train – метки класса для данных, на которых обучаемся;

test_data, y_test – данные, на которых оцениваем качество алгоритмов

# Linear_Classifier

In [77]:
lin_cl = linear_model.LogisticRegression(C=0.07)

In [78]:
lin_cl.fit(train_data, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.07, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [79]:
pred = lin_cl.predict_proba(test_data)[:,1]
log_loss(y_test, pred)

0.63409634717694308

# xgboost

In [80]:
param = {}
param['max_depth'] = 3
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.03

numround = 800

In [81]:
Xdatatrain = xgboost.DMatrix(data = train_data, label = y_train.values)
Xdatatest = xgboost.DMatrix(data = test_data, label = y_test.values)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgboost.train(plst, Xdatatrain, numround, evals = watchlist)

[0]	train-logloss:0.693316	eval-logloss:0.693123
[1]	train-logloss:0.693209	eval-logloss:0.693052
[2]	train-logloss:0.693088	eval-logloss:0.692874
[3]	train-logloss:0.692980	eval-logloss:0.692786
[4]	train-logloss:0.692867	eval-logloss:0.692655
[5]	train-logloss:0.692752	eval-logloss:0.692572
[6]	train-logloss:0.692652	eval-logloss:0.692461
[7]	train-logloss:0.692549	eval-logloss:0.692440
[8]	train-logloss:0.692439	eval-logloss:0.692289
[9]	train-logloss:0.692352	eval-logloss:0.692244
[10]	train-logloss:0.692249	eval-logloss:0.692164
[11]	train-logloss:0.692155	eval-logloss:0.692102
[12]	train-logloss:0.692060	eval-logloss:0.692005
[13]	train-logloss:0.691976	eval-logloss:0.692009
[14]	train-logloss:0.691875	eval-logloss:0.691915
[15]	train-logloss:0.691800	eval-logloss:0.691920
[16]	train-logloss:0.691708	eval-logloss:0.691803
[17]	train-logloss:0.691629	eval-logloss:0.691784
[18]	train-logloss:0.691534	eval-logloss:0.691687
[19]	train-logloss:0.691448	eval-logloss:0.691649
[20]	train

In [116]:
for i in [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.5]:
    lin_cl = linear_model.LogisticRegression(C=i)
    lin_cl.fit(train_data, y_train)
    y_pred = lin_cl.predict_proba(test_data)[:,1]
    print log_loss(y_test, y_pred)

0.636095761835
0.632892735309
0.632679207481
0.634129919477
0.635280062398
0.636117084977
0.637824467799


In [97]:
lin_cl = linear_model.LogisticRegression(C=0.2)

In [101]:
lin_cl.fit(train_data, y_train)

LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Make X_test_submission

In [99]:
X_test_submission = pd.concat([test[['year']], test_teams], axis = 1)

In [82]:
y_test_alg_submission = lin_cl.predict_proba(X_test_submission)[:,1]
y_test_bst_submission = bst.predict(xgboost.DMatrix(X_test_submission))
y_test_submission = y_test_alg_submission*0.58 + y_test_bst_submission*(1-0.58)

NameError: name 'X_test_submission' is not defined

In [103]:
ss = sample_submission.copy()

for i in range(len(y_test_submission)):
    c = y_test_submission[i]
    if(c <= 0.02):
        y_test_submission[i] = 0.0
    if(c >= 0.98):
        y_test_submission[i] = 1.0
        
for c in y_test_submission:
    if(c < 0 or c>1):
        print c
        
ss.target = y_test_submission
ss.to_csv('Double-alg-xgboost-lin_part1.csv', index = False)