In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
import time
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


In [2]:
data_train = pd.read_csv('features.csv', index_col='match_id')
X_test = pd.read_csv('features_test.csv', index_col='match_id')

y = data_train['radiant_win']
X_train = data_train.drop(['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'], axis=1)
X_train = X_train.fillna(-1)
X_test = X_test.fillna(-1)

In [3]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

kf = KFold(n_splits=5, shuffle=True)
kf.get_n_splits(X_train_scaled)

c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid ={'C': c_values, 'penalty': ['l2']}
#посик параметра С
grid = GridSearchCV(LogisticRegression(), param_grid, cv=kf, scoring='roc_auc')
grid.fit(X_train_scaled, y)
grid.best_params_

{'C': 0.01, 'penalty': 'l2'}

In [4]:
start_time = datetime.datetime.now()
lr = LogisticRegression(penalty='l2', C=0.00001)
lr.fit(X_train_scaled, y)
time.sleep(3)

cvs = cross_val_score(lr, X_train_scaled, y, scoring='roc_auc', cv=5)
print('Time elapsed:', datetime.datetime.now() - start_time, 'CVS:', cvs.mean())

Time elapsed: 0:00:05.544018 CVS: 0.6964629693180401


In [5]:
X_train2 = X_train.drop(['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero', 'lobby_type'], axis=1)
X_train_scaled2 = scaler.fit_transform(X_train2)
grid.fit(X_train_scaled2, y)
grid.best_params_

{'C': 0.01, 'penalty': 'l2'}

In [6]:
start_time = datetime.datetime.now()
lr2 = LogisticRegression(penalty='l2', C=0.01)
lr2.fit(X_train_scaled2, y)
time.sleep(3)

cvs = cross_val_score(lr2, X_train_scaled2, y, scoring='roc_auc', cv=5)
print('Time elapsed:', datetime.datetime.now() - start_time, 'CVS:', cvs.mean())

Time elapsed: 0:00:10.423168 CVS: 0.7152974588862767


In [7]:
#число героев
len(data_train.iloc[:, 2].unique())

108

In [8]:
hero_c = [c for c in X_train.columns if 'hero' in c]
all_heroes_id = np.unique(X_train[hero_c])
wb = {}
for id in all_heroes_id:
    # Мы используем + 0 для автоматического приведения bool-->int.
    r = [(X_train['r%d_hero' % n] == id) + 0 for n in range(1, 6)]
    d = [(X_train['d%d_hero' % n] == id) + 0 for n in range(1, 6)]
    wb['hero%s' % id] = sum(r) - sum(d)
X_pick = X_train.assign(**wb)


In [9]:
kf.get_n_splits(X_pick)
X_pick_scaled = scaler.fit_transform(X_pick)

c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid ={'C': c_values, 'penalty': ['l2']}

grid = GridSearchCV(LogisticRegression(), param_grid, cv=kf, scoring='roc_auc')
grid.fit(X_pick_scaled, y)
grid.best_params_

{'C': 0.01, 'penalty': 'l2'}

In [10]:
start_time = datetime.datetime.now()
lr3 = LogisticRegression(penalty='l2', C=0.01)
lr3.fit(X_pick_scaled, y)
time.sleep(3)

cvs = cross_val_score(lr3, X_pick_scaled, y, scoring='roc_auc', cv=5)
print('Time elapsed:', datetime.datetime.now() - start_time, 'CVS:', cvs.mean())

Time elapsed: 0:00:15.652704 CVS: 0.7495848514779525


In [28]:
hero_c2 = [c for c in X_test.columns if 'hero' in c]
all_heroes_id = np.unique(X_test[hero_c2])
wb1 = {}
for id in all_heroes_id:
    # Мы используем + 0 для автоматического приведения bool-->int.
    r = [(X_test['r%d_hero' % n] == id) + 0 for n in range(1, 6)]
    d = [(X_test['d%d_hero' % n] == id) + 0 for n in range(1, 6)]
    wb['hero%s' % id] = sum(r) - sum(d)
X_pick2 = X_test.assign(**wb1)

In [30]:
X_pick_scaled2 = scaler.transform(X_pick2)

In [31]:
proba = lr.predict_proba(X_pick_scaled2)
print(proba)

dire_proba = proba[:, 0]
rad_proba = proba[:, 1]

print(f"Dire: min = {np.min(dire_proba)}, max = {np.max(dire_proba)}, Unique values = {np.unique(dire_proba).size}")
print(f"Radiant: min = {np.min(dire_proba)}, max = {np.max(dire_proba)}, Unique values = {np.unique(dire_proba).size}")

[[0.50765566 0.49234434]
 [0.44558767 0.55441233]
 [0.5418276  0.4581724 ]
 ...
 [0.59701059 0.40298941]
 [0.54242705 0.45757295]
 [0.44860603 0.55139397]]
Dire: min = 0.19783233192578142, max = 0.8273952326875847, Unique values = 17177
Radiant: min = 0.19783233192578142, max = 0.8273952326875847, Unique values = 17177
