### About

This is the notebook for predicting wining side by player performance, namely KDA and minion killed.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go



In [2]:
df = pd.read_csv('cleanData.csv')
df.shape

(1733658, 30)

In [3]:
cols = df.columns.tolist()
print(cols)

#col concerned:
#kills, death, assists, totminionskilled, duration

['Unnamed: 0', 'id', 'matchid', 'player', 'championid', 'team_role', 'ss1', 'ss2', 'item1', 'item2', 'item3', 'item4', 'item5', 'item6', 'win', 'kills', 'deaths', 'assists', 'turretkills', 'inhibkills', 'totminionskilled', 'pinksbought', 'wardsplaced', 'duration', 'baronkills', 'harrykills', 'dragonkills', 'firsttower', 'firstinhib', 'firstblood']


In [4]:
teamRoleList = df['team_role'].unique().tolist()
print(teamRoleList)

['1 - JUNGLE', '1 - DUO_SUPPORT', '1 - DUO_CARRY', '1 - TOP', '1 - MID', '2 - JUNGLE', '2 - TOP', '2 - MID', '2 - DUO_CARRY', '2 - DUO_SUPPORT', '1 - BOT', '2 - BOT']


In [5]:
def assignRoleWithNum(x):
    return {
        '1 - MID': 100,
        '2 - MID': 100,
        '1 - JUNGLE': 200,
        '2 - JUNGLE': 200,
        '1 - TOP': 300,
        '2 - TOP': 300,
        '1 - DUO_CARRY': 400,
        '2 - DUO_CARRY': 400,
        '1 - DUO_SUPPORT': 400,
        '2 - DUO_SUPPORT': 400,
        '1 - BOT': 400,
        '2 - BOT': 400
    }[x] 


The function above is to assign roles with values so that we would encode the features into regression.

Value of 1, 10, 100, 1000 is tried.
Turns out that 100 is the best choice (but only 1% accuracy increase, so not much). My educational guess behind this is that kill/death/assists is in range of 0-10. Duration is usually a 4 digits value. Having 100, distinguish this team role feature from others.

In [6]:
df['role'] = df['team_role'].apply(assignRoleWithNum)
dataset = df[['win','kills','deaths','assists','totminionskilled','duration']]
dataset = dataset.dropna() #drop all nan rows
dataset = dataset.take(np.random.permutation(len(dataset)))#randomize rows
dataset.head(10)

Unnamed: 0,win,kills,deaths,assists,totminionskilled,duration
200776,1.0,7.0,0.0,2.0,131.0,1091
1269266,0.0,11.0,11.0,6.0,211.0,2113
1711300,1.0,0.0,0.0,0.0,0.0,197
1631117,0.0,1.0,10.0,4.0,178.0,1667
1142502,0.0,0.0,0.0,0.0,0.0,195
890377,1.0,7.0,7.0,8.0,166.0,1846
891277,1.0,8.0,4.0,11.0,293.0,2000
738322,1.0,9.0,6.0,6.0,171.0,2179
1310871,0.0,4.0,8.0,2.0,119.0,1445
782005,1.0,0.0,4.0,8.0,8.0,1933


In [7]:
train, test = train_test_split(dataset, test_size = 0.1)
print('train:', train.shape, 'test:', test.shape)

train: (1560289, 6) test: (173366, 6)


In [8]:
def get_data_feed(dataset):
    team_data = dataset.iloc[:,1:] #exclude first column which is win
    winners = dataset['win']
    return team_data, winners 

trainX, trainY = get_data_feed(train)
testX, testY = get_data_feed(test)

In [9]:
LRTrainAccuracy = np.mean(cross_val_score(LogisticRegression(), trainX, trainY, scoring='accuracy', cv=2))
MNTrainAccuracy = np.mean(cross_val_score(MultinomialNB(), trainX, trainY, scoring='accuracy', cv=2))
LRTestAccuracy = np.mean(cross_val_score(LogisticRegression(), testX, testY, scoring='accuracy', cv=2))
MNTestAccuracy = np.mean(cross_val_score(MultinomialNB(), testX, testY, scoring='accuracy', cv=2))

print('Logistic Regression Train accuracy:', LRTrainAccuracy)
print('MultinominalNB Train accuracy:', MNTrainAccuracy)

Logistic Regression Train accuracy: 0.789665247171
MultinominalNB Train accuracy: 0.663800742968


In [10]:
init_notebook_mode(connected=True)
trace1 = go.Bar(
    x=['Logistic Regression', 'MultinomialNB'],
    y=[LRTrainAccuracy, MNTrainAccuracy],
    name='Train'
)
trace2 = go.Bar(
    x=['Logistic Regression', 'MultinomialNB'],
    y=[LRTestAccuracy, MNTestAccuracy],
    name='Test'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Accuracy Comparsions',
    barmode='group',
    yaxis=dict(
        title='Accuracy',
        range=[0.5, 0.8]
    ),
    xaxis=dict(
        title='Method'
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

We can see logistic regression has done a great job here with much higher consistency.

### Let us try to put team role into use to see if we can know relationship between KDA, roles and win/lose.

In [11]:
newDataset = df[['win','kills','deaths','assists','totminionskilled','duration', 'role']]
newDataset = newDataset.dropna() #drop all nan rows
newDataset = newDataset.take(np.random.permutation(len(newDataset))) #randomize rows
newDataset.head(10)

Unnamed: 0,win,kills,deaths,assists,totminionskilled,duration,role
1443228,0.0,0.0,3.0,4.0,14.0,1728,400
438862,0.0,14.0,7.0,9.0,183.0,1844,400
1048628,1.0,8.0,2.0,7.0,29.0,1811,200
436459,0.0,3.0,11.0,7.0,168.0,2241,400
1028648,0.0,1.0,5.0,4.0,154.0,1528,400
530892,1.0,13.0,0.0,14.0,328.0,2555,100
307202,0.0,3.0,8.0,2.0,134.0,1458,400
173406,0.0,5.0,2.0,7.0,205.0,1669,400
1444774,0.0,5.0,5.0,2.0,81.0,1481,200
1653931,0.0,4.0,5.0,8.0,56.0,1664,400


In [12]:
train, test = train_test_split(newDataset, test_size = 0.1)

In [13]:
NewTrainX, NewTrainY = get_data_feed(train)
NewTestX, NewTestY = get_data_feed(test)

LRTrainAccuracy = np.mean(cross_val_score(LogisticRegression(), NewTrainX, NewTrainY, scoring='accuracy', cv=2))
MNTrainAccuracy = np.mean(cross_val_score(MultinomialNB(), NewTrainX, NewTrainY, scoring='accuracy', cv=2))
LRTestAccuracy = np.mean(cross_val_score(LogisticRegression(), NewTestX, NewTestY, scoring='accuracy', cv=2))
MNTestAccuracy = np.mean(cross_val_score(MultinomialNB(), NewTestX, NewTestY, scoring='accuracy', cv=2))

In [14]:
trace1 = go.Bar(
    x=['Logistic Regression', 'MultinomialNB'],
    y=[LRTrainAccuracy, MNTrainAccuracy],
    name='Train'
)
trace2 = go.Bar(
    x=['Logistic Regression', 'MultinomialNB'],
    y=[LRTestAccuracy, MNTestAccuracy],
    name='Test'
)

data = [trace1, trace2]
layout = go.Layout(
    title='Accuracy Comparsions',
    barmode='group',
    yaxis=dict(
        title='Accuracy',
        range=[0.5, 0.8]
    ),
    xaxis=dict(
        title='Method'
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

### Conclusion

As we can see, considering team role of players does not give us a increase in accuracy.
And using logistic regression, we have about 79% accuracy which is quite good.