# Goals
* Predict Results of Pro matches with 70% accuracy

### Data that will be used
* match data and maps

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np


# Load the scraped CSV that the following cells reshape into one row per game.
# The path is relative to the directory the notebook is run from.
raw_csv_path = 'scrapy/vlrgg/playerData/vlr.csv'
data = pd.read_csv(raw_csv_path)

Refine the data into one row per game so that the players can be one-hot encoded.

In [None]:
# Reshape the per-player rows in `data` into one row per game (match + map) so
# that rosters, agents and the map can be one-hot encoded by the model cell.
# This step can be skipped when refined.csv already exists.
PLAYERS_PER_TEAM = 5


def _first_value(game_rows, player_name, column):
    """Return `column` from the first row of `game_rows` belonging to `player_name`.

    Raises IndexError when no row matches, which happens for a missing (NaN)
    player name because NaN never compares equal to itself.
    """
    return game_rows.loc[game_rows['playerName'] == player_name, column].values[0]


def _build_game_record(game_rows, map_name):
    """Flatten the player rows of one game into a single record.

    Parameters
    ----------
    game_rows : pandas.DataFrame
        Rows of `data` that share one matchID and one map.
    map_name : object
        Value of the 'map' column for these rows.

    Returns
    -------
    dict or None
        Record with both rosters, both teams, every player's agent, the map,
        and the result/date taken from the first listed player's row.
        None (after printing the reason) when the game cannot be used.
    """
    if len(game_rows.index) != 2 * PLAYERS_PER_TEAM:
        # disregard incomplete games or games with fewer than 10 players
        print('Error: skipping incomplete match data')
        return None

    teams = game_rows['playerTeam'].unique()
    if len(teams) < 2:
        print("Error: empty teams")
        return None

    # Only the first two teams (in order of appearance) are considered.
    rosters = [
        game_rows.loc[game_rows['playerTeam'] == team, 'playerName'].unique()
        for team in teams[:2]
    ]
    if any(len(roster) != PLAYERS_PER_TEAM for roster in rosters):
        # Duplicate names, a third team, or a missing team label.
        print("Error: skipping weird data")
        return None
    team_players, opponent_players = rosters

    record = {}
    for slot, name in enumerate(team_players, start=1):
        record['player{}'.format(slot)] = name
    record['team'] = teams[0]
    record['o_team'] = teams[1]
    for slot, name in enumerate(opponent_players, start=1):
        record['o_player{}'.format(slot)] = name

    try:
        for slot, name in enumerate(team_players, start=1):
            record['player{}_agent'.format(slot)] = _first_value(game_rows, name, 'playerAgent')
        for slot, name in enumerate(opponent_players, start=1):
            record['o_player{}_agent'.format(slot)] = _first_value(game_rows, name, 'playerAgent')
        record['map'] = map_name
        # Result and date are read from the first listed player's row.
        record['result'] = _first_value(game_rows, team_players[0], 'result')
        record['date'] = _first_value(game_rows, team_players[0], 'date')
    except IndexError:
        # Only lookup misses are treated as bad data; a missing column (KeyError)
        # is a schema problem and is deliberately allowed to surface.
        print("Error: skipping weird data")
        return None
    return record


# Collect plain dicts and build the frame once: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic anyway.
# sort=False keeps matches, and maps within a match, in order of first appearance.
# Rows whose matchID or map is missing are left out by groupby.
records = []
for _, match_rows in data.groupby('matchID', sort=False):
    for map_name, game_rows in match_rows.groupby('map', sort=False):
        record = _build_game_record(game_rows, map_name)
        if record is not None:
            records.append(record)

refined_data = pd.DataFrame(records)
# The index is written on purpose: the model cell expects (and removes) the
# resulting 'Unnamed: 0' column when it reads refined.csv back.
refined_data.to_csv('refined.csv')



### Creating the model
Using logistic regression to predict wins (for the first iteration)

Inputs for this model will include:
* 5 Players on team
* 5 Players on opponent team
* Team and opponent team names
* Agents each player is on
* Map

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import sklearn

csv_data = pd.read_csv('refined.csv')

# Every column except the label, the date and the index column written by
# to_csv ('Unnamed: 0') is used as a categorical input feature. Filtering
# (rather than list.remove) does not fail if one of these columns is absent.
non_feature_columns = {'date', 'result', 'Unnamed: 0'}
used_columns = [column for column in csv_data.columns if column not in non_feature_columns]

# One-hot encode the features. dtype is pinned because the get_dummies default
# changed from uint8 to bool in pandas 2.0.
X_data = pd.get_dummies(csv_data[used_columns], dtype='uint8').to_numpy()
# Binary target: 1 where the row's result is 'Win', 0 otherwise.
y_data = (csv_data['result'] == 'Win').astype('uint8').to_numpy()


X_train, X_test, y_train, y_test = train_test_split(X_data,y_data,test_size=0.25,random_state=69)

# "No regularisation" is spelled None from scikit-learn 1.2 on; the string
# "none" was removed in 1.4, while releases before 1.2 accept only the string.
sklearn_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
no_penalty = None if sklearn_version >= (1, 2) else "none"

clf = LogisticRegression(penalty=no_penalty)
clf.fit(X_train, y_train)



LogisticRegression(penalty='none')

In [54]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Score the held-out split: print the confusion matrix first, then the accuracy.
y_pred = clf.predict(X_test)
for metric in (confusion_matrix, accuracy_score):
    print(metric(y_true=y_test, y_pred=y_pred))

[[1519  381]
 [ 505 1051]]
0.7436342592592593


Baseline Predictions for the upcoming VCTs

In [None]:
# TODO: find the upcoming games and make predictions here
# TODO: figure out how to vectorize all the player names

The initial model gives a good baseline of 70-75% accuracy, but it uses purely categorical data. More data could be considered, such as:
* Player's average KDA over the past 5 games on that agent (if the information is available)
* Twitter sentiment before the game starts (graph this correlation first)

This would introduce more numerical data, which can be graphed, making correlations easier to see.

array([1, 1, 1, ..., 0, 1, 0], dtype=uint8)