In [1]:
import matplotlib.pyplot as plt 
import seaborn as sns
import scipy
import pandas as pd
import numpy as np
import yaml

from sklearn import preprocessing
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split

In [2]:
with open('config.yml', mode="r") as f:
    config = yaml.safe_load(f)

DATA_PATH = '../data/high_diamond_ranked_10min.csv'
SAVE_FOLDER = '../data/'

data = pd.read_csv(DATA_PATH)

In [3]:
# Drop unnecessary columns, explained in EDA
data.drop(columns={'redCSPerMin', 'blueCSPerMin'}, inplace=True)
data.drop(columns={'redGoldPerMin', 'blueGoldPerMin'}, inplace=True)
data.drop(columns={'redKills', 'redDeaths', 'redFirstBlood', 'redGoldDiff', 'redExperienceDiff'}, inplace=True)
data.drop(columns={'blueEliteMonsters', 'redEliteMonsters'}, inplace=True)

In [4]:
def number_of_corr_values(df, corr_threshold:int = 0.1) -> pd.DataFrame:
    # Function ranking correlation strenght between columns and blue team victories
        
    # Calculate correlation for all columns and sort by strength
    correlation = df.loc[:, df.columns != 'gameId'].corr()
    correlation_sorted = correlation.sort_values(['blueWins'], key=abs ,ascending=False)  
        
    new_index_order = ['gameId']   # always keep id 
        
    # Iterate through sorted corelation array as long as correlation is stronger than corr_threshold,
    # add those columns to array of columns to keep
    for cor in range(len(correlation_sorted['blueWins'])):
        if abs(correlation_sorted['blueWins'][cor]) < corr_threshold:
            break
        new_index_order.append(correlation_sorted['blueWins'].index[cor])
        
    # Create and return new dataframe
    new_set = df[new_index_order]
    return new_set

In [5]:
def feature_selection(df, cols:int = 16):
    # Function that returns columns recommended by filter methods
    # Uses mutual and f_regression to calculate important columns and rank them
    X_new = df.loc[:, df.columns != 'gameId']  # we are not evaluating id column
        
    # Create selectors with given type of calculation and number of output columns (cols)
    # selector_mutual = SelectKBest(mutual_info_regression, k=cols)
    selector_fregresion = SelectKBest(f_regression, k=cols)

    # Fit transforms for methods
    # result_mutual = selector_mutual.fit_transform(X_new, X_new['blueWins'])
    result_fregresion = selector_fregresion.fit_transform(X_new, X_new['blueWins'])

    # Select recommended column names from results 
    # chosen_names_mutual = np.append(selector_mutual.get_support(), True)
    chosen_names_fregresion = np.append(selector_fregresion.get_support(), True)
            
    # Create list of column names with unique values (mutual OR fregresion)
    # values = (list(colnames[chosen_names_mutual]))
    values_second = (list(df.columns[chosen_names_fregresion]))
    values_second.append('blueWins')
    # values += [item for item in values_second if item not in values]
        
    # Return dataframe containing reduced amount of columns (+id)
    return df[values_second]

In [6]:
if "selection_type" in config["preparation"].keys():
    # If there is no selection type, make no selection
    if config["preparation"]["selection_type"] == "correlation":
        if "correlation_threshold" in config["preparation"].keys():
            data = number_of_corr_values(data, data["preparation"]["correlation_threshold"])
        else:
            data = number_of_corr_values(data)
    
    elif config["preparation"]["selection_type"] == "feature_selection":
        if "features_amount" in config["preparation"].keys():
            data = feature_selection(data, cols=config["preparation"]["features_amount"])
        else:
            data = feature_selection(data)
            
    else:
        pass

After selecting features, we want to split our data into test, train and validation to process them further

In [7]:
if config["preparation"]["train_split"]+config["preparation"]["test_split"]+config["preparation"]["validation_split"] != 100:
    raise Exception("Split values together have to be 100!")
    
train_split = config["preparation"]["train_split"]
test_split = config["preparation"]["test_split"]
validation_split = config["preparation"]["validation_split"]

In [8]:
# split data to train and test by config ratio
train, test = train_test_split(data, test_size=(1-train_split/100))

# further split test part to ratio with validation 
test, validation = train_test_split(test, test_size=(1/(test_split+validation_split) * validation_split))

In [9]:
train.shape

(6915, 22)

In [10]:
test.shape

(1482, 22)

In [11]:
validation.shape

(1482, 22)

In this part we want to delete outliers, as those might negatively influenece machine learning algorithm. That is why we want to delete at least the first iteration of outliers.
There is ~5% values as outliers in the first iteration, which, we consider, is reasonable price to pay for cleaner andoutlier detection more useful data.\
We are considering values further than 3x standard deviations from the mean as outliers in our preprocessing.

In [12]:
# Function to detect and delete outliers
def delete_outliers(df : pd.DataFrame) -> pd.DataFrame:
    """
    Function deletes rows containing outlier value in any of the columns and returns adjusted dataframe
    Args
        df - dataframe containing columns to check for outliers
    Returns
        DataFrame without outlier values
    """
    for cols in df.columns:    
        # Check for each column in the dataframe    
        data_frame = df[cols]
        data_mean, data_std = np.mean(data_frame), np.std(data_frame)  # Outlier > mean+3*std OR outlier < mean-3*std

        # Outliers percentage definition
        cut_off = data_std * 3
        lower, upper = data_mean - cut_off, data_mean + cut_off 

        # Identify and remove outliers
        outliers = [False if x < lower or x > upper else True for x in data_frame] 
            
        # Information for the user about deleting rows based on given column
        if outliers.count(False) > 0:
            print(f'Identified outliers: {outliers.count(False)} in column: {cols}')
        df = df[outliers]

    return df

In [13]:
# Selected columns to outlier check
# Delete outliers for train only, as it is the only "formerly" know data part
check_outliers_columns = config["preparation"]["outliers_columns"]

for col in check_outliers_columns:
    try:
        train[col] = delete_outliers(train[[col]])
        train.dropna(inplace=True)
    except KeyError:
        pass

Identified outliers: 71 in column: blueWardsDestroyed
Identified outliers: 34 in column: blueKills
Identified outliers: 43 in column: blueDeaths
Identified outliers: 44 in column: blueAssists
Identified outliers: 28 in column: blueTotalGold
Identified outliers: 47 in column: blueAvgLevel
Identified outliers: 10 in column: blueTotalExperience
Identified outliers: 18 in column: blueTotalMinionsKilled
Identified outliers: 20 in column: blueTotalJungleMinionsKilled
Identified outliers: 12 in column: blueGoldDiff
Identified outliers: 74 in column: redWardsDestroyed
Identified outliers: 41 in column: redAssists
Identified outliers: 14 in column: redTotalGold
Identified outliers: 44 in column: redAvgLevel
Identified outliers: 10 in column: redTotalExperience
Identified outliers: 23 in column: redTotalJungleMinionsKilled


In [14]:
train.shape

(6382, 22)

In [15]:
def data_scale(train, test, validation):    
    """Scale train, test and validation dataset, based on the train MinMax. 
    It means in test and validation, numbers > 1 or < 0 are also valid, because 
    we don't know about those values before"""
    scaled_columns =  list(train.columns)
    scaled_columns.remove('gameId')  # We don't want to scale gameId
        
    # Scaling with the usage of MinMaxScaler - scale all values to <0,1> range
    scaler = preprocessing.MinMaxScaler()
    train_scaler = scaler.fit(train[scaled_columns])
    
    scaled_train = pd.DataFrame(scaler.transform(train[scaled_columns]), columns = scaled_columns)
    scaled_test = pd.DataFrame(scaler.transform(test[scaled_columns]), columns = scaled_columns)
    scaled_validation = pd.DataFrame(scaler.transform(validation[scaled_columns]), columns = scaled_columns)
        
    scaled_train['gameId'] = list(train['gameId'])   # Add id to new dataframe
    scaled_test['gameId'] = list(test['gameId'])   # Add id to new dataframe
    scaled_validation['gameId'] = list(validation['gameId'])   # Add id to new dataframe
    return scaled_train, scaled_test, scaled_validation

In [16]:
train, test, validation = data_scale(train, test, validation)

In [17]:
train.drop(columns='gameId', inplace=True)
test.drop(columns='gameId', inplace=True)
validation.drop(columns='gameId', inplace=True)

In [18]:
train.describe()

Unnamed: 0,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueHeralds,blueTowersDestroyed,blueTotalGold,blueAvgLevel,blueTotalExperience,...,blueTotalJungleMinionsKilled,blueGoldDiff,redWardsDestroyed,redAssists,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalJungleMinionsKilled,blueWins
count,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,...,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0,6382.0
mean,0.2986,0.504544,0.402559,0.43025,0.35648,0.191476,0.01452,0.457286,0.451622,0.493828,...,0.496112,0.493204,0.289826,0.360275,0.018725,0.478575,0.456499,0.498676,0.497267,0.50235
std,0.187224,0.500019,0.188402,0.19554,0.208536,0.393494,0.071779,0.173295,0.176999,0.167299,...,0.165015,0.1644,0.183977,0.208808,0.100938,0.172165,0.177082,0.166684,0.164996,0.500034
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.111111,0.0,0.266667,0.285714,0.222222,0.0,0.0,0.333536,0.375,0.381204,...,0.37931,0.381753,0.111111,0.222222,0.0,0.35407,0.375,0.387645,0.372881,0.0
50%,0.222222,1.0,0.4,0.428571,0.333333,0.0,0.0,0.44828,0.5,0.494902,...,0.482759,0.494165,0.222222,0.333333,0.0,0.467327,0.5,0.498873,0.508475,1.0
75%,0.444444,1.0,0.533333,0.571429,0.5,0.0,0.0,0.570803,0.625,0.607287,...,0.586207,0.603567,0.444444,0.5,0.0,0.593472,0.625,0.612618,0.59322,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
test.describe()

Unnamed: 0,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueHeralds,blueTowersDestroyed,blueTotalGold,blueAvgLevel,blueTotalExperience,...,blueTotalJungleMinionsKilled,blueGoldDiff,redWardsDestroyed,redAssists,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalJungleMinionsKilled,blueWins
count,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,...,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0
mean,0.321637,0.499325,0.418623,0.440717,0.381429,0.165317,0.01417,0.467279,0.450742,0.492752,...,0.488936,0.493209,0.294422,0.374644,0.02193,0.488877,0.461707,0.501443,0.498765,0.485155
std,0.252191,0.500168,0.199845,0.212835,0.229431,0.371592,0.072635,0.184273,0.195046,0.184054,...,0.175949,0.177677,0.219186,0.230863,0.108819,0.188728,0.179704,0.172719,0.163096,0.499948
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.033913,-0.625,-0.505173,...,-0.37931,-0.26512,0.0,0.0,0.0,0.024583,-0.25,-0.125658,0.067797,0.0
25%,0.222222,0.0,0.266667,0.285714,0.222222,0.0,0.0,0.331621,0.375,0.377043,...,0.37931,0.374284,0.111111,0.222222,0.0,0.350966,0.375,0.390538,0.372881,0.0
50%,0.333333,0.0,0.4,0.428571,0.333333,0.0,0.0,0.459706,0.5,0.497451,...,0.482759,0.487339,0.222222,0.333333,0.0,0.478114,0.5,0.499925,0.491525,0.0
75%,0.444444,1.0,0.533333,0.571429,0.5,0.0,0.0,0.581834,0.625,0.618608,...,0.586207,0.612577,0.444444,0.5,0.0,0.603506,0.625,0.613971,0.59322,1.0
max,2.111111,1.0,1.466667,1.357143,1.611111,1.0,1.0,1.287954,1.125,1.134953,...,1.086207,1.129184,2.666667,1.555556,1.0,1.267528,1.0,1.143544,1.135593,1.0


In [20]:
validation.describe()

Unnamed: 0,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueHeralds,blueTowersDestroyed,blueTotalGold,blueAvgLevel,blueTotalExperience,...,blueTotalJungleMinionsKilled,blueGoldDiff,redWardsDestroyed,redAssists,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalJungleMinionsKilled,blueWins
count,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,...,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0,1482.0
mean,0.322387,0.504723,0.41372,0.438307,0.370295,0.192308,0.018893,0.467781,0.445428,0.49144,...,0.487354,0.496431,0.302219,0.367296,0.019906,0.483891,0.451501,0.497082,0.503625,0.495951
std,0.268179,0.500146,0.202158,0.213831,0.225628,0.394247,0.079969,0.184591,0.182142,0.172309,...,0.164531,0.179281,0.254351,0.229312,0.101184,0.19093,0.188836,0.179854,0.16875,0.500152
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.082655,-0.25,-0.104064,...,-0.275862,-0.128009,0.0,0.0,0.0,0.034491,-0.25,-0.090035,-0.186441,0.0
25%,0.111111,0.0,0.266667,0.285714,0.222222,0.0,0.0,0.333323,0.375,0.381991,...,0.37931,0.384725,0.111111,0.222222,0.0,0.350746,0.375,0.379265,0.372881,0.0
50%,0.333333,1.0,0.4,0.428571,0.333333,0.0,0.0,0.453203,0.5,0.492803,...,0.482759,0.498605,0.222222,0.333333,0.0,0.462875,0.5,0.492485,0.508475,0.0
75%,0.444444,1.0,0.533333,0.571429,0.5,0.0,0.0,0.589674,0.625,0.607512,...,0.586207,0.60876,0.444444,0.5,0.0,0.601217,0.625,0.621186,0.627119,1.0
max,3.0,1.0,1.2,1.214286,1.388889,1.0,0.666667,1.294883,1.0,1.045134,...,1.068966,1.151938,2.555556,1.444444,1.0,1.189515,1.25,1.141891,1.186441,1.0


In [21]:
train.to_csv(f"{SAVE_FOLDER}/train.csv", index=False)
test.to_csv(f"{SAVE_FOLDER}/test.csv", index=False)
validation.to_csv(f"{SAVE_FOLDER}/validation.csv", index=False)