# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

Load the data

In [2]:
# Location of the cleaned data set produced by the previous notebook.
# NOTE(review): there is no path separator between 'interim' and the file name;
# the file loads as written, so the name is kept exactly as-is — confirm intent.
starcraft_loc = '../data/interimStarcraft_cleaned.csv'

# Columns used as model inputs: every column except GameID and LeagueIndex.
feat_names = [
    'Age', 'HoursPerWeek', 'TotalHours',
    'APM', 'SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys',
    'MinimapAttacks', 'MinimapRightClicks',
    'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC',
    'TotalMapExplored', 'WorkersMade', 'UniqueUnitsMade',
    'ComplexUnitsMade', 'ComplexAbilitiesUsed',
]

# index_col=0 treats the first CSV column (the unnecessary row-number column added
# when the previous notebook saved the data) as the index instead of as data.
starcraft = pd.read_csv(starcraft_loc, index_col=0)
starcraft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3337 entries, 0 to 3336
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   GameID                3337 non-null   int64  
 1   LeagueIndex           3337 non-null   int64  
 2   Age                   3337 non-null   int64  
 3   HoursPerWeek          3337 non-null   int64  
 4   TotalHours            3337 non-null   int64  
 5   APM                   3337 non-null   float64
 6   SelectByHotkeys       3337 non-null   float64
 7   AssignToHotkeys       3337 non-null   float64
 8   UniqueHotkeys         3337 non-null   int64  
 9   MinimapAttacks        3337 non-null   float64
 10  MinimapRightClicks    3337 non-null   float64
 11  NumberOfPACs          3337 non-null   float64
 12  GapBetweenPACs        3337 non-null   float64
 13  ActionLatency         3337 non-null   float64
 14  ActionsInPAC          3337 non-null   float64
 15  TotalMapExplored     

Create a LogHours column that is the natural log of the TotalHours column, which EDA has suggested is exponentially distributed.

In [3]:
# Natural log of TotalHours, stored as a new column on the loaded frame.
# NOTE(review): feat_names does not contain 'LogHours', so this column is not part
# of the feature set built from feat_names — confirm whether it was meant to
# replace 'TotalHours' there.
log_hours = np.log(starcraft['TotalHours'])
starcraft['LogHours'] = log_hours
starcraft.head()

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,...,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed,LogHours
0,52,5,27,10,3000,143.718,0.003515,0.00022,7,0.00011,...,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0.0,0.0,8.006368
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,...,0.004307,32.9194,42.3454,4.8434,22,0.001193,5,0.0,0.000208,8.517193
2,56,4,30,10,200,69.9612,0.001101,0.000336,4,0.000294,...,0.002926,44.6475,75.3548,4.043,22,0.000745,6,0.0,0.000189,5.298317
3,57,3,19,20,400,107.6016,0.001034,0.000213,1,5.3e-05,...,0.003783,29.2203,53.7352,4.9155,19,0.000426,7,0.0,0.000384,5.991465
4,58,3,32,10,500,122.8908,0.001136,0.000327,2,0.0,...,0.002368,22.6885,62.0813,9.374,15,0.001174,4,0.0,1.9e-05,6.214608


# Scale and split the data

In [4]:
#Scale the features, ignoring game ID as its not a meaningful feature and of course not scaling league index as its the target
# Robust-scale every column listed in feat_names. GameID (not a meaningful feature),
# LeagueIndex (the target) and LogHours are not in feat_names, so they are copied
# through unscaled.
starcraft2 = starcraft.reset_index(drop=True)
star_scale = starcraft2.copy(deep=True)

# RobustScaler centres and scales each column independently, so a single fit over
# the whole feature block yields the same values as one scaler per column, without
# building a throwaway DataFrame per feature.
# NOTE(review): the scaler is fit on all rows, i.e. before any train/test split.
scaler = RobustScaler()
scaled_feats = pd.DataFrame(
    scaler.fit_transform(star_scale[feat_names]),
    index=star_scale.index,   # explicit index so the assignment cannot misalign rows
    columns=feat_names,
)
star_scale[feat_names] = scaled_feats
star_scale.shape

(3337, 21)

Split the data into target and features, then use `train_test_split` to divide them into training and test sets.

In [5]:
# Target: LeagueIndex. Features: the feat_names columns, taken UNSCALED from the
# loaded frame. reset_index(drop=True) gives X the same 0..n-1 row labels that
# star_scale carries.
y = starcraft['LeagueIndex']
X = starcraft.reset_index(drop=True)[feat_names]

# Split before scaling. star_scale was produced by a RobustScaler fitted on every
# row, test rows included, so building the training features from it would leak
# test-set statistics into preprocessing. random_state=42 keeps the split reproducible.
Xtrain_raw, Xtest_raw, ytrain, ytest = train_test_split(X, y, random_state=42)

# Fit the scaler on the training rows only, then apply that one fitted transform
# to both partitions. Index and column labels are carried over explicitly because
# transform() returns a bare array.
scaler = RobustScaler().fit(Xtrain_raw)
Xtrain = pd.DataFrame(scaler.transform(Xtrain_raw), index=Xtrain_raw.index, columns=feat_names)
Xtest = pd.DataFrame(scaler.transform(Xtest_raw), index=Xtest_raw.index, columns=feat_names)

Save the new data

In [6]:
# Prefix that each output name is appended to.
# NOTE(review): the prefix has no trailing separator and the names carry no '.csv'
# extension, so files are written as e.g. '../data/interimStarcraft_Xtrain' —
# confirm this is what downstream notebooks expect before changing it.
data_path = '../data/interim'

# Output name -> object to write; written in this order with to_csv defaults
# (the index is included in each file).
splits_to_save = {
    'Starcraft_Xtrain': Xtrain,
    'Starcraft_ytrain': ytrain,
    'Starcraft_Xtest': Xtest,
    'Starcraft_ytest': ytest,
}
for file_stem, split in splits_to_save.items():
    split.to_csv(data_path + file_stem)