In [437]:
import numpy as np
import pandas as pd

## Data Preprocessing

In [455]:
df = pd.read_csv('../DataSets/training_data.csv')
test_df = pd.read_csv('../DataSets/testing_data.csv')
datasets = [df, test_df]

for frames in datasets:
    frames.drop( ['date', 'time', 'game', 'white', 'black', 'white_clock', 'black_clock', 'eco', 'moves'], axis=1, inplace=True )
    
mapping = {
    True: 1,
    False: 0
}

for frames in datasets:
    frames.whiteiscomp = df.whiteiscomp.map( mapping )
    frames.blackiscomp = df.blackiscomp.map ( mapping )

PredArray = df.commentaries.unique()
mapIndex = {}
mapPred = {}
for i in range(PredArray.shape[0]):
    mapPred[PredArray[i]] = i
    mapIndex[i] = PredArray[i]

df.commentaries = df.commentaries.map( mapPred )

      id                            game          white          black  \
0  24843       "Sillycon" vs "Danyboyfr"       Sillycon      Danyboyfr   
1  18981   "cagliari" vs "Knightsmasher"       cagliari  Knightsmasher   
2  24251    "sylwild" vs "Knightsmasher"        sylwild  Knightsmasher   
3  35737    "archonpaladin" vs "IFDThor"  archonpaladin        IFDThor   
4  34345           "sparse" vs "IFDThor"         sparse        IFDThor   

   white_elo  black_elo  white_rd  black_rd  whiteiscomp  blackiscomp  \
0       2391       1706      39.8     324.9         True        False   
1       1716       2611     341.5      61.1        False         True   
2       1736       2319      45.3      46.1        False         True   
3       1803       2260      38.9      16.3        False         True   
4       1884       2184      28.0      16.3        False         True   

  timecontrol  date      time white_clock black_clock  eco  plycount  \
0       900+0   NaN  12:35:00     15:00.0   

In [439]:
TimeControl = df.timecontrol
GameTime = []
ExtraTime = []
for i in TimeControl:
    temp = i.split('+')
    GameTime.append( int(temp[0]) )
    ExtraTime.append( int(temp[1]) )

df['gametime'] = GameTime
df['extratime'] = ExtraTime
df.drop( 'timecontrol', axis=1, inplace=True )
df.head(n = 10)

label = df.commentaries.values
df.drop( 'commentaries', axis = 1, inplace=True )
features = df.values[:,1:]

In [440]:
from sklearn.preprocessing import scale

for ix in range(4):
    features[:,ix] = scale( features[:,ix] )

features[:,6] = scale( features[:,6] )

print features.shape, label.shape

(52676, 9) (52676,)


In [441]:
split = int(0.6*data.shape[0])
print split

feature_train = features[:]
label_train = label[:]
feature_test = features[split:]
label_test = label[split:]

print feature_train.shape, label_train.shape, feature_test.shape, label_test.shape

1800
(52676, 9) (52676,) (50876, 9) (50876,)


In [442]:
print np.unique( label_test )

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]


## Training 

In [443]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit( feature_train, label_train )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [456]:
from keras.utils import np_utils

Y_train = np_utils.to_categorical( label_train )
Y_test = np_utils.to_categorical( label_test )

print Y_train.shape, Y_test.shape

model = Sequential()
model.add(Dense(32, input_shape=(9,)))
model.add(Activation('relu'))

model.add( Dense(15) )
model.add(Activation('softmax'))

model.summary()

(52676, 15) (50876, 15)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_29 (Dense)                 (None, 32)            320         dense_input_15[0][0]             
____________________________________________________________________________________________________
activation_29 (Activation)       (None, 32)            0           dense_29[0][0]                   
____________________________________________________________________________________________________
dense_30 (Dense)                 (None, 15)            495         activation_29[0][0]              
____________________________________________________________________________________________________
activation_30 (Activation)       (None, 15)            0           dense_30[0][0]                   
Total params: 815
Trainable params: 815
Non-trainable params: 0
___

In [457]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [458]:
model.fit(feature_train, Y_train, batch_size=16, nb_epoch=5, validation_data=(feature_test, Y_test))

Train on 52676 samples, validate on 50876 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x13c6eac10>

In [444]:
TimeControl = test_df.timecontrol
GameTime = []
ExtraTime = []
for i in TimeControl:
    temp = i.split('+')
    GameTime.append( int(temp[0]) )
    ExtraTime.append( int(temp[1]) )

test_df['gametime'] = GameTime
test_df['extratime'] = ExtraTime
test_df.drop( 'timecontrol', axis=1, inplace=True )

label_test_data = test_df.values[:,1:]

for ix in range(4):
    label_test_data[:,ix] = ( label_test_data[:,ix] )

label_test_data[:,6] = scale( label_test_data[:,6] )

print label_test_data.shape

(28365, 9)


In [445]:
prediction = []
for i in label_test_data:
    prediction.append( mapIndex[ rf.predict( X=[i] )[0] ])

In [446]:
out_df = pd.DataFrame(test_df['id'], columns=['id'])
out_df['commentaries'] = prediction

In [447]:
out_df.to_csv('out.csv', index= False)
out_df.head()

Unnamed: 0,id,commentaries
0,76456,Black resigns
1,17495,Black resigns
2,18471,Black resigns
3,72817,White resigns
4,51510,Black resigns
