In [1]:
import os
import sys
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
def init_spark():
    return SparkSession \
        .builder \
        .master("local[4]") \
        .appName("Chess Predict") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

spark = init_spark()

In [3]:
seed = 42

## Load Dataset

In [4]:
df = spark.read.csv('data/games.csv', header=True, inferSchema=True)

## Data Inspection

In [5]:
import pyspark.pandas as ps
import pandas as pd



In [6]:
df.toPandas().head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


### Inspect Datatypes

In [7]:
df.dtypes

[('id', 'string'),
 ('rated', 'boolean'),
 ('created_at', 'double'),
 ('last_move_at', 'double'),
 ('turns', 'int'),
 ('victory_status', 'string'),
 ('winner', 'string'),
 ('increment_code', 'string'),
 ('white_id', 'string'),
 ('white_rating', 'int'),
 ('black_id', 'string'),
 ('black_rating', 'int'),
 ('moves', 'string'),
 ('opening_eco', 'string'),
 ('opening_name', 'string'),
 ('opening_ply', 'int')]

In [8]:
categorical = [feature for feature, dtype in df.dtypes if dtype in {'string', 'boolean'}]
numerical = [feature for feature, dtype in df.dtypes if dtype in {'double', 'int'}]

#### Numerical Features

In [9]:
for col in numerical:
    unique_values = df.select(col).distinct()
    n_unique = unique_values.count()
    if n_unique < 50:
        print(f'{col:20s}:{[row[col] for row in unique_values.collect()]}')
    else:
        print(f'{col:20s}:{n_unique} unique values')

created_at          :13151 unique values
last_move_at        :13186 unique values
turns               :211 unique values
white_rating        :1516 unique values
black_rating        :1521 unique values
opening_ply         :[28, 12, 1, 13, 6, 16, 3, 20, 5, 19, 15, 9, 17, 4, 8, 7, 10, 11, 14, 2, 18, 22, 24]


- `created_at`: Timestamp in UTC
- `last_move_at`: Timestamp in UTC
- `turns`: Number of turns in the match
- `white_rating`: white player rating
- `black_rating`: black player rating
- `opening_ply`: Number of plies used to set up opening

#### Categorical Features

In [10]:
for col in categorical:
    unique_values = df.select(col).distinct()
    n_unique = unique_values.count()
    if n_unique < 50:
        print(f'{col:20s}:{[row[col] for row in unique_values.collect()]}')
    else:
        print(f'{col:20s}:{n_unique} unique values')

id                  :19113 unique values
rated               :[True, False]
victory_status      :['resign', 'outoftime', 'mate', 'draw']
winner              :['white', 'black', 'draw']
increment_code      :400 unique values
white_id            :9438 unique values
black_id            :9331 unique values
moves               :18920 unique values
opening_eco         :365 unique values
opening_name        :1477 unique values


- `id`: Game ID, uniquely identifies a match record
- `rated`: If rated, the game result affects player ratings
- `victory_status`: How the game ended
- `winner`: Match winner
- `increment_code`: Game time setting
- `white_id`: white player id
- `black_id`: black player id
- `moves`: Sequence of moves recorded during the match
- `opening_eco`: ECO classification code for the chess openings moves
- `opening_name`: Name of opening moves

### Look for Anomalies

In [11]:
print( f'unique samples / total samples: {df.distinct().count()} / {df.count()} ' )

unique samples / total samples: 19629 / 20058 


In [12]:
print('Count Null Values in each Column')
df.select([F.count(F.when(F.isnull(col), col)).alias(col) for col in df.columns]).toPandas()

Count Null Values in each Column


Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


- There are duplicate rows, which should be dropped from the dataset.
- Luckily, this dataset does not contain missing values.

## Data Preparation (Preprocessing)
Scikit-Learn offers a range of useful methods for preprocessing and data splits. With the approval from the course instructor, we will transform the datasets into *Pandas DataFrames* in this part.

In [13]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from pyspark.ml.feature import StringIndexer

### Fix Anomalies

In [14]:
# Drop Duplicates
df = df.distinct()

### Label `winner`


In [15]:
label_indexer = StringIndexer(inputCol='winner', outputCol='winner_label')
label_indexer_model = label_indexer.fit(df)
df = label_indexer_model.transform(df)
df.select('winner', 'winner_label').distinct().show()
df = df.drop('winner').withColumnRenamed('winner_label', 'winner')

+------+------------+
|winner|winner_label|
+------+------------+
|  draw|         2.0|
| white|         0.0|
| black|         1.0|
+------+------------+



### Feature `increment_code`
As per inspection, the feature contains a string with two numbers separated by `+`. After research, we found that the first number refers to initial total clock time per player in *minutes*; the second number refers to the number of *seconds* added to the total clock time after the player makes a move. We decided to extract these two numbers as two separate features `clock` and `increment` replacing `increment_code`.

In [16]:
df.select('increment_code').head(3)

[Row(increment_code='30+25'),
 Row(increment_code='10+0'),
 Row(increment_code='10+3')]

In [17]:
splits = F.split(df.increment_code, '[+]')
df = df.withColumn('clock', splits.getItem(0).cast('Integer')).withColumn('increment', splits.getItem(1).cast('Integer'))
df.select('clock', 'increment').tail(3)

[Row(clock=5, increment=60),
 Row(clock=8, increment=0),
 Row(clock=3, increment=10)]

### Feature `opening_eco`
As per inspection, each row of this feature is a concatenation of a letter that denotes an [opening moves category](https://www.365chess.com/eco.php). Although there are variations within each category, we assume that the opening moves in each category to be similar enough that we can ignore the differences within each opening move category. Therefore, we extract the first letter from `opening_eco` to a new feature `open_cat` and ignore `opening_eco` during training.

In [18]:
df.select('opening_eco').head(3)

[Row(opening_eco='B11'), Row(opening_eco='B45'), Row(opening_eco='C60')]

In [19]:
df = df.withColumn('open_category', df.opening_eco.substr(0, 1))
df.select('open_category').distinct().show()

+-------------+
|open_category|
+-------------+
|            E|
|            B|
|            D|
|            C|
|            A|
+-------------+



In [20]:
open_cat_indexer = StringIndexer(inputCol='open_category', outputCol='open_cat')
open_cat_model = open_cat_indexer.fit(df)
df = open_cat_model.transform(df)
df.select('open_category', 'open_cat').distinct().show()

+-------------+--------+
|open_category|open_cat|
+-------------+--------+
|            D|     3.0|
|            B|     1.0|
|            E|     4.0|
|            C|     0.0|
|            A|     2.0|
+-------------+--------+



### Feature `rated`
Contains boolean values only and does not require preprocessing. Rated games affect player ratings and may affect performance of the player. We may choose to separate rated games from unrated games for this fact, here we did not.

### Split into Train/Test Sets
1. Encode player ids into numerical values.
  - Ensure that each player is assigned one and only one numerical id.
2. Use encoded player ids to split dataset based on groups.
  - Each group contains the matches played by one player. Make sure that each group are sampled evenly in the training set.

#### Scikit-Learn Split methods cannot handle distributed datasets.

In [21]:
df_pd = df.toPandas()

#### Encode categorical player `white_id`s `black_id`s to split based on individual players

In [22]:
player_ids = pd.DataFrame(df_pd[['white_id', 'black_id']])
player_ids = player_ids.stack().pipe(lambda s: pd.Series(pd.factorize(s.values)[0], s.index)).unstack()

df_pd['white_id_num'] = player_ids['white_id']
df_pd['black_id_num'] = player_ids['black_id']

df_pd[['white_id', 'white_id_num', 'black_id', 'black_id_num']]

Unnamed: 0,white_id,white_id_num,black_id,black_id_num
0,konst767,0,ducksandcats,1
1,everybodylovesjesus,2,ahmd11,3
2,jeff1983,4,bekzodjon,5
3,mellowg7,6,jesteroz,7
4,omnivoid,8,hasan_al-banna,9
...,...,...,...,...
19624,demontechristo66,9815,eie24,3698
19625,r-mohamadi55,15633,eie24,3698
19626,kastorcito,5322,ed84,4132
19627,ssf7,3469,casr,15634


#### Data Split

In [23]:
from sklearn.model_selection import GroupShuffleSplit

In [24]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
train_val_idx, test_idx = next(gss.split(df_pd, df_pd['winner'], groups=df_pd['white_id_num']))

df_train_val = df_pd.iloc[train_val_idx]

df_test = df_pd.iloc[test_idx]

In [25]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
train_idx, val_idx = next(gss.split(df_train_val, df_train_val['winner'], groups=df_train_val['white_id_num']))

df_train = df_train_val.iloc[train_idx]

df_val = df_train_val.iloc[val_idx]

In [26]:
# Proof that none of the IDs from the training set are present in the test set.
df_train_values = set(df_train['white_id'].values)
df_test_values = set(df_val['white_id'].values)

assert len(df_train_values.intersection(df_test_values)) == 0, 'A group is present in both training and test sets'

### Feature `victory_status`
From inspection, we observe 4 unique values for this categorical feature: `'resign', 'outoftime', 'mate', 'draw'`. We perform one-hot encoding before feeding this into the model.

In [27]:
enc = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')
enc.fit(df_train['victory_status'].to_numpy().reshape(-1, 1))
encoded_feature_names = 'status_' + enc.categories_[0]
encoded_feature_names

array(['status_draw', 'status_mate', 'status_outoftime', 'status_resign'],
      dtype=object)

In [28]:
one_hot_status = enc.transform(df_train['victory_status'].to_numpy().reshape(-1, 1))
df_train.loc[:,encoded_feature_names] = one_hot_status

one_hot_status = enc.transform(df_val['victory_status'].to_numpy().reshape(-1, 1))
df_val.loc[:,encoded_feature_names] = one_hot_status

one_hot_status = enc.transform(df_test['victory_status'].to_numpy().reshape(-1, 1))
df_test.loc[:,encoded_feature_names] = one_hot_status
df_train[encoded_feature_names].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.loc[:,encoded_feature_names] = one_hot_status
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val.loc[:,encoded_feature_names] = one_hot_status
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.loc[:,encoded_feature_names] = one_hot_status


Unnamed: 0,status_draw,status_mate,status_outoftime,status_resign
1,0,0,0,1
3,0,1,0,0
4,0,0,0,1
6,0,0,0,1
8,0,1,0,0


### Create Spark DF from Pandas DF Train/Test Set

In [29]:
df_train = spark.createDataFrame(df_train)

df_val = spark.createDataFrame(df_val)

df_test = spark.createDataFrame(df_test)

### Feature `moves`
As per inspection, this feature records the list/sequence of moves during the match. Because that the sequence, if parsed correctly, actually indicates the winning player, we need to either drop this feature or obfuscate this information. We find that Spark supports Word2Vec feature transformation, which will turn the sequence into a vector. Such vector representation is independent of the order of the moves, and this can obfuscate part of the information lied within this feature.

In [30]:
from pyspark.ml.feature import Word2Vec

In [31]:
df_train.select('moves').head().moves

'e4 c5 Nf3 e6 d4 cxd4 Nxd4 Nf6 Nc3 Nc6 Bg5 Be7 f4 Qa5 Nxc6 bxc6 Bd3 d5 e5 Nd7 Bxe7 Kxe7 a3 c5 O-O c4 Be2 Qb6+ Kh1 Qxb2 Qd2 Qb6 Rfb1 Qc6 Bf3 Nb6 Nb5 Bd7 Nd6 Rab8 Qf2 a5 Rb5 a4 Rab1 c3 Rxb6 Rxb6 Rxb6 Qc7 Rb7 Qc6 Qa7 f6 Be2'

In [32]:
vectorSize = 100
word2Vec = Word2Vec(vectorSize=vectorSize, seed=seed, inputCol="moves_list", outputCol="moves_vec")

Parameters for `Word2Vec`
- `vectorSize`: size of the output vector, the choice of 100 is arbitrary here
- `minCount`: Ignores all words with total frequency lower than this.
- `inputCol`: `moves_list` is the input feature, a list of `moves` split by space
- `outputCol`: `moves_vec` is the output feature, the transformed vector

The Word2Vec model will be trained with the corpus gathered from the `moves` in the training set. After which, it will transform both training and testing datasets' `moves` feature.

In [33]:
# split moves from string into list of strings, moves -> [move, move, ...]
train_moves_corpus = df_train.select(F.split(df_train.moves, '\s', -1).alias('moves_list'))
train_moves_corpus #.collect()[0][0]

DataFrame[moves_list: array<string>]

In [34]:
fitted_word2Vec = word2Vec.fit(train_moves_corpus)
print('Trained Word2Vec Model')
fitted_word2Vec.getVectors().show(5)

Trained Word2Vec Model
+-----+--------------------+
| word|              vector|
+-----+--------------------+
| Bxd2|[0.15923261642456...|
| Nxf6|[-0.1036005020141...|
|  a2+|[0.00299424631521...|
|Bxd5+|[-0.1865539848804...|
|  Kg8|[-0.2937827110290...|
+-----+--------------------+
only showing top 5 rows





In [35]:
'Transformed Moves for the 1st Match in Training Set: ', fitted_word2Vec.transform(train_moves_corpus).head().moves_vec

('Transformed Moves for the 1st Match in Training Set: ',
 DenseVector([0.0801, 0.0527, 0.0381, -0.0027, -0.0377, -0.0495, 0.0195, 0.0849, 0.0132, 0.0732, -0.1094, -0.0304, 0.1132, -0.0542, -0.1284, -0.0165, -0.034, -0.0655, -0.0232, 0.1114, 0.0473, -0.0779, -0.0428, 0.0102, -0.0605, -0.0117, -0.0537, -0.0395, 0.0376, -0.0755, 0.0671, 0.0369, 0.0548, 0.0066, 0.1328, 0.0624, -0.0662, -0.0139, 0.0254, 0.0208, -0.0083, -0.0289, 0.046, -0.072, -0.009, -0.0572, -0.0061, -0.0154, -0.0156, 0.08, 0.0911, -0.0302, -0.0606, -0.0485, 0.02, 0.026, 0.0439, 0.1367, -0.0823, 0.0053, 0.0516, -0.021, -0.0118, 0.0026, 0.0037, -0.0455, -0.0694, -0.0722, 0.0301, 0.009, -0.0422, -0.0853, 0.1044, 0.0771, -0.0991, -0.0081, 0.0616, 0.0114, -0.0498, 0.0937, -0.0183, 0.0079, 0.0393, -0.006, -0.0728, -0.0771, 0.0086, -0.0312, 0.0564, -0.0141, -0.0677, -0.0712, 0.0704, 0.0141, -0.0129, -0.1696, 0.0374, -0.0111, 0.1529, -0.0078]))

In [36]:
df_train = df_train.withColumn('moves_list', F.split(df_train.moves, '\s', -1))
df_train = fitted_word2Vec.transform(df_train)

df_val = df_val.withColumn('moves_list', F.split(df_val.moves, '\s', -1))
df_val = fitted_word2Vec.transform(df_val)

df_test = df_test.withColumn('moves_list', F.split(df_test.moves, '\s', -1))
df_test = fitted_word2Vec.transform(df_test)

## Feature Selection

In [39]:
features = 'rated', 'turns', 'status_draw', 'status_mate', 'status_resign', 'status_outoftime', 'clock', 'increment', 'white_rating', 'black_rating', 'open_cat', 'opening_ply', 'moves_vec'
target = 'winner',
columns = features + target
columns

('rated',
 'turns',
 'status_draw',
 'status_mate',
 'status_resign',
 'status_outoftime',
 'clock',
 'increment',
 'white_rating',
 'black_rating',
 'open_cat',
 'opening_ply',
 'moves_vec',
 'winner')

In [40]:
df_train.select(*columns).toPandas().head()

Unnamed: 0,rated,turns,status_draw,status_mate,status_resign,status_outoftime,clock,increment,white_rating,black_rating,open_cat,opening_ply,moves_vec,winner
0,True,55,0,0,1,0,10,0,2131,2018,1.0,10,"[0.08013723800805482, 0.05273693208582699, 0.0...",0.0
1,True,17,0,1,0,0,10,0,1442,1270,0.0,8,"[0.006477387772653909, -0.16206761164700284, -...",0.0
2,True,42,0,0,1,0,10,0,1463,1432,2.0,1,"[0.15237697943424183, 0.05841163913941099, -0....",0.0
3,True,31,0,0,1,0,10,0,1671,1671,0.0,3,"[-0.015539421008959893, -0.0945617536142949, 0...",0.0
4,True,39,0,1,0,0,10,8,1297,1413,0.0,4,"[0.09782497773472315, 0.0007970325726394851, 0...",0.0
