In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
def init_spark():
    """Build (or reuse) the SparkSession used throughout this notebook.

    The session targets a local master with four worker threads and is
    registered under the application name "Chess Predict".

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    builder = SparkSession.builder
    builder = builder.master("local[4]")
    builder = builder.appName("Chess Predict")
    # NOTE(review): this looks like a placeholder option with no effect --
    # TODO confirm it can be dropped.
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()


spark = init_spark()

In [3]:
# Random seed reused by the stochastic steps below (group split, Word2Vec).
seed = 42

## Load Dataset

In [4]:
# Read the games CSV: the first row holds the column names and the column
# types are inferred from the data.
csv_path = 'data/games.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=True)

## Data Inspection

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Preview the first rows. Limit in Spark *before* converting so that only
# five rows are shipped to the driver instead of the whole dataset.
df.limit(5).toPandas()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


### Inspect Datatypes

In [7]:
# df.dtypes is a list of (column name, type name) pairs; transposing puts
# all names in row 0 and the matching Spark type names in row 1.
dtype_pairs = np.array(df.dtypes)
dtype_pairs.T

array([['id', 'rated', 'created_at', 'last_move_at', 'turns',
        'victory_status', 'winner', 'increment_code', 'white_id',
        'white_rating', 'black_id', 'black_rating', 'moves',
        'opening_eco', 'opening_name', 'opening_ply'],
       ['string', 'boolean', 'double', 'double', 'int', 'string',
        'string', 'string', 'string', 'int', 'string', 'int', 'string',
        'string', 'string', 'int']], dtype='<U14')

In [8]:
# Bucket the column names by their Spark type name.
categorical, numerical = [], []
for feature, dtype in df.dtypes:
    if dtype in {'string', 'boolean'}:
        categorical.append(feature)
    elif dtype in {'double', 'int'}:
        numerical.append(feature)

In [9]:
print('Numerical features')
for feature in numerical:
    distinct_rows = df.select(feature).distinct()
    n_distinct = distinct_rows.count()
    # High-cardinality columns: report only how many distinct values exist.
    if n_distinct >= 50:
        print(f'{feature:20s}:{n_distinct} unique values')
        continue
    # Low-cardinality columns: list the distinct values themselves.
    distinct_values = [row[feature] for row in distinct_rows.collect()]
    print(f'{feature:20s}:{distinct_values}')

Numerical features
created_at          :13151 unique values
last_move_at        :13186 unique values
turns               :211 unique values
white_rating        :1516 unique values
black_rating        :1521 unique values
opening_ply         :[28, 12, 22, 1, 13, 6, 16, 3, 20, 5, 19, 15, 9, 17, 4, 8, 7, 10, 24, 11, 14, 2, 18]


- `created_at`: Timestamp in UTC
- `last_move_at`: Timestamp in UTC
- `turns`: Number of turns in the match
- `white_rating`: white player rating
- `black_rating`: black player rating
- `opening_ply`: Number of plies used to set up opening

In [10]:
print('Categorical features')
for feature in categorical:
    distinct_rows = df.select(feature).distinct()
    n_distinct = distinct_rows.count()
    # High-cardinality columns: report only how many distinct values exist.
    if n_distinct >= 50:
        print(f'{feature:20s}:{n_distinct} unique values')
        continue
    # Low-cardinality columns: list the distinct values themselves.
    distinct_values = [row[feature] for row in distinct_rows.collect()]
    print(f'{feature:20s}:{distinct_values}')

Categorical features
id                  :19113 unique values
rated               :[True, False]
victory_status      :['resign', 'outoftime', 'mate', 'draw']
winner              :['white', 'black', 'draw']
increment_code      :400 unique values
white_id            :9438 unique values
black_id            :9331 unique values
moves               :18920 unique values
opening_eco         :365 unique values
opening_name        :1477 unique values


- `id`: Game ID, uniquely identifies a match record
- `rated`: If rated, the game result affects player ratings
- `victory_status`: How the game ended
- `winner`: Match winner
- `increment_code`: Game time setting
- `white_id`: white player id
- `black_id`: black player id
- `moves`: Sequence of moves recorded during the match
- `opening_eco`: ECO classification code for the chess openings moves
- `opening_name`: Name of opening moves

### Check for Anomalies

In [11]:
# Compare the number of distinct rows with the total row count to spot duplicates.
n_unique_rows = df.distinct().count()
n_total_rows = df.count()
print( f'unique samples / total samples: {n_unique_rows} / {n_total_rows} ' )

unique samples / total samples: 19629 / 20058 


In [12]:
print('Count Null Values in each Column')
# F.when(...) without otherwise() yields NULL for rows that do not match,
# and F.count() skips NULLs, so each aggregate counts exactly the rows in
# which the column is NULL.
null_count_exprs = [
    F.count(F.when(F.col(name).isNull(), 1)).alias(name)
    for name in df.columns
]
df.select(null_count_exprs).toPandas()

Count Null Values in each Column


Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


- There are duplicate rows, which should be dropped from the dataset.
- Luckily, this dataset does not contain missing values.

## Data Preparation (Preprocessing)
Scikit-Learn offers a range of useful methods for preprocessing and data splits. With the approval from the course instructor, we will transform the datasets into *Pandas DataFrames* in this part.

### Fix Anomalies

In [13]:
# Remove rows that are exact duplicates across every column.
# NOTE(review): the inspection above reports 19113 unique `id` values but
# 19629 distinct rows, so some game ids still occur more than once after
# this step -- confirm whether those should be de-duplicated by `id` as well.
df = df.dropDuplicates()

### Feature `opening_eco`
As per inspection, each value of this feature is a letter that denotes an [opening moves category](https://www.365chess.com/eco.php) concatenated with a two-digit code (e.g. `C48`). Although there are variations within each category, we assume that the opening moves in each category are similar enough that we can ignore the differences within a category. Therefore, we extract the first letter from `opening_eco` into a new feature `open_cat` and ignore `opening_eco` during training.

In [14]:
# Peek at a few raw ECO codes.
df.select('opening_eco').take(3)

[Row(opening_eco='C48'), Row(opening_eco='C50'), Row(opening_eco='B06')]

In [15]:
# Extract the leading category letter from opening_eco.
# Spark string positions are 1-based, so the first character is at position 1
# (position 0 only works because Spark treats it like 1).
df = df.withColumn('open_cat', df.opening_eco.substr(1, 1))
df.select('open_cat').distinct().show()

+--------+
|open_cat|
+--------+
|       E|
|       B|
|       D|
|       C|
|       A|
+--------+



### Extract features and target columns

In [16]:
# Separate the prediction target from the remaining columns.
target_col = 'winner'
X = df.drop(target_col)
y = df.select(target_col)

### Split into Train/Test Sets
1. Encode player ids into numerical values.
  - Ensure that each player is assigned one and only one numerical id.
2. Use encoded player ids to split dataset based on groups.
  - Each group contains the matches played by one player. Make sure that each group is sampled evenly in the training set.

In [17]:
# This section uses pandas DF
# Collect the data once and split features/target in pandas. Converting X and
# y separately runs two independent Spark jobs over a shuffled (distinct)
# DataFrame, so their row order -- and hence the pairing of each feature row
# with its target -- would not be guaranteed to match.
df_pd = df.toPandas()
X_pd = df_pd.drop(columns=['winner'])
y_pd = df_pd[['winner']]

In [18]:
# Encode player ids as integers over the *combined* white/black columns so a
# player receives the same code regardless of the colour played.
id_columns = ['white_id', 'black_id']
stacked_ids = X_pd[id_columns].stack()
player_codes, _ = pd.factorize(stacked_ids.values)
white_black_ids = pd.Series(player_codes, stacked_ids.index).unstack()

X_pd['white_id_num'] = white_black_ids['white_id']
X_pd['black_id_num'] = white_black_ids['black_id']

X_pd[['white_id', 'white_id_num', 'black_id', 'black_id_num']]

Unnamed: 0,white_id,white_id_num,black_id,black_id_num
0,lagos16,0,doom12384,1
1,hill_j,2,karfedericol,3
2,gr8_m8_m8_im_ir8,4,schachlerno,5
3,duckduckfrog,6,ducksandcats,7
4,valerioneto,8,christina-a-11,9
...,...,...,...,...
19624,malatestabr,15632,dumbluck,192
19625,shatrandjbaz,15633,craciun05,1865
19626,euggeo,8944,craciun05,1865
19627,kferapont,1114,benedictine,6279


In [19]:
from sklearn.model_selection import GroupShuffleSplit

In [20]:
# Hold out 20% of the white-player groups, so that all games a player played
# as white end up on the same side of the split.
# NOTE(review): groups are keyed on white_id_num only, so a player may still
# appear as black on the other side of the split -- confirm this is intended.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
train_idx, test_idx = next(gss.split(X_pd, y_pd, groups=X_pd['white_id_num']))

# split() yields positional indices, so rows must be selected with .iloc;
# label-based .loc only gives the same rows while the index happens to be a
# default RangeIndex.
X_train = X_pd.iloc[train_idx]
y_train = y_pd.iloc[train_idx]

X_test = X_pd.iloc[test_idx]
y_test = y_pd.iloc[test_idx]

In [21]:
# Check that no white-player ID occurs in both the training and the test set.
x_train_values = set(X_train['white_id'].values)
x_test_values = set(X_test['white_id'].values)

# Number of white-player IDs shared by the two sets; must be zero.
count = len(x_train_values & x_test_values)
assert count == 0, 'A group is present in both training and test sets'

### Create Spark DF from Pandas DF

In [22]:
# Convert the four pandas frames back into Spark DataFrames, in the same order.
X_train, y_train, X_test, y_test = (
    spark.createDataFrame(frame)
    for frame in (X_train, y_train, X_test, y_test)
)

### Feature `moves`
As per inspection, this feature records the sequence of moves played during the match. Because the sequence, if parsed correctly, reveals the winning player, we need to either drop this feature or obfuscate this information. Spark supports the Word2Vec feature transformation, which turns the sequence into a fixed-size vector. This vector representation is independent of the order of the moves, which obfuscates part of the information contained in this feature.

In [23]:
from pyspark.ml.feature import Word2Vec

In [24]:
# Show the raw move string of the first training row.
X_train.select('moves').first()['moves']

'e4 e5 Nf3 Nf6 Nc3 Nc6 Bb5 a6 Bxc6 dxc6 O-O Bg4 d3 Bb4 Bd2 O-O a3 Ba5 b4 Bb6 a4 c5 a5 Ba7 Rb1 b6 bxc5 Rb8 axb6 cxb6 cxb6 Rxb6 Rxb6 Bxb6 Na4 Bxf3 Qxf3 Nd7 Be3 Bxe3 Qxe3 Qa5 Nb2 Qc3 Qc1 Rc8 Nc4 Nc5 Qb2 Na4 Qxc3 Nxc3 Nxe5 Ne2+ Kh1 Rxc2 Ra1 Nc3 h3 f6 Nf3 Rxf2 Rxa6 g5 Nd4 Rd2 Rxf6 Rxd3 Nf5 Nxe4 Re6 Nf2+ Kh2 Kf7 Re2 Nd1 Nh6+ Kg6 Ng4 Nc3 Ne5+ Kh5 Nxd3 Nxe2 g4+ Kg6 Ne5+ Kf6 Nf3 Nf4 Kg3 h5 gxh5 Nxh5+ Kg4 Nf4 Nxg5 Nxh3 Nxh3'

In [25]:
# Dimensionality of the move embedding.
vectorSize = 100
word2Vec = Word2Vec(
    inputCol="moves_list",
    outputCol="moves_vec",
    vectorSize=vectorSize,
    seed=seed,
)

Parameters for `Word2Vec`
- `vectorSize`: size of the output vector, the choice of 100 is arbitrary here
- `minCount`: Ignores all words with total frequency lower than this; it is not set in the cell above, so Spark's default value applies.
- `inputCol`: `moves_list` is the input feature, a list of `moves` split by space
- `outputCol`: `moves_vec` is the output feature, the transformed vector

The Word2Vec model will be trained with the corpus gathered from the `moves` in the training set. After which, it will transform both training and testing datasets' `moves` feature.

In [26]:
# split moves from string into list of strings, moves -> [move, move, ...]
# The regex is a raw string: a plain '\s' is an invalid escape sequence that
# Python only tolerates with a warning.
train_moves_list = X_train.select(F.split(X_train.moves, r'\s', -1).alias('moves_list'))
train_moves_list

DataFrame[moves_list: array<string>]

In [27]:
# Learn the move embeddings from the training corpus only.
fitted_word2Vec = word2Vec.fit(train_moves_list)
print('Trained Word2Vec Model')
learned_vectors = fitted_word2Vec.getVectors()
learned_vectors.show(5)

Trained Word2Vec Model
+-----+--------------------+
| word|              vector|
+-----+--------------------+
| Bxd2|[-0.1266546249389...|
| Nxf6|[0.29022216796875...|
|  a2+|[-0.0953406244516...|
|Bxd5+|[-0.0100416885688...|
|  Kg8|[-0.1110763028264...|
+-----+--------------------+
only showing top 5 rows



In [28]:
# Embed the training move lists and display the vector of the first match.
first_match_row = fitted_word2Vec.transform(train_moves_list).first()
('Transformed Moves for the 1st Match in Training Set: ', first_match_row.moves_vec)

('Transformed Moves for the 1st Match in Training Set: ',
 DenseVector([0.0573, 0.1425, 0.0121, 0.0129, -0.0134, -0.0847, -0.0186, 0.0562, -0.0182, 0.0671, -0.0916, 0.0292, 0.0069, 0.002, -0.0676, -0.0436, 0.0509, 0.0401, -0.0087, 0.0217, -0.0279, -0.0171, -0.0066, -0.0231, -0.0284, -0.0292, -0.0606, -0.0912, 0.0858, 0.0427, -0.004, -0.0192, -0.0162, 0.0587, 0.0311, -0.052, -0.0709, 0.043, 0.058, -0.0143, -0.0042, 0.0059, 0.0179, -0.0585, -0.0182, -0.0185, 0.037, 0.0582, 0.026, 0.0995, -0.0034, -0.007, -0.0572, 0.0209, 0.092, 0.0257, 0.003, 0.0246, 0.0917, -0.0003, 0.0738, 0.0628, -0.0136, 0.0531, 0.0273, -0.0589, -0.0159, -0.0734, 0.0622, 0.1045, -0.0245, -0.0768, 0.0263, 0.006, -0.1302, 0.0315, 0.0265, -0.0744, 0.0901, 0.1239, -0.0127, -0.0006, 0.0839, 0.0085, -0.044, -0.0328, -0.044, 0.0254, -0.0079, 0.0789, -0.0249, -0.008, 0.0044, 0.0311, 0.0017, -0.0421, -0.0252, -0.004, 0.0814, 0.0266]))

In [29]:
# Tokenise each move string on whitespace and embed it with the Word2Vec
# model that was fitted on the training set. The regex is a raw string: a
# plain '\s' is an invalid escape sequence that Python only tolerates with a
# warning.
X_train = X_train.withColumn('moves_list', F.split(X_train.moves, r'\s', -1))
X_train = fitted_word2Vec.transform(X_train)

X_test = X_test.withColumn('moves_list', F.split(X_test.moves, r'\s', -1))
X_test = fitted_word2Vec.transform(X_test)

In [31]:
# Preview the model inputs. Limit in Spark *before* converting so that only
# five rows (and their 100-dimensional vectors) are shipped to the driver.
preview_cols = ['rated', 'turns', 'victory_status', 'increment_code',
                'white_rating', 'black_rating', 'open_cat', 'moves_vec']
X_test.select(*preview_cols).limit(5).toPandas()

Unnamed: 0,rated,turns,victory_status,increment_code,white_rating,black_rating,open_cat,moves_vec
0,False,27,resign,10+0,1501,1501,C,"[0.12456387053761217, 0.10505826923030394, 0.0..."
1,True,98,mate,10+0,1834,1848,D,"[0.008515973289126568, 0.09164226585904098, 0...."
2,False,124,resign,5+5,2186,1017,C,"[0.0009078230922891487, 0.0554626110618213, -0..."
3,True,5,resign,8+8,1674,1095,A,"[0.13264439702033998, 0.24680592715740204, -0...."
4,True,38,mate,10+0,1387,1406,B,"[0.0915889980847408, 0.10312156854687553, 0.09..."
