In [1]:
import os
import sys
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
def init_spark():
    return SparkSession \
        .builder \
        .master("local[4]") \
        .appName("Chess Predict") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

spark = init_spark()

In [3]:
seed = 42

## Load Dataset

In [4]:
df = spark.read.csv('data/games.csv', header=True, inferSchema=True)

## Data Inspection

In [5]:
import pyspark.pandas as ps
import pandas as pd



In [6]:
df.toPandas().head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


### Inspect Datatypes

In [7]:
df.dtypes

[('id', 'string'),
 ('rated', 'boolean'),
 ('created_at', 'double'),
 ('last_move_at', 'double'),
 ('turns', 'int'),
 ('victory_status', 'string'),
 ('winner', 'string'),
 ('increment_code', 'string'),
 ('white_id', 'string'),
 ('white_rating', 'int'),
 ('black_id', 'string'),
 ('black_rating', 'int'),
 ('moves', 'string'),
 ('opening_eco', 'string'),
 ('opening_name', 'string'),
 ('opening_ply', 'int')]

In [8]:
categorical = [feature for feature, dtype in df.dtypes if dtype in {'string', 'boolean'}]
numerical = [feature for feature, dtype in df.dtypes if dtype in {'double', 'int'}]

#### Numerical Features

In [9]:
for col in numerical:
    unique_values = df.select(col).distinct()
    n_unique = unique_values.count()
    if n_unique < 50:
        print(f'{col:20s}:{[row[col] for row in unique_values.collect()]}')
    else:
        print(f'{col:20s}:{n_unique} unique values')

created_at          :13151 unique values
last_move_at        :13186 unique values
turns               :211 unique values
white_rating        :1516 unique values
black_rating        :1521 unique values
opening_ply         :[28, 12, 1, 13, 6, 16, 3, 20, 5, 19, 15, 9, 17, 4, 8, 7, 10, 11, 14, 2, 18, 22, 24]


- `created_at`: Timestamp in UTC
- `last_move_at`: Timestamp in UTC
- `turns`: Number of turns in the match
- `white_rating`: white player rating
- `black_rating`: black player rating
- `opening_ply`: Number of plies used to set up opening

#### Categorical Features

In [10]:
for col in categorical:
    unique_values = df.select(col).distinct()
    n_unique = unique_values.count()
    if n_unique < 50:
        print(f'{col:20s}:{[row[col] for row in unique_values.collect()]}')
    else:
        print(f'{col:20s}:{n_unique} unique values')

id                  :19113 unique values
rated               :[True, False]
victory_status      :['resign', 'outoftime', 'mate', 'draw']
winner              :['white', 'black', 'draw']
increment_code      :400 unique values
white_id            :9438 unique values
black_id            :9331 unique values
moves               :18920 unique values
opening_eco         :365 unique values
opening_name        :1477 unique values


- `id`: Game ID, uniquely identifies a match record
- `rated`: If rated, the game result affects player ratings
- `victory_status`: How the game ended
- `winner`: Match winner
- `increment_code`: Game time setting
- `white_id`: white player id
- `black_id`: black player id
- `moves`: Sequence of moves recorded during the match
- `opening_eco`: ECO classification code for the chess openings moves
- `opening_name`: Name of opening moves

### Look for Anomalies

In [11]:
print( f'unique samples / total samples: {df.distinct().count()} / {df.count()} ' )

unique samples / total samples: 19629 / 20058 


In [12]:
print('Count Null Values in each Column')
df.select([F.count(F.when(F.isnull(col), col)).alias(col) for col in df.columns]).toPandas()

Count Null Values in each Column


Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


- There are duplicate rows, which should be dropped from the dataset.
- Luckily, this dataset does not contain missing values.

## Data Preparation (Preprocessing)
Scikit-Learn offers a range of useful methods for preprocessing and data splits. With the approval from the course instructor, we will transform the datasets into *Pandas DataFrames* in this part.

In [13]:
from sklearn.preprocessing import OneHotEncoder

### Fix Anomalies

In [14]:
# Drop Duplicates
df = df.distinct()

### Extract features and target columns

In [15]:
X = df.drop('winner')
y = df.select('winner')

### Feature `increment_code`
As per inspection, the feature contains a string with two numbers separated by `+`. After research, we found that the first number refers to initial total clock time per player in *minutes*; the second number refers to the number of *seconds* added to the total clock time after the player makes a move. We decided to extract these two numbers as two separate features `clock` and `increment` replacing `increment_code`.

In [16]:
X.select('increment_code').head(3)

[Row(increment_code='30+25'),
 Row(increment_code='10+0'),
 Row(increment_code='10+3')]

In [17]:
splits = F.split(df.increment_code, '[+]')
X = X.withColumn('clock', splits.getItem(0)).withColumn('increment', splits.getItem(1))
X.select('clock', 'increment').tail(3)

[Row(clock='5', increment='60'),
 Row(clock='8', increment='0'),
 Row(clock='3', increment='10')]

### Feature `opening_eco`
As per inspection, each row of this feature is a concatenation of a letter that denotes an [opening moves category](https://www.365chess.com/eco.php). Although there are variations within each category, we assume that the opening moves in each category to be similar enough that we can ignore the differences within each opening move category. Therefore, we extract the first letter from `opening_eco` to a new feature `open_cat` and ignore `opening_eco` during training.

In [18]:
X.select('opening_eco').head(3)

[Row(opening_eco='B11'), Row(opening_eco='B45'), Row(opening_eco='C60')]

In [19]:
# extract first letter from opening_eco
X = X.withColumn('open_cat', X.opening_eco.substr(0, 1))
X.select('open_cat').distinct().show()

+--------+
|open_cat|
+--------+
|       E|
|       B|
|       D|
|       C|
|       A|
+--------+



### Feature `rated`
Contains boolean values only and does not require preprocessing. Rated games affect player ratings and may affect performance of the player. We may choose to separate rated games from unrated games for this fact, here we did not.

### Split into Train/Test Sets
1. Encode player ids into numerical values.
  - Ensure that each player is assigned one and only one numerical id.
2. Use encoded player ids to split dataset based on groups.
  - Each group contains the matches played by one player. Make sure that each group are sampled evenly in the training set.

#### Scikit-Learn Split methods cannot handle distributed datasets.

In [20]:
X_pd = X.toPandas()
y_pd = y.toPandas()

#### Encode categorical player ids

In [21]:
player_ids = pd.DataFrame(X_pd[['white_id', 'black_id']])
player_ids = player_ids.stack().pipe(lambda s: pd.Series(pd.factorize(s.values)[0], s.index)).unstack()

X_pd['white_id_num'] = player_ids['white_id']
X_pd['black_id_num'] = player_ids['black_id']

X_pd[['white_id', 'white_id_num', 'black_id', 'black_id_num']]

Unnamed: 0,white_id,white_id_num,black_id,black_id_num
0,konst767,0,ducksandcats,1
1,everybodylovesjesus,2,ahmd11,3
2,jeff1983,4,bekzodjon,5
3,mellowg7,6,jesteroz,7
4,omnivoid,8,hasan_al-banna,9
...,...,...,...,...
19624,demontechristo66,9815,eie24,3698
19625,r-mohamadi55,15633,eie24,3698
19626,kastorcito,5322,ed84,4132
19627,ssf7,3469,casr,15634


#### Data Split

In [22]:
from sklearn.model_selection import GroupShuffleSplit

In [23]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
train_idx, test_idx = next(gss.split(X_pd, y_pd, groups=X_pd['white_id_num']))

X_train = X_pd.iloc[train_idx]
y_train = y_pd.iloc[train_idx]

X_test = X_pd.iloc[test_idx]
y_test = y_pd.iloc[test_idx]

In [24]:
# Proof that none of the IDs from the training set are present in the test set.
x_train_values = set(X_train['white_id'].values)
x_test_values = set(X_test['white_id'].values)

count = 0
for val in x_train_values:
  if val in x_test_values:
    count += 1
assert count == 0, 'A group is present in both training and test sets'

### Feature `victory_status`
From inspection, we observe 4 unique values for this categorical feature: `'resign', 'outoftime', 'mate', 'draw'`. We perform one-hot encoding before feeding this into the model.

In [25]:
enc = OneHotEncoder(sparse=False, dtype=int, handle_unknown='ignore')
enc.fit(X_train['victory_status'].to_numpy().reshape(-1, 1))
encoded_feature_names = 'status_' + enc.categories_[0]
encoded_feature_names

array(['status_draw', 'status_mate', 'status_outoftime', 'status_resign'],
      dtype=object)

In [26]:
one_hot_status = enc.transform(X_train['victory_status'].to_numpy().reshape(-1, 1))
X_train.loc[:,encoded_feature_names] = one_hot_status
X_train[encoded_feature_names].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,encoded_feature_names] = one_hot_status


Unnamed: 0,status_draw,status_mate,status_outoftime,status_resign
0,0,0,0,1
1,0,0,0,1
3,0,1,0,0
4,0,0,0,1
6,0,0,0,1


In [27]:
one_hot_status = enc.transform(X_test['victory_status'].to_numpy().reshape(-1, 1))
X_test.loc[:,encoded_feature_names] = one_hot_status
X_test[encoded_feature_names].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.loc[:,encoded_feature_names] = one_hot_status


Unnamed: 0,status_draw,status_mate,status_outoftime,status_resign
2,0,0,0,1
5,0,1,0,0
10,0,0,0,1
11,0,0,0,1
14,0,0,0,1


### Create Spark DF from Pandas DF Train/Test Set

In [28]:
X_train = spark.createDataFrame(X_train)
y_train = spark.createDataFrame(y_train)
X_test = spark.createDataFrame(X_test)
y_test = spark.createDataFrame(y_test)

### Feature `moves`
As per inspection, this feature records the list/sequence of moves during the match. Because that the sequence, if parsed correctly, actually indicates the winning player, we need to either drop this feature or obfuscate this information. We find that Spark supports Word2Vec feature transformation, which will turn the sequence into a vector. Such vector representation is independent of the order of the moves, and this can obfuscate part of the information lied within this feature.

In [29]:
from pyspark.ml.feature import Word2Vec

In [30]:
X_train.select('moves').head().moves

'e4 c6 Nc3 d5 Nf3 Bg4 h3 Bh5 exd5 cxd5 Bb5+ Nc6 g4 Bg6 Ne5 Qb6 Nxg6 hxg6 Nxd5 Qc5 Qf3 Rc8 Nc3 e6 d3 Qe5+ Qe4 Bd6 Be3 Ne7 Bd4 Qxe4+ Nxe4 Bb8 Bxg7 Rh4 Bf6 Rh6'

In [31]:
vectorSize = 100
word2Vec = Word2Vec(vectorSize=vectorSize, seed=seed, inputCol="moves_list", outputCol="moves_vec")

Parameters for `Word2Vec`
- `vectorSize`: size of the output vector, the choice of 100 is arbitrary here
- `minCount`: Ignores all words with total frequency lower than this.
- `inputCol`: `moves_list` is the input feature, a list of `moves` split by space
- `outputCol`: `moves_vec` is the output feature, the transformed vector

The Word2Vec model will be trained with the corpus gathered from the `moves` in the training set. After which, it will transform both training and testing datasets' `moves` feature.

In [32]:
# split moves from string into list of strings, moves -> [move, move, ...]
train_moves_corpus = X_train.select(F.split(X_train.moves, '\s', -1).alias('moves_list'))
train_moves_corpus #.collect()[0][0]

DataFrame[moves_list: array<string>]

In [33]:
fitted_word2Vec = word2Vec.fit(train_moves_corpus)
print('Trained Word2Vec Model')
fitted_word2Vec.getVectors().show(5)

Trained Word2Vec Model
+-----+--------------------+
| word|              vector|
+-----+--------------------+
| Bxd2|[0.08646679669618...|
| Nxf6|[-0.3127570748329...|
|  a2+|[0.02550127729773...|
|Bxd5+|[0.14410454034805...|
|  Kg8|[-0.1851859688758...|
+-----+--------------------+
only showing top 5 rows





In [34]:
'Transformed Moves for the 1st Match in Training Set: ', fitted_word2Vec.transform(train_moves_corpus).head().moves_vec

('Transformed Moves for the 1st Match in Training Set: ',
 DenseVector([0.1149, 0.0761, 0.019, 0.0671, -0.1265, -0.0495, -0.0001, 0.0269, -0.0027, 0.0543, -0.0921, -0.043, 0.0581, 0.0162, -0.0572, -0.0128, 0.0236, -0.0254, 0.0416, 0.104, -0.0107, -0.0327, -0.0123, -0.0158, -0.0854, -0.0706, -0.1003, -0.0493, 0.0998, -0.0557, 0.025, 0.0252, 0.0767, -0.0238, 0.1444, 0.1162, -0.0034, 0.0435, -0.0162, -0.1023, -0.0363, 0.0349, -0.0397, -0.0861, -0.1683, -0.1017, -0.0548, 0.0306, -0.0268, 0.0998, -0.0735, 0.0093, 0.0021, -0.0197, -0.0052, 0.0371, 0.033, 0.0168, -0.0196, 0.0726, -0.0009, 0.0104, -0.0574, 0.0021, -0.0818, -0.0809, 0.0882, 0.0114, -0.0434, -0.0145, 0.0182, -0.0209, -0.0122, 0.1118, 0.0584, -0.0295, -0.0278, -0.0114, 0.115, 0.1062, -0.0852, -0.0015, 0.0958, 0.0385, -0.0011, 0.0064, 0.0006, -0.0648, 0.0568, -0.0183, -0.0443, -0.1293, 0.0597, 0.0492, -0.0374, -0.133, 0.0243, -0.056, 0.0682, 0.0676]))

In [35]:
X_train = X_train.withColumn('moves_list', F.split(X_train.moves, '\s', -1))
X_train = fitted_word2Vec.transform(X_train)

X_test = X_test.withColumn('moves_list', F.split(X_test.moves, '\s', -1))
X_test = fitted_word2Vec.transform(X_test)

## Feature Selection

In [36]:
features = 'rated', 'turns', 'status_draw', 'status_mate', 'status_resign', 'status_outoftime', 'clock', 'increment', 'white_rating', 'black_rating', 'open_cat', 'opening_ply', 'moves_vec'

In [38]:
X_test.select(*features).toPandas().head()

Unnamed: 0,rated,turns,status_draw,status_mate,status_resign,status_outoftime,clock,increment,white_rating,black_rating,open_cat,opening_ply,moves_vec
0,False,40,0,0,1,0,10,3,1850,1804,C,5,"[0.058609917113790294, 0.07966271424520528, -0..."
1,True,88,0,1,0,0,10,0,1437,1635,C,3,"[0.0038684618636828172, 0.052846872710771015, ..."
2,False,19,0,0,1,0,45,0,2030,2264,B,6,"[0.07270992459043076, 0.14869173645581069, -0...."
3,True,39,0,0,1,0,10,0,1951,1862,A,2,"[0.037499880083860494, 0.12510147299139926, 0...."
4,True,11,0,0,1,0,15,15,1473,1166,C,3,"[0.09142499065703967, 0.03824410198087042, -0...."
