In [1]:
import os
import sys
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.ml.feature import Word2Vec

In [2]:
def init_spark(master: str = "local[4]", app_name: str = "Chess Predict") -> SparkSession:
    """Create the notebook's SparkSession, or return the one already running.

    Args:
        master: Spark master URL. The default, ``"local[4]"``, runs Spark
            locally with four worker threads.
        app_name: Application name registered with Spark.

    Returns:
        The ``SparkSession`` produced by ``getOrCreate()``.
    """
    builder = (
        SparkSession.builder
        .master(master)
        .appName(app_name)
        # NOTE(review): placeholder key/value; nothing visible in this notebook
        # reads it. Kept so the session configuration is unchanged -- confirm
        # whether it can be dropped.
        .config("spark.some.config.option", "some-value")
    )
    return builder.getOrCreate()


spark = init_spark()

In [3]:
# Random seed for stochastic steps; passed to Word2Vec as `seed=seed`.
seed = 42

## Load Dataset

In [4]:
# Read the games CSV, taking column names from its first row. No schema or
# inferSchema option is supplied, so Spark loads every column as a string.
df = spark.read.options(header=True).csv('data/games.csv')

## Data Exploration

In [5]:
# List the column names to see which features are available.
print(df.columns)

['id', 'rated', 'created_at', 'last_move_at', 'turns', 'victory_status', 'winner', 'increment_code', 'white_id', 'white_rating', 'black_id', 'black_rating', 'moves', 'opening_eco', 'opening_name', 'opening_ply']


## Data Preprocessing

### Feature `opening_eco`

In [6]:
# Derive a coarse opening category: the first character of `opening_eco`.
# Column.substr follows SQL's 1-based positions, so the first character starts
# at position 1. Spark also accepts 0 and treats it like 1, but 1 is the
# documented convention and does not read like an off-by-one.
df = df.withColumn('open_cat', df.opening_eco.substr(1, 1))
df.select('open_cat').show(5)

+--------+
|open_cat|
+--------+
|       D|
|       B|
|       C|
|       D|
|       C|
+--------+
only showing top 5 rows



### Feature `black_id` and `white_id`

Reference: [`StringIndexer` (PySpark 3.1.1 API docs)](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.ml.feature.StringIndexer.html#stringindexer)

In [7]:
# Count distinct players: stack the white and black IDs into one `id` column,
# then de-duplicate before counting.
(
    df.selectExpr('white_id AS id')
    .union(df.selectExpr('black_id AS id'))
    .distinct()
    .count()
)

15635

### Feature `moves`

In [8]:
# Tokenise each game: split the `moves` string on whitespace,
# moves -> [move, move, ...].
# The pattern is a raw string so `\s` reaches Spark's regex engine as written,
# rather than relying on Python tolerating an invalid escape sequence
# (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
moves_list_df = df.select(F.split(df.moves, r'\s', -1).alias('moves_list'))
moves_list_df

DataFrame[moves_list: array<string>]

In [9]:
# (may be useful later) Build one corpus string out of every game's moves.
# Step 1: gather all `moves` strings into a single array column.
moves_agg_df = df.agg(
    F.collect_list('moves').alias('agg_moves_list')
)
# Step 2: join the array elements with single spaces.
moves_corpus = moves_agg_df.select(
    F.array_join(moves_agg_df.agg_moves_list, ' ').alias('joined_corpus')
)
moves_corpus  # shows only the DataFrame's schema repr; call .collect()[0][0] for the text

DataFrame[joined_corpus: string]

In [10]:
# Word2Vec estimator over the tokenised moves. The output (embedding) column
# is named "model"; the transform cell reads it back as `.model`.
vectorSize = 100
word2Vec = Word2Vec(
    inputCol="moves_list",
    outputCol="model",
    vectorSize=vectorSize,
    seed=seed,
)

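Parameters for `Word2Vec`:
- `vectorSize`: dimensionality of each learned word (move) vector, and therefore of the per-game vector that `transform` produces by averaging them.
- `minCount`: minimum number of times a token must appear to be kept in the vocabulary; rarer tokens are ignored. It is not set here, so the PySpark default (5) applies.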

In [11]:
# Fit the Word2Vec estimator on the tokenised games, then preview a few of
# the learned per-move vectors.
model = word2Vec.fit(dataset=moves_list_df)
print('Trained Word2Vec Model')
model.getVectors().show(n=5)

Trained Word2Vec Model
+-----+--------------------+
| word|              vector|
+-----+--------------------+
| Bxd2|[0.20013783872127...|
| Nxf6|[-0.0895327404141...|
|  a2+|[0.00983080640435...|
|axb7+|[-0.0051464270800...|
|Bxd5+|[0.15780340135097...|
+-----+--------------------+
only showing top 5 rows



In [12]:
# Embed the games and inspect the first one. The fitted model writes each
# game's vector to the "model" column (the outputCol chosen on the estimator),
# so `.model` here is a Row field, not the Word2VecModel object.
# The heading is printed on its own: the former empty f-string argument only
# appended a trailing space.
print('Transformed Moves for the First Match')
model.transform(moves_list_df).head().model

Transformed Moves for the First Match 


DenseVector([0.0806, 0.049, 0.0171, 0.094, -0.2188, 0.0144, 0.0062, 0.0342, -0.0275, 0.0866, 0.0437, -0.054, 0.1169, -0.1548, -0.0953, 0.0017, 0.0528, -0.1113, 0.0859, 0.1969, -0.1144, -0.0303, -0.1103, 0.0432, -0.1319, -0.0056, -0.0772, -0.1376, -0.0915, -0.0633, -0.1008, 0.0676, 0.166, -0.1348, 0.1817, -0.0032, 0.0731, 0.0235, 0.0382, -0.0444, -0.0811, 0.0599, 0.0422, -0.1846, -0.2262, -0.0182, -0.1167, 0.1092, -0.0119, 0.2056, -0.11, -0.1343, -0.0981, -0.0275, -0.1241, 0.0801, 0.1296, 0.0306, 0.0335, 0.1004, 0.1729, 0.1424, -0.0856, -0.1352, 0.0709, 0.0421, 0.3001, 0.0955, 0.004, -0.1982, 0.0782, 0.139, 0.0668, 0.1588, -0.106, -0.0414, -0.0703, 0.1176, -0.1553, 0.0089, 0.0285, 0.0383, -0.0568, 0.108, 0.0509, -0.044, 0.0625, -0.052, -0.0909, 0.0422, -0.0683, 0.0577, 0.007, 0.0895, -0.0421, -0.0052, 0.0461, -0.0412, -0.0446, 0.0173])