In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


import pandas as pd
import numpy as np

from fractions import Fraction as frac

In [2]:
# Build (or fetch an already-running) local Spark session for this notebook,
# with the driver memory raised to 12 GB.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("fa21-ds5110-group10-rk")
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)

In [3]:
# Display the session's SparkContext as this cell's output.
spark.sparkContext

First we add in our cached dataset from our prior feature engineering.

In [4]:
# Load the three processed parquet datasets in one pass:
#   df         - training split
#   testing_df - testing split
#   df_all     - full modelling dataset (used later to fit the feature encoders)
df, testing_df, df_all = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "../../../data/processed/training.parquet",
        "../../../data/processed/testing.parquet",
        "../../../data/processed/chess_games_moves_model.parquet",
    )
)

In [5]:
# Cache the training DataFrame; the exploration cell below queries it several times.
# The returned DataFrame is this cell's displayed output.
df.cache()

DataFrame[event: string, white_result: string, first_two: array<string>, ECO: string, EloDiff: int, Opening: string, game_complexity: int]

Next we review the resulting data points of interest.
We notice that ECO and the first two sets of moves are distinct from one another, and each may influence the overall model's prediction.

In [6]:
# Cardinality of each categorical column we are considering as a feature.
print("First Turns: {}".format(df.select("first_two").distinct().count()))
print("ECO Types: {}".format(df.select("ECO").distinct().count()))
print("Opening Variants: {}".format(df.select("Opening").distinct().count()))

# Five most common opening move pairs. DataFrame.show() prints the table itself
# and returns None, so it must not be wrapped in print() (that emitted a stray "None").
df.groupBy('first_two').count().sort(F.col("count").desc()).show(5)

# Total number of training rows, for scale against the counts above.
print("Total Rows: {}".format(df.count()))

First Turns: 289154
ECO Types: 491
Opening Variants: 2832
+------------------+------+
|         first_two| count|
+------------------+------+
|  [e4 e5, Nf3 Nc6]|199152|
|   [e4 e5, Nf3 d6]| 77714|
|[e4 d5, exd5 Qxd5]| 51726|
|  [e4 c5, Nf3 Nc6]| 46965|
|    [e4 e6, d4 d5]| 43334|
+------------------+------+
only showing top 5 rows

None 2308233


We will now begin to build a model, keying in on the opening move and the white_result columns.
Note that both of these are categorical values, so we will need to encode them using the StringIndexer for pyspark to do model evaluations.

In [7]:
# Index each categorical string column into a numeric column.
# ECO code -> "opening_ohe" (this index is the input to the one-hot encoder).
opening_vectorizor = StringIndexer(
    inputCol="ECO",
    outputCol="opening_ohe",
)
# Event / game type -> "event_vector".
gametype_vectorizer = StringIndexer(
    inputCol="event",
    outputCol="event_vector",
)
# Target column white_result -> "white_result_vector" (used as the model label).
result_vectorizor = StringIndexer(
    inputCol="white_result",
    outputCol="white_result_vector",
)

Next we perform One-Hot Encoding on our Opening type (or ECO) and do our comparison.  This will create a new column that we will use for our random forest model.

In [8]:
# One-hot encode the indexed ECO code into the "ECO_Type" vector column.
move_encoder = OneHotEncoder(
    inputCols=["opening_ohe"],
    outputCols=["ECO_Type"],
)

Now that we have the OHE of our ECO, we can combine it with other features to build out our predictors for random forest.

In [9]:
# Combine the one-hot ECO vector, the Elo difference and the indexed event type
# into the single 'features' vector consumed by the classifier.
features_assembler = VectorAssembler(
    inputCols=['ECO_Type', "EloDiff", "event_vector"],
    outputCol='features',
)

Now that our data is model-ready, we will do a split, fit, transform, and evaluation to determine the performance of our model.
Note that we have chosen the default tunings, but in the future we will likely apply a cross-validation technique in pyspark to select the correct hyperparameters.

In [10]:
# Random forest over the assembled features, predicting the indexed white_result.
# The seed is fixed so that repeated runs build the same forest.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='white_result_vector',
    numTrees=20,
    maxDepth=5,
    seed=1337,
    leafCol="leafId",
)

In [11]:
# Stage 1: categorical encoding (string indexing, then one-hot encoding of ECO).
feature_pipeline = Pipeline(
    stages=[
        opening_vectorizor,
        gametype_vectorizer,
        result_vectorizor,
        move_encoder,
    ]
)

# Stage 2: feature assembly followed by the random forest classifier.
model_pipeline = Pipeline(
    stages=[
        features_assembler,
        rf,
    ]
)

In [12]:
# Create our transforms from our full dataset (considers all cases)
# Fit the encoders on the full dataset so every category value present in either
# split is known to the fitted transformers.
feature_model_df = feature_pipeline.fit(df_all)

# Apply the fitted encoders to both splits.
# NOTE: df and testing_df are rebound to their encoded versions here, so this cell
# is not meant to be re-run without first re-running the load cell.
df, testing_df = (
    feature_model_df.transform(df),
    feature_model_df.transform(testing_df),
)

In [13]:
# Fit the assembler + random forest pipeline on the encoded training split.
model = model_pipeline.fit(df)

In [14]:
# Score the encoded testing split with the fitted model pipeline.
result = model.transform(testing_df)

In [15]:
# Despite the variable names, the metric computed here is the area under the
# ROC curve (metricName='areaUnderROC'), not classification accuracy.
accuracy_evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol='white_result_vector',
)
model_accuracy = accuracy_evaluator.evaluate(result)

In [16]:
# model_accuracy holds the evaluator's areaUnderROC value, so label it as AUC
# rather than accuracy to avoid misreading the score.
print("Model AUC (area under ROC): {}".format(model_accuracy))

Model Accuracy: 0.6571804526107925


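Here we see that our model seems to perform reasonably well, with roughly 50% of games resulting in a match to white win. Note that the value reported above is the area under the ROC curve (the evaluator's `areaUnderROC` metric), not classification accuracy.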

Let's review our classifications and confusion matrix next to determine the overall performance.

In [18]:
# Keep only the predicted class and the true label, in that order.
designer_matrix = result.select('prediction', 'white_result_vector')

# MulticlassMetrics works on an RDD of tuples, so convert each Row to a plain tuple.
metrics_rdd = MulticlassMetrics(
    designer_matrix.rdd.map(tuple)
)

# Print the confusion matrix as a dense NumPy array.
print(
    metrics_rdd.confusionMatrix().toArray()
)

[[381983. 199779.]
 [227090. 347270.]]


Reviewing this model, we see that this model is not good.
Of the three classes, we found that the precision for the white loss performs okay, however the white win and tie both evaluate to 0 for precision.
This means that our model is overfitting and failed to correctly identify either ties or losses.

Further tuning will be required to better distribute our data (either adjusting the threshold, or tuning the tree).