This notebook looks at the individual features and how effective each one is at predicting the game result. We are using Gradient Boosted Trees as the model of choice.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import pandas as pd
import numpy as np

from fractions import Fraction as frac

In [2]:
# Create (or reuse) a local Spark session on all available cores with a
# 36g driver memory setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("fa21-ds5110-group10-rk")
    .config("spark.driver.memory", "36g")
    .getOrCreate()
)

In [3]:
# Display the active SparkContext as the cell output.
spark.sparkContext

In [4]:
# Cancel every job that is still scheduled or running on this SparkContext
# (e.g. leftovers from an interrupted earlier run of the notebook).
spark.sparkContext.cancelAllJobs()

First we load the cached datasets produced by our prior feature engineering.

In [5]:
# Directory holding the parquet outputs of the feature-engineering step;
# defined once so the location can be changed in a single place.
PROCESSED_DIR = "../../../data/processed"

# Full modelling dataset and the separately stored training / testing sets.
df = spark.read.parquet(f"{PROCESSED_DIR}/chess_games_moves_model-limited.parquet")
trainData = spark.read.parquet(f"{PROCESSED_DIR}/training-limited.parquet")
testData = spark.read.parquet(f"{PROCESSED_DIR}/testing-limited.parquet")

In [6]:
# Mark both DataFrames for caching because later cells reuse them.
df.cache()
# Last expression of the cell, so the returned DataFrame's repr is displayed.
trainData.cache()

DataFrame[event: string, white_result: string, first_two: array<string>, ECO: string, EloDiff: int, Opening: string, game_complexity: int, opening_class: string]

In [7]:
# Preview the first two rows to check the available columns.
df.show(2)

+-----+------------+----------------+---+-------+--------------------+---------------+-----------------+
|event|white_result|       first_two|ECO|EloDiff|             Opening|game_complexity|    opening_class|
+-----+------------+----------------+---+-------+--------------------+---------------+-----------------+
|Blitz|        loss|[e4 e5, d4 exd4]|C22|     75|Center Game: Paul...|              4|       Open Games|
|Blitz|        loss|  [d4 d5, c4 c6]|D45|     41|Semi-Slav Defense...|              5|Semi-Closed Games|
+-----+------------+----------------+---+-------+--------------------+---------------+-----------------+
only showing top 2 rows



Next we review the resulting data points of interest.
We notice that ECO and the first two sets of moves are distinct from one another, and each may influence the overall model's prediction.

In [8]:
# Cardinality of the two opening-related columns.
print("First Turns: {}".format(df.select("first_two").distinct().count()))
print("ECO Types: {}".format(df.select("ECO").distinct().count()))

# show() prints the table itself and returns None, so it must not be wrapped
# in print(); doing so emitted a stray "None" in front of the row count.
df.groupBy('first_two').count().sort(F.col("count").desc()).show(5)
print("Total rows: {}".format(df.count()))

First Turns: 250695
ECO Types: 491
+------------------+------+
|         first_two| count|
+------------------+------+
|  [e4 e5, Nf3 Nc6]|180608|
|   [e4 e5, Nf3 d6]| 68187|
|  [e4 c5, Nf3 Nc6]| 44943|
|[e4 d5, exd5 Qxd5]| 44702|
|    [e4 e6, d4 d5]| 41142|
+------------------+------+
only showing top 5 rows

None 2045840


We will now begin to build a model, keying in on the opening move and the white_result columns.
Note that both of these are categorical values, so we will need to encode them using the StringIndexer for pyspark to do model evaluations.

In [9]:
# features
opening_vectorizor = StringIndexer(inputCol="ECO", outputCol="opening_ohe")
gametype_vectorizer = StringIndexer(inputCol="event", outputCol="event_vector")
class_vectorizer = StringIndexer(inputCol="opening_class", outputCol="opening_class_vector")
# target
result_vectorizor = StringIndexer(inputCol="white_result", outputCol="white_result_vector")

Next we perform One-Hot Encoding on our Opening type (or ECO) and do our comparison.  This will create a new column that we will use for our gradient boosted trees model.

In [10]:
# One-hot encode the indexed ECO code into the "ECO_Type" vector column.
move_encoder = OneHotEncoder(
    inputCols=["opening_ohe"],
    outputCols=["ECO_Type"],
)

In [11]:
# One-hot encode the indexed opening class into the "Class_Type" vector column.
class_encoder = OneHotEncoder(
    inputCols=["opening_class_vector"],
    outputCols=["Class_Type"],
)

Now that we have the OHE of our ECO, we can combine it with other features to build out our predictors for the gradient boosted trees model.

In [12]:
# Combine the selected predictor columns into the single "features" vector.
features_assembler = VectorAssembler(
    inputCols=["ECO_Type", "event_vector", "EloDiff"],
    outputCol="features",
)

Now that our data is model-ready, we will do a split, fit, transform, and evaluation to determine the performance of our model.
Note that we have not tuned the hyperparameters yet (we fix maxDepth=5 and maxIter=5), but in the future we will likely apply a cross-validation technique in pyspark to select the correct hyperparameters.

In [13]:
# Gradient boosted trees classifier trained against the indexed game result;
# the seed is fixed so repeated runs are reproducible.
gbt = GBTClassifier(
    labelCol="white_result_vector",
    maxDepth=5,
    maxIter=5,
    seed=1337,
    leafCol="leafId",
)

In [14]:
# --- Part 1: multi-feature model on the stored train / test sets -------------
# Feature-preparation stages: index the categorical columns, one-hot encode the
# opening columns, then assemble the "features" vector.
data_pipeline = Pipeline(stages=[opening_vectorizor,
                            gametype_vectorizer,
                            class_vectorizer,
                            result_vectorizor,
                            move_encoder,
                            class_encoder,
                            features_assembler])
ml_pipeline = Pipeline(stages=[gbt])

# Fit the preparation stages on the full dataset `df`, then apply the fitted
# stages to the training and testing sets.
data_model = data_pipeline.fit(df)
data_model_train = data_model.transform(trainData)
data_model_test = data_model.transform(testData)

ml_model = ml_pipeline.fit(data_model_train)

# --- Part 2: cross-validated single-feature model ----------------------------
# NOTE: from here on `gbt`, `features_assembler`, `data_pipeline`,
# `ml_pipeline` and `data_model` are rebound to the single-feature setup.
var = "EloDiff"  # the one predictor evaluated here (was an undefined name)
label_col = 'white_result_vector'

gbt = GBTClassifier(maxDepth=5, maxIter=10, labelCol=label_col, seed=0, leafCol="leafId")

# Empty grid: cross-validation scores the classifier exactly as configured.
paramGrid = ParamGridBuilder().build()

features_assembler = VectorAssembler(inputCols=[var], outputCol='features')
data_pipeline = Pipeline(stages=[opening_vectorizor,
                            gametype_vectorizer,
                            result_vectorizor,
                            move_encoder,
                            features_assembler])
ml_pipeline = Pipeline(stages=[gbt])

data_model = data_pipeline.fit(df)

data_transformed = data_model.transform(df)

auc_evaluator = BinaryClassificationEvaluator(labelCol=label_col)
trainval = CrossValidator(estimator=ml_pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=auc_evaluator,
                          seed=0,
                          numFolds=10)
cvModel = trainval.setParallelism(4).fit(data_transformed)

# The fitted GBT stage exposes no training `summary`, so accuracy and AUC are
# computed explicitly from the best model's predictions on the fitted data.
# (The cross-validated mean AUC is available as cvModel.avgMetrics[0].)
predictions = cvModel.transform(data_transformed)
accuracy = MulticlassClassificationEvaluator(labelCol=label_col,
                                             metricName="accuracy").evaluate(predictions)
area_under_roc = auc_evaluator.evaluate(predictions)
print((var, accuracy, area_under_roc))

In [15]:
# Names of the selected predictor columns.
vars_selected = ["ECO_Type", "event_vector", "EloDiff"]

def compute_univariate_aucs(df, target, varName, folds, max_iterations, seed, parallelism=4):
    """Cross-validate a GBT classifier that uses a single feature column.

    The module-level indexer / encoder stages (``opening_vectorizor``,
    ``gametype_vectorizer``, ``class_vectorizer``, ``result_vectorizor``,
    ``move_encoder``, ``class_encoder``) are fitted on ``df`` to derive the
    candidate feature columns; only ``varName`` is assembled into ``features``.

    Args:
        df: Spark DataFrame containing the raw columns those stages read.
        target: Name of the label column produced by the preparation stages.
        varName: Name of the single column to use as the model's feature.
        folds: Number of cross-validation folds.
        max_iterations: ``maxIter`` passed to the GBT classifier.
        seed: Seed used for both the classifier and the fold split.
        parallelism: Number of models CrossValidator fits in parallel
            (defaults to 4, the previously hard-coded value).

    Returns:
        A list of ``(average metric, param map)`` tuples, one per entry of the
        parameter grid. The metric is the BinaryClassificationEvaluator
        default (area under ROC) averaged over the folds.
    """
    gbt = GBTClassifier(maxDepth=5, maxIter=max_iterations, labelCol=target, seed=seed, leafCol="leafId")

    # Empty grid -> one param map, i.e. the classifier exactly as configured.
    # Add grid entries here (e.g. subsamplingRate, minInstancesPerNode,
    # maxDepth) to turn this into a hyperparameter search.
    paramGrid = ParamGridBuilder().build()

    features_assembler = VectorAssembler(inputCols=[varName], outputCol='features')
    data_pipeline = Pipeline(stages=[opening_vectorizor,
                            gametype_vectorizer,
                            class_vectorizer,
                            result_vectorizor,
                            move_encoder,
                            class_encoder,
                            features_assembler])
    ml_pipeline = Pipeline(stages=[gbt])

    # Feature preparation is done once, outside the cross-validation loop.
    data_model = data_pipeline.fit(df)
    data_transformed = data_model.transform(df)

    trainval = CrossValidator(estimator=ml_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(labelCol=target),
                              seed=seed,
                              numFolds=folds)
    # Progress marker so long runs show which feature is being evaluated.
    print(varName)
    cvModel = trainval.setParallelism(parallelism).fit(data_transformed)

    return list(zip(cvModel.avgMetrics, cvModel.getEstimatorParamMaps()))

With all data (the nested value under each feature is the 5-fold cross-validated AUC reported by the cells below):
* 'ECO_Type' = 0.5346240980278584
  * 0.5161513310165593
* "event_vector" = 0.5013733029902808
  * 0.5010066514700525
* "EloDiff" = 0.6877069013414416
  * 0.5705630031932493
* "Class_Type" = 0.514669790072792
  * 0.50852791306638

In [16]:
# 5-fold CV, 10 boosting iterations, seed 0, using only the one-hot ECO code.
results1 = compute_univariate_aucs(
    df,
    target="white_result_vector",
    varName="ECO_Type",
    folds=5,
    max_iterations=10,
    seed=0,
)
results1

ECO_Type


[(0.5161513310165593, {})]

In [16]:
# 5-fold CV, 10 boosting iterations, seed 0, using only the indexed event type.
results2 = compute_univariate_aucs(
    df,
    target="white_result_vector",
    varName="event_vector",
    folds=5,
    max_iterations=10,
    seed=0,
)
results2

event_vector


[(0.5010066514700525, {})]

In [17]:
# 5-fold CV, 10 boosting iterations, seed 0, using only the Elo difference.
results3 = compute_univariate_aucs(
    df,
    target="white_result_vector",
    varName="EloDiff",
    folds=5,
    max_iterations=10,
    seed=0,
)
results3

EloDiff


[(0.5705630031932493, {})]

In [18]:
# 5-fold CV, 10 boosting iterations, seed 0, using only the one-hot opening class.
results4 = compute_univariate_aucs(
    df,
    target="white_result_vector",
    varName="Class_Type",
    folds=5,
    max_iterations=10,
    seed=0,
)
results4

Class_Type


[(0.50852791306638, {})]