# Exploratory Analytics of Dataset

## DS 5110
* Fall 2021
* October 3rd
* Group 10
  * Antone Edelman
  * Xin Huang
  * Robert Knuuti

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

import pandas as pd
import numpy as np

# Start (or reuse) a Spark session running in local mode for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("fa21-ds5110-group10")
    .getOrCreate()
)

# Load the processed blitz/classical chess games.
df = spark.read.parquet("../../data/processed/chess_games_blitz_classic.parquet")

There is some additional cleanup to do beyond filtering the games down to just Classical and Blitz.
The transforms below clean the data and add new features.

In [3]:
# Show the schema of the data as loaded, before any transformation.
df.printSchema()

# Abandoned games are the ones whose result contains "*".
# Report how many there are, then remove all of them.
is_abandoned = df.result.contains("*")
print("Abandoned Games to drop: {}".format(df.filter(is_abandoned).count()))
df_filtered = df.filter(~is_abandoned)

from fractions import Fraction as frac

# Convert result column into separate white/black win columns
def _result_score_udf(side_index):
    """Build a UDF returning one side's score from a result string.

    The result string is split on '-', the part at ``side_index`` is parsed
    as a fraction and converted to a float (e.g. "1-0" gives 1.0 for index 0).

    Args:
        side_index: 0 for the part before the dash, 1 for the part after it.

    Returns:
        A Spark UDF producing a DoubleType value.
    """
    return F.udf(
        lambda result: float(frac(result.split('-')[side_index])),
        T.DoubleType(),
    )


white_win_udf = _result_score_udf(0)
black_win_udf = _result_score_udf(1)

# We break apart the AN notation for chess moves into arrays
def movetype(x):
    """Split a game's AN move text into a list with one entry per numbered move.

    The text is split on move-number markers such as ``"1. "`` or ``"12. "``.
    Anything before the first marker is discarded, and each remaining chunk is
    stripped of surrounding whitespace.

    Args:
        x: The move text, e.g. ``"1. e4 e5 2. Nf3 Nc6"``, or ``None``.

    Returns:
        A list of strings, e.g. ``["e4 e5", "Nf3 Nc6"]``. ``None`` input, or
        text that contains no move-number marker, yields an empty list.
    """
    # Local import keeps this function self-contained.
    import re

    if x is None:
        # The AN column is nullable; treat a missing value as "no moves"
        # instead of letting re.split raise a TypeError.
        return []

    # Raw string: '\d' inside a plain string literal is an invalid escape sequence.
    chunks = re.split(r'\d+\. ', x)[1:]
    return [chunk.strip() for chunk in chunks]

# We bin the number of moves to rate the complexity of a match
def bin_moves(x, upper_bounds=(1, 10, 20, 30, 40, 50)):
    """Map a move count onto an ordinal complexity bin.

    With the default bounds the mapping is:
    ``<= 1 -> 1``, ``2-10 -> 2``, ``11-20 -> 3``, ``21-30 -> 4``,
    ``31-40 -> 5``, ``41-50 -> 6`` and ``> 50 -> 7``.

    Counts below 1 (for example 0) land in the lowest bin, so the bins are
    monotonic in the number of moves.

    Args:
        x: Number of moves in the game.
        upper_bounds: Ascending, inclusive upper bounds of the bins. The
            default reproduces the notebook's original cut points.

    Returns:
        The 1-based bin index; ``len(upper_bounds) + 1`` when ``x`` exceeds
        every bound.
    """
    for bin_id, bound in enumerate(upper_bounds, start=1):
        if x <= bound:
            return bin_id
    return len(upper_bounds) + 1

# UDFs used to derive the new feature columns.
# Splits the AN text into a list of move strings.
udf_movetype = F.udf(lambda an: movetype(an), T.ArrayType(T.StringType()))
# Calculates the total number of moves in a game.
count_udf = F.udf(lambda moves: len(moves), T.IntegerType())
# Bins the move count into a complexity level.
game_complexity_udf = F.udf(lambda n_moves: bin_moves(n_moves), T.IntegerType())
# Keeps only the first ten entries of the move list.
udf_splic_array = F.udf(lambda moves: moves[:10], T.ArrayType(T.StringType()))

# Add the derived columns one after another; later columns read earlier ones.
df_filtered = (
    df_filtered
    .withColumn('moves', udf_movetype(F.col('AN')))
    .withColumn("white_games_won", white_win_udf(F.col("result")))
    .withColumn("black_games_won", black_win_udf(F.col("result")))
    .withColumn("tie", F.col("white_games_won") == F.col("black_games_won"))
    .withColumn("result_moves", count_udf(F.col("moves")))
    .withColumn("game_complexity", game_complexity_udf(F.col("result_moves")))
    .withColumn("first_ten", udf_splic_array(F.col("moves")))
)

print("Refined schema\n-------------------")
df_filtered.printSchema()

root
 |-- event: string (nullable = true)
 |-- white: string (nullable = true)
 |-- black: string (nullable = true)
 |-- result: string (nullable = true)
 |-- UTCDate: date (nullable = true)
 |-- UTCTime: string (nullable = true)
 |-- WhiteElo: integer (nullable = true)
 |-- BlackElo: integer (nullable = true)
 |-- WhiteRatingDiff: double (nullable = true)
 |-- BlackRatingDiff: double (nullable = true)
 |-- ECO: string (nullable = true)
 |-- Opening: string (nullable = true)
 |-- TimeControl: string (nullable = true)
 |-- Termination: string (nullable = true)
 |-- AN: string (nullable = true)

Abandoned Games to drop: 739
Refined schema
-------------------
root
 |-- event: string (nullable = true)
 |-- white: string (nullable = true)
 |-- black: string (nullable = true)
 |-- result: string (nullable = true)
 |-- UTCDate: date (nullable = true)
 |-- UTCTime: string (nullable = true)
 |-- WhiteElo: integer (nullable = true)
 |-- BlackElo: integer (nullable = true)
 |-- WhiteRatingDiff: d

In [4]:
def label_wins(x):
    """Translate a numeric game score into a text label.

    Args:
        x: The score to label.

    Returns:
        'loss' when ``x`` equals 0.0, 'tie' when it equals 0.5, and 'win'
        for every other value.
    """
    # Checked in order; anything that matches neither score is labelled a win.
    for score, label in ((0.0, 'loss'), (0.5, 'tie')):
        if x == score:
            return label
    return 'win'
    
# Wrap label_wins as a Spark UDF that returns a string.
udf_label_wins = F.udf(lambda score: label_wins(score), T.StringType())

# Label each game using the value in white_games_won.
df_filtered = df_filtered.withColumn(
    "white_result",
    udf_label_wins(F.col("white_games_won")),
)

In [7]:
# Columns to preview.
vars_to_keep = [
    "event",
    "white_result",
    "WhiteElo",
    "BlackElo",
    "first_ten",
    "game_complexity",
]

# Show the first five rows of just those columns.
df_filtered.select(*vars_to_keep).show(n=5)

+---------+------------+--------+--------+--------------------+--------------------+---------------+
|    event|white_result|WhiteElo|BlackElo|               moves|           first_ten|game_complexity|
+---------+------------+--------+--------+--------------------+--------------------+---------------+
|    Blitz|         win|    2068|    1846|[c4 c5, Nc3 Nf6, ...|[c4 c5, Nc3 Nf6, ...|              6|
|    Blitz|         win|    1708|    1399|[d4 b6, c4 Bb7, N...|[d4 b6, c4 Bb7, N...|              5|
|Classical|        loss|    1542|    1790|[e4 Nc6, d4 d5, e...|[e4 Nc6, d4 d5, e...|              7|
|    Blitz|         win|    1467|    1679|[e4 e5, d4 exd4, ...|[e4 e5, d4 exd4, ...|              4|
|    Blitz|        loss|    1249|    1174|[e4 e5, d4 exd4, ...|[e4 e5, d4 exd4, ...|              4|
+---------+------------+--------+--------+--------------------+--------------------+---------------+
only showing top 5 rows



In [8]:
# Select variables
vars_to_keep = [
    "event",
    "white_result",
    "WhiteElo",
    "BlackElo",
    "first_ten",
    "game_complexity",
]

# Subset the dataframe on these predictors
df_final = df_filtered.select(*vars_to_keep)

In [9]:
# Write the selected columns to parquet, overwriting any existing output at this path.
(
    df_final.write
    .mode("overwrite")
    .parquet("../../data/processed/chess_games_moves_model.parquet")
)