## Import

In [31]:
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, udf,isnan, count, sum, avg, format_number

## Start Connection

In [3]:
spark = SparkSession.builder\
.master("local")\
.appName("Word Count")\
.getOrCreate()
#.config("spark.some.config.option", "some-value")\

## Basic Info

In [4]:
df1 = spark.read.options(header='true', inferSchema='true').csv("../data_source/chess_games.csv")

In [5]:
df1.show(5)
df1.printSchema()

+------------------+---------------+----------+------+----------+-------------------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+--------------------+
|             Event|          White|     Black|Result|   UTCDate|            UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|             Opening|TimeControl| Termination|                  AN|
+------------------+---------------+----------+------+----------+-------------------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+--------------------+
|        Classical |        eisaaaa|  HAMID449|   1-0|2016.06.30|2023-11-11 22:00:01|    1901|    1896|           11.0|          -11.0|D10|        Slav Defense|      300+5|Time forfeit|1. d4 d5 2. c4 c6...|
|            Blitz |         go4jas|Sergei1973|   0-1|2016.06.30|2023-11-11 22:00:01|    1641|    1627|          -11.0|           12.0|C20|King's Pawn Openi...|      300+0|

### Check for row duplication

In [6]:
df1\
.groupBy(df1.columns)\
.count().\
where(col('count') > 1)\
.select(sum('count'))\
.withColumnRenamed("sum(count)", "dupliacte")\
.show()

+---------+
|dupliacte|
+---------+
|        2|
+---------+



### Check for NULL, NAN

In [7]:
columns_to_check = [c for c in df1.columns if c != 'UTCTime']

df1.select([count(when(col(c).isNull(), c)).alias(c) for c in columns_to_check]).show()

+-----+-----+-----+------+-------+--------+--------+---------------+---------------+---+-------+-----------+-----------+---+
|Event|White|Black|Result|UTCDate|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|Opening|TimeControl|Termination| AN|
+-----+-----+-----+------+-------+--------+--------+---------------+---------------+---+-------+-----------+-----------+---+
|    0|    0|    0|     0|      0|       0|       0|           4668|           4668|  0|      0|          0|          0|  0|
+-----+-----+-----+------+-------+--------+--------+---------------+---------------+---+-------+-----------+-----------+---+



In [8]:
columns_to_check = [c for c in df1.columns if c != 'UTCTime']

df1.select([count(when(isnan(c), c)).alias(c) for c in columns_to_check]).show()

+-----+-----+-----+------+-------+--------+--------+---------------+---------------+---+-------+-----------+-----------+---+
|Event|White|Black|Result|UTCDate|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|Opening|TimeControl|Termination| AN|
+-----+-----+-----+------+-------+--------+--------+---------------+---------------+---+-------+-----------+-----------+---+
|    0|    0|    0|     0|      0|       0|       0|              0|              0|  0|      0|          0|          0|  0|
+-----+-----+-----+------+-------+--------+--------+---------------+---------------+---+-------+-----------+-----------+---+



In [9]:
df1.filter(col("UTCTime").isNull()).count()

0

## Preprocessing

### Drop duplicate rows

In [10]:
df2 = df1.dropDuplicates().drop("AN")

## Format and filter an Event Column 

In [11]:
df3 = df2.filter(
    (df2["Event"].contains("Blitz")) |
    (df2["Event"].contains("Classic")) |
    (df2["Event"].contains("Bullet"))
)

In [12]:
df4 = df3.withColumn("Event", when(df3["Event"].contains("Blitz"), "Blitz").when(df3["Event"].contains("Classic"), "Classic").when(df3["Event"].contains("Bullet"), "Bullet").otherwise(df3["Event"]))

In [13]:
df5 = df4.filter(
    (df4["Result"] != '*') |
    (df4["WhiteRatingDiff"].isNotNull()) |
    (df4["WhiteRatingDiff"].isNotNull())
)

In [32]:
df6 = df5.withColumn("Result", when((col("WhiteRatingDiff") < 0) & (col("BlackRatingDiff") > 0), "0-1")
    .when((col("WhiteRatingDiff") > 0) & (col("BlackRatingDiff") < 0), "1-0")
    .when((col("WhiteRatingDiff") == 0) & (col("BlackRatingDiff") == 0), "1/2-1/2")
    .otherwise(col("Result"))
)

In [27]:
mean_white_rating_diff_white_win = df6.filter(col("Result") == "1-0" ).agg(avg("WhiteRatingDiff")).collect()[0][0]
mean_black_rating_diff_white_win = df6.filter(col("Result") == "1-0" ).agg(avg("BlackRatingDiff")).collect()[0][0]
mean_white_rating_diff_black_win = df6.filter(col("Result") == "0-1" ).agg(avg("WhiteRatingDiff")).collect()[0][0]
mean_black_rating_diff_black_win = df6.filter(col("Result") == "0-1" ).agg(avg("BlackRatingDiff")).collect()[0][0]
mean_white_rating_diff_draw = df6.filter(col("Result") == "1/2-1/2" ).agg(avg("WhiteRatingDiff")).collect()[0][0]
mean_black_rating_diff_draw = df6.filter(col("Result") == "1/2-1/2" ).agg(avg("BlackRatingDiff")).collect()[0][0]

In [36]:
math.round(mean_white_rating_diff_white_win)

NameError: name 'math' is not defined

In [33]:
df7 = df6.withColumn("WhiteRatingDiff",
    when((col("Result") == "1-0") & col("WhiteRatingDiff").isNull(), mean_white_rating_diff_white_win)
    .when((col("Result") == "0-1") & col("WhiteRatingDiff").isNull(), mean_white_rating_diff_black_win)
    .when((col("Result") == "1/2-1/2") & col("WhiteRatingDiff").isNull(), mean_white_rating_diff_draw)
    .otherwise(col("WhiteRatingDiff"))
)
df7 = df6.withColumn("BlackRatingDiff",
    when((col("Result") == "1-0") & col("BlackRatingDiff").isNull(), mean_black_rating_diff_white_win)
    .when((col("Result") == "0-1") & col("BlackRatingDiff").isNull(), mean_black_rating_diff_black_win)
    .when((col("Result") == "1/2-1/2") & col("BlackRatingDiff").isNull(), mean_black_rating_diff_draw)
    .otherwise(col("BlackRatingDiff"))
)

In [None]:
df8 = df7.

In [20]:
df7.show(5)

+-------+---------+------------+------+----------+-------------------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+
|  Event|    White|       Black|Result|   UTCDate|            UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|BlackRatingDiff|ECO|             Opening|TimeControl| Termination|
+-------+---------+------------+------+----------+-------------------+--------+--------+---------------+---------------+---+--------------------+-----------+------------+
| Bullet|chupakabr|      Skiter|   0-1|2016.06.30|2023-11-11 22:00:36|    2070|    2021|          -14.0|           15.0|C46|Three Knights Ope...|       60+0|Time forfeit|
|  Blitz|Pirkoress|Profissional|   0-1|2016.06.30|2023-11-11 22:01:44|    1783|    1761|          -11.0|           11.0|D02|Queen's Pawn Game...|      180+0|      Normal|
|Classic| walshark| xadrezpeque|   1-0|2016.06.30|2023-11-11 22:04:42|    1486|    1376|            8.0|           -7.0|C20|King's Pawn Game:...|

In [None]:
df2.groupBy('Event').count().show()

In [None]:
df2.groupBy('Result').count().show()

In [None]:
df2.groupBy('Termination').count().show()

In [None]:
df1.where("Termination = 'Rules infraction'").show()

In [None]:
df1.where("Termination = 'Abandoned'").show()

In [None]:
df2.where("Result = '*'").show()

In [None]:
df2.where("Result = '*'").groupBy('Termination').count().show()

In [None]:
df2.show()

In [None]:
columns_to_check = [c for c in df1.columns if c != 'UTCTime']

df2.select([count(when(col(c).isNull(), c)).alias(c) for c in columns_to_check]).show()

In [None]:
df5.where("Result = '*'").show()

In [24]:
df7.filter(col("WhiteRatingDiff").isNull()).show()

+-------+-------------------+-----------------+------+----------+-------------------+--------+--------+---------------+-------------------+---+--------------------+-----------+------------+
|  Event|              White|            Black|Result|   UTCDate|            UTCTime|WhiteElo|BlackElo|WhiteRatingDiff|    BlackRatingDiff|ECO|             Opening|TimeControl| Termination|
+-------+-------------------+-----------------+------+----------+-------------------+--------+--------+---------------+-------------------+---+--------------------+-----------+------------+
|Classic|           IGGY_KHV|       maratmurka|   1-0|2016.07.01|2023-11-11 13:27:25|    1968|    2057|           NULL|-11.758047954791898|A27|English Opening: ...|      600+0|Time forfeit|
|  Blitz|         adelrafaat|      RONINPERITO|   0-1|2016.07.01|2023-11-11 13:23:36|    2052|    1500|           NULL| 11.827098585395014|A03|Bird Opening: Dut...|      300+0|      Normal|
|Classic|           sogooxon|          Ammreda|   

In [None]:
df6.filter(col("WhiteRatingDiff").isNull() & col("BlackRatingDiff").isNull() & (col("Result") != '*')).show()

In [None]:
df5.filter(col("WhiteRatingDiff").isNotNull() & col("BlackRatingDiff").isNotNull() & (col("Result") == '*')).show()

In [None]:
columns_to_check = [c for c in df7.columns if c != 'UTCTime']

df7.select([count(when(col(c).isNull(), c)).alias(c) for c in columns_to_check]).show()