## Import

In [23]:
import sys
import math
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, when, udf,isnan, count, sum, avg, format_number, round, to_timestamp, to_date, concat, lit, abs, split, monotonically_increasing_id
from pyspark.ml.stat import Correlation

## Start Connection

In [2]:
# Create the notebook's SparkSession, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    # .config("spark.some.config.option", "some-value")  # optional, currently unused
    .getOrCreate()
)

## Basic Info

In [3]:
# Load the raw games CSV: the first line is the header and column types are inferred.
df1 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("../data_source/chess_games.csv")
)

In [4]:
# Print the inferred column names and types of the raw frame.
# (Use df1.show() instead to preview the first rows.)
df1.printSchema()

root
 |-- Event: string (nullable = true)
 |-- White: string (nullable = true)
 |-- Black: string (nullable = true)
 |-- Result: string (nullable = true)
 |-- UTCDate: string (nullable = true)
 |-- UTCTime: timestamp (nullable = true)
 |-- WhiteElo: integer (nullable = true)
 |-- BlackElo: integer (nullable = true)
 |-- WhiteRatingDiff: double (nullable = true)
 |-- BlackRatingDiff: double (nullable = true)
 |-- ECO: string (nullable = true)
 |-- Opening: string (nullable = true)
 |-- TimeControl: string (nullable = true)
 |-- Termination: string (nullable = true)
 |-- AN: string (nullable = true)



### Check for row duplication

In [None]:
# Count the rows that are part of a fully duplicated group: every column is
# used as the grouping key, groups seen more than once are kept, and their
# sizes are added up. The total therefore includes the first occurrence of
# each duplicated row, not only the surplus copies.
(
    df1
    .groupBy(df1.columns)
    .count()
    .where(col("count") > 1)
    # `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
    .select(sum("count").alias("duplicate"))
    .show()
)

### Check for NULL and NaN values

In [None]:
# Every column except UTCTime takes part in the NULL check.
columns_to_check = [name for name in df1.columns if name != 'UTCTime']

# One output column per input column, holding the number of NULL values in it:
# `when` yields a non-null marker only for NULL cells and `count` counts the markers.
df1.select(
    *(count(when(col(name).isNull(), name)).alias(name) for name in columns_to_check)
).show()

In [None]:
# Every column except UTCTime takes part in the NaN check.
columns_to_check = [name for name in df1.columns if name != 'UTCTime']

# One output column per input column, holding the number of NaN values in it.
df1.select(
    *(count(when(isnan(col(name)), name)).alias(name) for name in columns_to_check)
).show()

In [None]:
# Number of rows whose UTCTime is NULL (checked separately from the other columns).
df1.where(col("UTCTime").isNull()).count()

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so none
# of it executes. The snippet would add two columns to df1:
#   diff    = |WhiteElo - BlackElo|
#   penalty = |WhiteRatingDiff| + |BlackRatingDiff|
# NOTE(review): the snippet builds df1 but then shows df2 - confirm which frame
# was meant before re-enabling.
'''
df1 = df1.withColumn("diff", abs(col("WhiteElo") - col("BlackElo")))
df1 = df1.withColumn("penalty", abs(col("WhiteRatingDiff")) + abs(col("BlackRatingDiff")))
df2.show()
'''

In [None]:
#df1.corr("penalty", "diff")

## Preprocessing

### Reconcile Result with the rating changes, then drop duplicate rows and unused columns

In [7]:
# Re-derive Result from the sign of the two rating changes where they are conclusive:
#   White lost points and Black gained -> "0-1"
#   White gained points and Black lost -> "1-0"
#   neither rating moved               -> "1/2-1/2"
# In every other case (including missing rating changes) the recorded Result is kept.
df2 = df1.withColumn(
    "Result",
    when((col("BlackRatingDiff") > 0) & (col("WhiteRatingDiff") < 0), "0-1")
    .when((col("BlackRatingDiff") < 0) & (col("WhiteRatingDiff") > 0), "1-0")
    .when((col("BlackRatingDiff") == 0) & (col("WhiteRatingDiff") == 0), "1/2-1/2")
    .otherwise(col("Result")),
)

In [8]:
# Remove fully identical rows first, then discard the columns that later steps do not use.
df3 = (
    df2
    .dropDuplicates()
    .drop("TimeControl", "AN", "WhiteRatingDiff", "BlackRatingDiff", "UTCTime")
)

## Filter and format the Event column

In [9]:
# Keep only games whose Event label mentions Blitz, Classic or Bullet.
# The predicate uses col("Event"), which resolves against the frame being
# filtered (df3), rather than borrowing column objects from another DataFrame.
df4 = df3.filter(
    col("Event").contains("Blitz")
    | col("Event").contains("Classic")
    | col("Event").contains("Bullet")
)

In [10]:
# Collapse each Event label to the keyword it contains. Branches are tried in
# order (Blitz, Classic, Bullet); a label matching none of them is left as-is.
# col("Event") resolves against df4, the frame being transformed, rather than
# borrowing column objects from another DataFrame.
df5 = df4.withColumn(
    "Event",
    when(col("Event").contains("Blitz"), "Blitz")
    .when(col("Event").contains("Classic"), "Classic")
    .when(col("Event").contains("Bullet"), "Bullet")
    .otherwise(col("Event")),
)

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# executes. The snippet would keep df4 rows that satisfy at least one of:
# Result is not '*', WhiteRatingDiff is non-null, BlackRatingDiff is non-null
# (the three conditions are OR-ed).
# NOTE(review): df4 derives from df3, which drops both rating-diff columns, so
# this would need an earlier frame if it were re-enabled.
'''
df5 = df4.filter(
    (df4["Result"] != '*') |
    (df4["WhiteRatingDiff"].isNotNull()) |
    (df4["BlackRatingDiff"].isNotNull())
)
'''

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# executes. The snippet would compute six scalars from df6: the average
# WhiteRatingDiff and BlackRatingDiff for each Result value ("1-0", "0-1",
# "1/2-1/2"), each read from the first cell of the collected aggregate.
'''
mean_white_rating_diff_white_win = df6.filter(col("Result") == "1-0" ).agg(avg("WhiteRatingDiff")).collect()[0][0]
mean_black_rating_diff_white_win = df6.filter(col("Result") == "1-0" ).agg(avg("BlackRatingDiff")).collect()[0][0]
mean_white_rating_diff_black_win = df6.filter(col("Result") == "0-1" ).agg(avg("WhiteRatingDiff")).collect()[0][0]
mean_black_rating_diff_black_win = df6.filter(col("Result") == "0-1" ).agg(avg("BlackRatingDiff")).collect()[0][0]
mean_white_rating_diff_draw = df6.filter(col("Result") == "1/2-1/2" ).agg(avg("WhiteRatingDiff")).collect()[0][0]
mean_black_rating_diff_draw = df6.filter(col("Result") == "1/2-1/2" ).agg(avg("BlackRatingDiff")).collect()[0][0]
'''

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# executes. The snippet would replace NULL WhiteRatingDiff (df7) and NULL
# BlackRatingDiff (df8) values with the mean_* scalar matching the row's
# Result, leaving non-null values and other Result values untouched.
'''
df7 = df6.withColumn("WhiteRatingDiff",
    when((col("Result") == "1-0") & col("WhiteRatingDiff").isNull(), mean_white_rating_diff_white_win)
    .when((col("Result") == "0-1") & col("WhiteRatingDiff").isNull(), mean_white_rating_diff_black_win)
    .when((col("Result") == "1/2-1/2") & col("WhiteRatingDiff").isNull(), mean_white_rating_diff_draw)
    .otherwise(col("WhiteRatingDiff"))
)
df8 = df7.withColumn("BlackRatingDiff",
    when((col("Result") == "1-0") & col("BlackRatingDiff").isNull(), mean_black_rating_diff_white_win)
    .when((col("Result") == "0-1") & col("BlackRatingDiff").isNull(), mean_black_rating_diff_black_win)
    .when((col("Result") == "1/2-1/2") & col("BlackRatingDiff").isNull(), mean_black_rating_diff_draw)
    .otherwise(col("BlackRatingDiff"))
)
'''

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# executes.
# NOTE(review): every `when` branch fires only when the column IS NULL and then
# rounds that same NULL column, so as written the snippet would leave both
# columns unchanged. If rounding was the goal, the isNull() conditions look
# unintended - confirm before re-enabling.
'''
df9 = df8.withColumn("WhiteRatingDiff",
    when((col("Result") == "1-0") & col("WhiteRatingDiff").isNull(), round(col("WhiteRatingDiff"), 1))
    .when((col("Result") == "0-1") & col("WhiteRatingDiff").isNull(), round(col("WhiteRatingDiff"), 1))
    .when((col("Result") == "1/2-1/2") & col("WhiteRatingDiff").isNull(), round(col("WhiteRatingDiff"), 1))
    .otherwise(col("WhiteRatingDiff"))
)
df10 = df9.withColumn("BlackRatingDiff",
    when((col("Result") == "1-0") & col("BlackRatingDiff").isNull(), round(col("BlackRatingDiff"), 1))
    .when((col("Result") == "0-1") & col("BlackRatingDiff").isNull(), round(col("BlackRatingDiff"), 1))
    .when((col("Result") == "1/2-1/2") & col("BlackRatingDiff").isNull(), round(col("BlackRatingDiff"), 1))
    .otherwise(col("BlackRatingDiff"))
)
'''

In [12]:
# Parse the dotted date strings (pattern yyyy.MM.dd) into a proper date column,
# then preview the first 20 rows with long values truncated.
df6 = df5.withColumn("UTCDate", to_date(col("UTCDate"), "yyyy.MM.dd"))
df6.show(n=20, truncate=True)

+-------+-------------+--------------+------+----------+--------+--------+---+--------------------+------------+
|  Event|        White|         Black|Result|   UTCDate|WhiteElo|BlackElo|ECO|             Opening| Termination|
+-------+-------------+--------------+------+----------+--------+--------+---+--------------------+------------+
|  Blitz|       Blique|  rashidbishop|   1-0|2016-06-30|    1920|    1858|A22|English Opening: ...|      Normal|
|Classic| Chessapple44|    fourtwenty|   1-0|2016-06-30|    1369|    1443|A04|Zukertort Opening...|      Normal|
|Classic|       loco22|      Marco753|   0-1|2016-06-30|    1462|    2147|A04|Zukertort Opening...|      Normal|
|  Blitz|        r4m0s|         doguy|   0-1|2016-06-30|    1372|    1068|B20|    Sicilian Defense|Time forfeit|
|  Blitz|         em12|       altemio|   0-1|2016-06-30|    2235|    2200|B06|Modern Defense: P...|      Normal|
|  Blitz|         zs06|         NOVA3|   0-1|2016-06-30|    1740|    1851|C41| Philidor Defense 

In [21]:
# Discard games whose Result is still the placeholder '*'.
df7 = df6.where(col("Result") != '*')

# Distribution of the remaining results.
(
    df7
    .groupBy("Result")
    .count()
    .show()
)

+-------+-------+
| Result|  count|
+-------+-------+
|1/2-1/2|  47812|
|    1-0|3198560|
|    0-1|2985534|
+-------+-------+



In [None]:
# Attach a unique row identifier. monotonically_increasing_id() already returns
# a Column; calling that result again (a second pair of parentheses) raises
# TypeError because a Column is not callable. The generated ids are unique and
# increasing but not consecutive.
df8 = df7.withColumn("id", monotonically_increasing_id())

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# executes. The snippet would cast both rating-diff columns of df11 to
# IntegerType and then show the frame and its schema.
# NOTE(review): the snippet reads df11 before assigning it, and no executable
# cell in the visible notebook defines df11 - confirm where it was meant to
# come from.
'''
df11 = df11.withColumn('WhiteRatingDiff', col('WhiteRatingDiff').cast(IntegerType()))
df11 = df11.withColumn('BlackRatingDiff', col('BlackRatingDiff').cast(IntegerType()))

df11.show()
df11.printSchema()
'''

## Analytics

### Win-Loss Ratio: Calculate the win-loss ratio for White and Black players to understand if there is a bias toward one side winning more frequently.

In [None]:
# Games per outcome. `df11` is never assigned by an executable cell (it only
# appears inside a disabled string-literal cell), so the original line raised
# NameError on a fresh kernel. `df7` is the preprocessed frame with the
# undecided ('*') games already removed.
df7.groupBy("Result").count().show()

### Popular Openings: Determine the most frequently played openings by examining the ECO codes

In [None]:
# Games per ECO opening code. `df11` is never assigned by an executable cell,
# so the preprocessed frame `df7` is used instead to keep the cell runnable on
# a fresh kernel.
df7.groupBy("ECO").count().show()

### Analyze the success rates of these openings to identify which ones are more effective.

In [None]:
# Outcome counts per ECO code, one column per Result value. Listing the pivot
# values explicitly fixes the output columns without an extra pass over the data.
# `df11` is never assigned by an executable cell, so the preprocessed frame
# `df7` is used instead to keep the cell runnable on a fresh kernel.
(
    df7
    .groupBy("ECO")
    .pivot("Result", values=["1-0", "0-1", "1/2-1/2"])
    .agg(count("*").alias("count"))
    .show()
)

### Are certain openings associated with higher win rates for White or Black?

In [None]:
# Share of decisive games won by White / Black for each ECO code.
#
# Two fixes compared with the previous version:
#   * The win indicators are now summed per ECO code. Previously the frame was
#     first grouped by (ECO, Result) with a plain 0/1 flag as the "aggregate",
#     so each outcome contributed at most 1 regardless of how many games it
#     covered, and the resulting rates did not reflect game counts.
#   * `df11` is never assigned by an executable cell; the preprocessed frame
#     `df7` is used instead.
#
# Draws are excluded from the denominator, as before. An ECO code without any
# decisive game gets NULL rates instead of a division by zero.
# `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
df12 = (
    df7
    .groupBy("ECO")
    .agg(
        sum(when(col("Result") == "1-0", 1).otherwise(0)).alias("white_win"),
        sum(when(col("Result") == "0-1", 1).otherwise(0)).alias("black_win"),
    )
    .select(
        "ECO",
        when(
            (col("white_win") + col("black_win")) > 0,
            col("white_win") / (col("white_win") + col("black_win")),
        ).alias("white_win_rate"),
        when(
            (col("white_win") + col("black_win")) > 0,
            col("black_win") / (col("white_win") + col("black_win")),
        ).alias("black_win_rate"),
    )
)

In [None]:
# Show every ECO row with both rates formatted to two decimals.
# The formatted frame gets its own name: `.show()` returns None, so assigning
# its result back to `df12` would destroy the frame and make this cell fail
# when run a second time.
df12_formatted = (
    df12
    .withColumn("white_win_rate", format_number("white_win_rate", 2))
    .withColumn("black_win_rate", format_number("black_win_rate", 2))
)
df12_formatted.show(n=df12.count(), truncate=False)

### Do shorter time controls lead to more decisive results, such as wins or losses?

### Do longer games tend to favor stronger players?

### Track the popularity of openings over time and see if certain openings go in and out of fashion

In [None]:
# Row count per raw Event label (before the Event column is filtered and normalized).
(
    df2
    .groupBy("Event")
    .count()
    .show()
)

In [None]:
# Row count per Result value in df2.
(
    df2
    .groupBy("Result")
    .count()
    .show()
)

In [None]:
# Row count per Termination value in df2.
(
    df2
    .groupBy("Termination")
    .count()
    .show()
)

In [None]:
# Preview raw games that ended with the 'Rules infraction' termination.
df1.filter(col("Termination") == 'Rules infraction').show()

In [None]:
# Preview raw games that ended with the 'Abandoned' termination.
df1.filter(col("Termination") == 'Abandoned').show()

In [None]:
# Preview df2 rows whose Result is still the placeholder '*'.
df2.filter(col("Result") == '*').show()

In [None]:
# For df2 rows whose Result is still '*', count the rows per Termination value.
(
    df2
    .filter(col("Result") == '*')
    .groupBy("Termination")
    .count()
    .show()
)

In [None]:
# Preview the first 20 rows of df2 with long values truncated (the defaults of show()).
df2.show(n=20, truncate=True)

In [None]:
# Every column except UTCTime takes part in the NULL check.
columns_to_check = [name for name in df1.columns if name != 'UTCTime']

# NULL count per column of df2, i.e. after Result was re-derived from the rating changes.
df2.select(
    *(count(when(col(name).isNull(), name)).alias(name) for name in columns_to_check)
).show()

In [None]:
# Preview df5 rows whose Result is still the placeholder '*'.
df5.filter(col("Result") == '*').show()

In [None]:
# Rows with no WhiteRatingDiff. The rating-diff columns are dropped when df3 is
# built, so they do not exist on any later frame and filtering those frames on
# them fails to resolve the column. df2 is the last frame that still carries them.
df2.filter(col("WhiteRatingDiff").isNull()).show()

In [None]:
# Decided games (Result other than '*') for which both rating changes are missing.
# The rating-diff columns are dropped when df3 is built, so they do not exist
# on any later frame; df2 is the last frame that still carries them.
df2.filter(
    col("WhiteRatingDiff").isNull()
    & col("BlackRatingDiff").isNull()
    & (col("Result") != '*')
).show()

In [None]:
# Undecided games (Result '*') that nevertheless have both rating changes recorded.
# The rating-diff columns are dropped when df3 is built, so they do not exist
# on any later frame; df2 is the last frame that still carries them.
df2.filter(
    col("WhiteRatingDiff").isNotNull()
    & col("BlackRatingDiff").isNotNull()
    & (col("Result") == '*')
).show()

In [None]:
# NULL count per column of the preprocessed frame. `df11` is never assigned by
# an executable cell, so `df7` (the frame with undecided games removed) is
# checked instead to keep the cell runnable on a fresh kernel.
columns_to_check = [c for c in df7.columns if c != 'UTCTime']

df7.select([count(when(col(c).isNull(), c)).alias(c) for c in columns_to_check]).show()