In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType
import pyspark.sql.functions as F

import pandas as pd
import numpy as np

In [22]:
# Create the Spark session for this notebook, or reuse one that already exists.
# The "local[*]" master runs Spark in-process on the local machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("fa21-ds5110-group10")
    .getOrCreate()
)

In [26]:
# Explicit schema for the chess games CSV, listed as (column name, Spark type)
# pairs in declaration order. Every field is declared with nullable=False.
_chess_columns = [
    ('event', StringType()),
    ('white', StringType()),
    ('black', StringType()),
    ('result', StringType()),
    ('UTCDate', DateType()),
    ('UTCTime', StringType()),
    ('WhiteElo', IntegerType()),
    ('BlackElo', IntegerType()),
    ('WhiteRatingDiff', DoubleType()),
    ('BlackRatingDiff', DoubleType()),
    ('ECO', StringType()),
    ('Opening', StringType()),
    ('TimeControl', StringType()),
    ('Termination', StringType()),
    ('AN', StringType()),
]

chess_schema = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in _chess_columns]
)


# Load the raw games CSV with the explicit `chess_schema`; the first line of
# the file is treated as a header and surrounding whitespace is trimmed.
#
# NOTE: Spark datetime patterns are case-sensitive. 'MM' is month-of-year,
# whereas 'mm' is minute-of-hour, so the month letters must be upper case for
# the UTCDate column to be parsed with the correct month.
df = spark.read.csv(
    path="../../data/raw/chess_games.csv",
    schema=chess_schema,
    header=True,
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
    dateFormat="yyyy.MM.dd",
)

# Keep only rows whose `event` is exactly 'Classical' or 'Blitz'.
df = df.filter(F.col("event").isin("Classical", "Blitz"))

# Persist the filtered games as Parquet. The default save mode raises if the
# target path already exists, which breaks re-running this cell; overwrite
# makes the step idempotent.
df.write.parquet(
    "../../data/processed/chess_games_blitz_classic.parquet",
    mode="overwrite",
)