In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession\
    .builder\
    .appName("PySpark learning")\
    .getOrCreate()

In [3]:
df = spark.read.csv("data/chess_games.csv", header=True)

In [4]:
df.show(5)

+--------+-----+-----------+------------+-----+--------------+------+--------------+-------------+------------+------------+------------+--------------------+-----------+--------------------+-----------+
|      id|rated| created_at|last_move_at|turns|victory_status|winner|increment_code|     white_id|white_rating|    black_id|black_rating|               moves|opening_eco|        opening_name|opening_ply|
+--------+-----+-----------+------------+-----+--------------+------+--------------+-------------+------------+------------+------------+--------------------+-----------+--------------------+-----------+
|TZJHLljE|FALSE|1.50421E+12| 1.50421E+12|   13|     outoftime| white|          15+2|     bourgris|        1500|        a-00|        1191|d4 d5 c4 c6 cxd5 ...|        D10|Slav Defense: Exc...|          5|
|l1NXvwaE| TRUE|1.50413E+12| 1.50413E+12|   16|        resign| black|          5+10|         a-00|        1322|   skinnerua|        1261|d4 Nc6 e4 e5 f4 f...|        B00|Nimzowitsch De

In [5]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- rated: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- last_move_at: string (nullable = true)
 |-- turns: string (nullable = true)
 |-- victory_status: string (nullable = true)
 |-- winner: string (nullable = true)
 |-- increment_code: string (nullable = true)
 |-- white_id: string (nullable = true)
 |-- white_rating: string (nullable = true)
 |-- black_id: string (nullable = true)
 |-- black_rating: string (nullable = true)
 |-- moves: string (nullable = true)
 |-- opening_eco: string (nullable = true)
 |-- opening_name: string (nullable = true)
 |-- opening_ply: string (nullable = true)



In [6]:
df.show(1, vertical=True)

-RECORD 0------------------------------
 id             | TZJHLljE             
 rated          | FALSE                
 created_at     | 1.50421E+12          
 last_move_at   | 1.50421E+12          
 turns          | 13                   
 victory_status | outoftime            
 winner         | white                
 increment_code | 15+2                 
 white_id       | bourgris             
 white_rating   | 1500                 
 black_id       | a-00                 
 black_rating   | 1191                 
 moves          | d4 d5 c4 c6 cxd5 ... 
 opening_eco    | D10                  
 opening_name   | Slav Defense: Exc... 
 opening_ply    | 5                    
only showing top 1 row



In [7]:
df.columns

['id',
 'rated',
 'created_at',
 'last_move_at',
 'turns',
 'victory_status',
 'winner',
 'increment_code',
 'white_id',
 'white_rating',
 'black_id',
 'black_rating',
 'moves',
 'opening_eco',
 'opening_name',
 'opening_ply']

In [8]:
df.select("white_rating", "black_rating", "turns")\
    .describe()\
    .show()

+-------+------------------+------------------+-----------------+
|summary|      white_rating|      black_rating|            turns|
+-------+------------------+------------------+-----------------+
|  count|             20058|             20058|            20058|
|   mean|1596.6318675840064|1588.8319872370128|60.46599860404826|
| stddev|291.25337573701825| 291.0361259603342|33.57058475353715|
|    min|              1000|              1000|                1|
|    max|               999|               999|               99|
+-------+------------------+------------------+-----------------+



In [16]:
df.withColumn("rating_difference", df.white_rating - df.black_rating)\
    .select("id", "rating_difference")\
    .show(5)

+--------+-----------------+
|      id|rating_difference|
+--------+-----------------+
|TZJHLljE|            309.0|
|l1NXvwaE|             61.0|
|mIICvQHh|             -4.0|
|kWKvrqYL|            -15.0|
|9tXo1AUZ|             54.0|
+--------+-----------------+
only showing top 5 rows



In [17]:
spark.stop()