# Neanderthal's Guide to Apache Spark

This notebook follows the article: https://towardsdatascience.com/a-neanderthals-guide-to-apache-spark-in-python-9ef1f156d427

## Getting PySpark Running 

This section applies to the Google Colab environment.

## Setting up

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse, if one already exists) a Spark session that runs
# locally and is registered under the application name "test1".
spark = (
    SparkSession.builder
    .master("local")
    .appName("test1")
    .getOrCreate()
)

## Load Data

In [2]:
# Load the video-game sales CSV: the first row supplies the column names
# and Spark infers each column's type from the values.
data = spark.read.csv(
    './data/video_game_sales.csv.gz',
    header=True,
    inferSchema=True,
)

In [3]:
# Names of all columns in the loaded DataFrame.
data.columns

['Name',
 'Platform',
 'Year_of_Release',
 'Genre',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales',
 'Critic_Score',
 'Critic_Count',
 'User_Score',
 'User_Count',
 'Developer',
 'Rating']

In [4]:
# (number of rows, number of columns) of the loaded DataFrame.
data.count(), len(data.columns)


(16720, 16)

## Viewing DataFrames

In [5]:
# Print the first 5 rows as a text table.
data.show(5)

+--------------------+--------+---------------+------------+---------+--------+--------+--------+-----------+------------+------------+------------+----------+----------+---------+------+
|                Name|Platform|Year_of_Release|       Genre|Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|Critic_Score|Critic_Count|User_Score|User_Count|Developer|Rating|
+--------------------+--------+---------------+------------+---------+--------+--------+--------+-----------+------------+------------+------------+----------+----------+---------+------+
|          Wii Sports|     Wii|           2006|      Sports| Nintendo|   41.36|   28.96|    3.77|       8.45|       82.53|          76|          51|         8|       322| Nintendo|     E|
|   Super Mario Bros.|     NES|           1985|    Platform| Nintendo|   29.08|    3.58|    6.81|       0.77|       40.24|        null|        null|      null|      null|     null|  null|
|      Mario Kart Wii|     Wii|           2008|      Racing|

In [6]:
# Print every column with its inferred type and nullability.
data.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year_of_Release: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: double (nullable = true)
 |-- EU_Sales: double (nullable = true)
 |-- JP_Sales: double (nullable = true)
 |-- Other_Sales: double (nullable = true)
 |-- Global_Sales: double (nullable = true)
 |-- Critic_Score: integer (nullable = true)
 |-- Critic_Count: integer (nullable = true)
 |-- User_Score: string (nullable = true)
 |-- User_Count: integer (nullable = true)
 |-- Developer: string (nullable = true)
 |-- Rating: string (nullable = true)



In [7]:
# Same type information as the schema, as a list of (column name, type name) pairs.
data.dtypes

[('Name', 'string'),
 ('Platform', 'string'),
 ('Year_of_Release', 'string'),
 ('Genre', 'string'),
 ('Publisher', 'string'),
 ('NA_Sales', 'double'),
 ('EU_Sales', 'double'),
 ('JP_Sales', 'double'),
 ('Other_Sales', 'double'),
 ('Global_Sales', 'double'),
 ('Critic_Score', 'int'),
 ('Critic_Count', 'int'),
 ('User_Score', 'string'),
 ('User_Count', 'int'),
 ('Developer', 'string'),
 ('Rating', 'string')]

In [8]:
# Look at the user-rating columns for the first 15 rows; truncate=False
# prints full cell values instead of shortening long ones.
(
    data.select(["Name", "Platform", "User_Score", "User_Count"])
    .show(n=15, truncate=False)
)

+---------------------------+--------+----------+----------+
|Name                       |Platform|User_Score|User_Count|
+---------------------------+--------+----------+----------+
|Wii Sports                 |Wii     |8         |322       |
|Super Mario Bros.          |NES     |null      |null      |
|Mario Kart Wii             |Wii     |8.3       |709       |
|Wii Sports Resort          |Wii     |8         |192       |
|Pokemon Red/Pokemon Blue   |GB      |null      |null      |
|Tetris                     |GB      |null      |null      |
|New Super Mario Bros.      |DS      |8.5       |431       |
|Wii Play                   |Wii     |6.6       |129       |
|New Super Mario Bros. Wii  |Wii     |8.4       |594       |
|Duck Hunt                  |NES     |null      |null      |
|Nintendogs                 |DS      |null      |null      |
|Mario Kart DS              |DS      |8.6       |464       |
|Pokemon Gold/Pokemon Silver|GB      |null      |null      |
|Wii Fit                

## Summary Statistics

In [9]:
# Summary statistics (count, mean, stddev, min, max) for the two user-rating columns.
data.describe("User_Score", "User_Count").show()

+-------+------------------+------------------+
|summary|        User_Score|        User_Count|
+-------+------------------+------------------+
|  count|             10015|              7590|
|   mean|7.1250461133070315|162.22990777338603|
| stddev|1.5000060936257986| 561.2823262473789|
|    min|                 0|                 4|
|    max|               tbd|             10665|
+-------+------------------+------------------+



In [11]:
from pyspark.sql.functions import desc

# Number of rows per platform, largest group first (ordering expressed
# with the desc() column function).
(
    data.groupBy("Platform")
    .count()
    .orderBy(desc("count"))
    .show()
)

+--------+-----+
|Platform|count|
+--------+-----+
|     PS2| 2161|
|      DS| 2152|
|     PS3| 1331|
|     Wii| 1320|
|    X360| 1262|
|     PSP| 1209|
|      PS| 1197|
|      PC|  974|
|      XB|  824|
|     GBA|  822|
|      GC|  556|
|     3DS|  520|
|     PSV|  432|
|     PS4|  393|
|     N64|  319|
|    XOne|  247|
|    SNES|  239|
|     SAT|  173|
|    WiiU|  147|
|    2600|  133|
+--------+-----+
only showing top 20 rows



In [12]:
# Same per-platform count, this time requesting descending order through
# the ascending=False keyword instead of desc().
(
    data.groupBy("Platform")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+--------+-----+
|Platform|count|
+--------+-----+
|     PS2| 2161|
|      DS| 2152|
|     PS3| 1331|
|     Wii| 1320|
|    X360| 1262|
|     PSP| 1209|
|      PS| 1197|
|      PC|  974|
|      XB|  824|
|     GBA|  822|
|      GC|  556|
|     3DS|  520|
|     PSV|  432|
|     PS4|  393|
|     N64|  319|
|    XOne|  247|
|    SNES|  239|
|     SAT|  173|
|    WiiU|  147|
|    2600|  133|
+--------+-----+
only showing top 20 rows



In [13]:
# Number of rows per publisher, largest first; show 20 rows with
# full-width (untruncated) publisher names.
(
    data.groupBy("Publisher")
    .count()
    .orderBy("count", ascending=False)
    .show(n=20, truncate=False)
)

+--------------------------------------+-----+
|Publisher                             |count|
+--------------------------------------+-----+
|Electronic Arts                       |1356 |
|Activision                            |985  |
|Namco Bandai Games                    |939  |
|Ubisoft                               |933  |
|Konami Digital Entertainment          |834  |
|THQ                                   |715  |
|Nintendo                              |706  |
|Sony Computer Entertainment           |687  |
|Sega                                  |638  |
|Take-Two Interactive                  |422  |
|Capcom                                |386  |
|Atari                                 |367  |
|Tecmo Koei                            |348  |
|Square Enix                           |236  |
|Warner Bros. Interactive Entertainment|235  |
|Disney Interactive Studios            |218  |
|Unknown                               |201  |
|Eidos Interactive                     |198  |
|Midway Games

## Filtering DataFrames

In [30]:
# condition1: every column used later for modelling must be present.
condition1 = (
    data["User_Score"].isNotNull()
    & data["User_Count"].isNotNull()
    & data["Global_Sales"].isNotNull()
    & data["Critic_Score"].isNotNull()
)
# condition2: drop rows whose user score is the "tbd" placeholder string.
condition2 = data["User_Score"] != "tbd"

# Apply both conditions in turn to obtain the working frame.
data2 = data.where(condition1).where(condition2)

data2.show(n=5, truncate=False)

+---------------------+--------+---------------+--------+---------+--------+--------+--------+-----------+------------+------------+------------+----------+----------+---------+------+
|Name                 |Platform|Year_of_Release|Genre   |Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|Critic_Score|Critic_Count|User_Score|User_Count|Developer|Rating|
+---------------------+--------+---------------+--------+---------+--------+--------+--------+-----------+------------+------------+------------+----------+----------+---------+------+
|Wii Sports           |Wii     |2006           |Sports  |Nintendo |41.36   |28.96   |3.77    |8.45       |82.53       |76          |51          |8         |322       |Nintendo |E     |
|Mario Kart Wii       |Wii     |2008           |Racing  |Nintendo |15.68   |12.76   |3.79    |3.29       |35.52       |82          |73          |8.3       |709       |Nintendo |E     |
|Wii Sports Resort    |Wii     |2009           |Sports  |Nintendo |15.61   

In [32]:
# Number of rows remaining in data2 after the null / "tbd" filters.
data2.count()   # an earlier run gave 7463 (presumably with fewer filters applied; TODO confirm)
# after the current filtering: 7017

7017

In [16]:
# Show rows of the unfiltered frame whose user score is the literal
# string "tbd", printed at full width.
(
    data.select("Name", "Platform", "User_Score", "User_Count")
    .where(data["User_Score"] == "tbd")
    .show(truncate=False)
)

+-----------------------------------------+--------+----------+----------+
|Name                                     |Platform|User_Score|User_Count|
+-----------------------------------------+--------+----------+----------+
|Zumba Fitness                            |Wii     |tbd       |null      |
|Namco Museum: 50th Anniversary           |PS2     |tbd       |null      |
|Zumba Fitness 2                          |Wii     |tbd       |null      |
|uDraw Studio                             |Wii     |tbd       |null      |
|Frogger\'s Adventures: Temple of the Frog|GBA     |tbd       |null      |
|Just Dance Kids                          |Wii     |tbd       |null      |
|Dance Dance Revolution X2                |PS2     |tbd       |null      |
|The Incredibles                          |GBA     |tbd       |null      |
|Who wants to be a millionaire            |PC      |tbd       |null      |
|Tetris Worlds                            |GBA     |tbd       |null      |
|Imagine: Teacher        

In [17]:
# Summary statistics for the user-rating columns of the filtered frame.
(
    data2.select("Name", "Platform", "User_Score", "User_Count")
    .describe("User_Score", "User_Count")
    .show()
)

+-------+------------------+------------------+
|summary|        User_Score|        User_Count|
+-------+------------------+------------------+
|  count|              7590|              7590|
|   mean|7.1250461133070315|162.22990777338603|
| stddev|1.5000060936257986| 561.2823262473789|
|    min|                 0|                 4|
|    max|               9.7|             10665|
+-------+------------------+------------------+



In [18]:
# Top 5 titles by user score.
# User_Score is still a string column at this point (see the printSchema
# output), so a plain orderBy("User_Score") compares text: a value such as
# "10" would sort below "9.7". Sorting on a numeric cast gives the intended
# ranking while the displayed column keeps its original values.
(
    data2.select("Name", "Platform", "User_Score", "User_Count")
    .orderBy(data2["User_Score"].cast("double").desc())
    .show(5, truncate=False)
)

+-------------------------------------+--------+----------+----------+
|Name                                 |Platform|User_Score|User_Count|
+-------------------------------------+--------+----------+----------+
|Breath of Fire III                   |PSP     |9.7       |6         |
|Boktai: The Sun is in Your Hand      |GBA     |9.6       |16        |
|Harvest Moon: Friends of Mineral Town|GBA     |9.6       |116       |
|Golden Sun: The Lost Age             |GBA     |9.5       |150       |
|MLB SlugFest Loaded                  |PS2     |9.5       |4         |
+-------------------------------------+--------+----------+----------+
only showing top 5 rows



In [19]:
# data2.show()

## Building a Model in PySpark

### Linear Regression

In [20]:
# Preview the first 5 rows of the filtered frame before building model features.
data2.show(5)

+--------------------+--------+---------------+--------+---------+--------+--------+--------+-----------+------------+------------+------------+----------+----------+---------+------+
|                Name|Platform|Year_of_Release|   Genre|Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|Critic_Score|Critic_Count|User_Score|User_Count|Developer|Rating|
+--------------------+--------+---------------+--------+---------+--------+--------+--------+-----------+------------+------------+------------+----------+----------+---------+------+
|          Wii Sports|     Wii|           2006|  Sports| Nintendo|   41.36|   28.96|    3.77|       8.45|       82.53|          76|          51|         8|       322| Nintendo|     E|
|      Mario Kart Wii|     Wii|           2008|  Racing| Nintendo|   15.68|   12.76|    3.79|       3.29|       35.52|          82|          73|       8.3|       709| Nintendo|     E|
|   Wii Sports Resort|     Wii|           2009|  Sports| Nintendo|   15.61|   10

In [21]:
# Distinct release-year values in ascending order (first 20, untruncated).
(
    data2.select("Year_of_Release")
    .distinct()
    .orderBy("Year_of_Release", ascending=True)
    .show(n=20, truncate=False)
)

+---------------+
|Year_of_Release|
+---------------+
|1985           |
|1988           |
|1992           |
|1994           |
|1996           |
|1997           |
|1998           |
|1999           |
|2000           |
|2001           |
|2002           |
|2003           |
|2004           |
|2005           |
|2006           |
|2007           |
|2008           |
|2009           |
|2010           |
|2011           |
+---------------+
only showing top 20 rows



In [22]:
# Rows per publisher in the filtered frame, largest first.
(
    data2.groupBy("Publisher")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+--------------------+-----+
|           Publisher|count|
+--------------------+-----+
|     Electronic Arts| 1026|
|          Activision|  573|
|             Ubisoft|  557|
|                 THQ|  342|
|Sony Computer Ent...|  327|
|Take-Two Interactive|  302|
|                Sega|  297|
|            Nintendo|  294|
|Konami Digital En...|  270|
|  Namco Bandai Games|  265|
|              Capcom|  204|
|               Atari|  186|
|Warner Bros. Inte...|  169|
|Microsoft Game St...|  146|
|          Tecmo Koei|  144|
|         Square Enix|  142|
|   Eidos Interactive|  131|
|       Vivendi Games|  123|
|         Codemasters|  116|
|        Midway Games|  111|
+--------------------+-----+
only showing top 20 rows



In [23]:
# Drop rows whose release year is the "N/A" placeholder string, then list
# the remaining distinct years in ascending order.
data2 = data2.where(data2["Year_of_Release"] != "N/A")
(
    data2.select("Year_of_Release")
    .distinct()
    .orderBy("Year_of_Release", ascending=True)
    .show(n=20, truncate=False)
)

+---------------+
|Year_of_Release|
+---------------+
|1985           |
|1988           |
|1992           |
|1994           |
|1996           |
|1997           |
|1998           |
|1999           |
|2000           |
|2001           |
|2002           |
|2003           |
|2004           |
|2005           |
|2006           |
|2007           |
|2008           |
|2009           |
|2010           |
|2011           |
+---------------+
only showing top 20 rows



In [34]:
from pyspark.sql.types import DoubleType, IntegerType

# Re-project every column in its original position: the three predictors
# and the label listed below are cast to double, all other columns pass
# through unchanged.
data2 = data2.select([
    data2[name].cast(DoubleType()).alias(name)
    if name in ("Year_of_Release", "User_Score", "User_Count", "Critic_Score")
    else data2[name]
    for name in data2.columns
])

data2.dtypes

[('Name', 'string'),
 ('Platform', 'string'),
 ('Year_of_Release', 'double'),
 ('Genre', 'string'),
 ('Publisher', 'string'),
 ('NA_Sales', 'double'),
 ('EU_Sales', 'double'),
 ('JP_Sales', 'double'),
 ('Other_Sales', 'double'),
 ('Global_Sales', 'double'),
 ('Critic_Score', 'double'),
 ('Critic_Count', 'int'),
 ('User_Score', 'double'),
 ('User_Count', 'double'),
 ('Developer', 'string'),
 ('Rating', 'string')]

In [38]:
from pyspark.ml.feature import VectorAssembler

# Pack the four predictor columns into a single vector column named
# 'predictors'. handleInvalid="skip" drops rows containing invalid
# values instead of raising.
assembler = VectorAssembler(
    inputCols=['Year_of_Release', 'Global_Sales', 'Critic_Score', 'User_Count'],
    outputCol='predictors',
    handleInvalid="skip",
)
output = assembler.transform(data2)

# Keep only the feature vector (input) and the label (output) for modelling.
finalized_data = output.select("predictors", "User_Score")
finalized_data.show(5)


+--------------------+----------+
|          predictors|User_Score|
+--------------------+----------+
|[2006.0,82.53,76....|       8.0|
|[2008.0,35.52,82....|       8.3|
|[2009.0,32.77,80....|       8.0|
|[2006.0,29.8,89.0...|       8.5|
|[2006.0,28.92,58....|       6.6|
+--------------------+----------+
only showing top 5 rows



Reference for the linear-regression steps: https://towardsdatascience.com/building-a-linear-regression-with-pyspark-and-mllib-d065c3ba246a

In [39]:
from pyspark.ml.regression import LinearRegression

# Split training and testing data (80% / 20%).
# A fixed seed makes the split repeatable for the same input, so the fitted
# model and the reported metrics do not change on every kernel restart.
# Without a seed each run draws a different random sample.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)

# Linear regression of User_Score on the assembled 'predictors' vector.
lr = LinearRegression(
    featuresCol='predictors',
    labelCol='User_Score')

lrModel = lr.fit(train_data)

# Evaluate on the held-out split; the returned summary exposes the per-row
# predictions as a DataFrame via .predictions.
pred = lrModel.evaluate(test_data)

pred.predictions.show(5)

+--------------------+----------+-----------------+
|          predictors|User_Score|       prediction|
+--------------------+----------+-----------------+
|[1996.0,1.03,86.0...|       8.5| 8.27690619563333|
|[1997.0,1.01,86.0...|       8.3| 8.98898155666734|
|[1997.0,1.24,85.0...|       9.0| 8.90331257820202|
|[1997.0,1.27,93.0...|       9.4|9.347113052990125|
|[1997.0,1.99,87.0...|       8.8|9.028103965035967|
+--------------------+----------+-----------------+
only showing top 5 rows



In [40]:
# Fitted parameters of the linear model.
print("Coefficients: %s" % (lrModel.coefficients,))
print("Intercept: %s" % (lrModel.intercept,))

# Training-set diagnostics taken from the model's own summary object:
# solver iterations, objective trace, per-row residuals, then RMSE and r2.
trainingSummary = lrModel.summary
print("numIterations: %d" % (trainingSummary.totalIterations,))
print("objectiveHistory: %s" % (trainingSummary.objectiveHistory,))
trainingSummary.residuals.show()
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))

Coefficients: [-0.07563992236487724,-0.01623812905426233,0.06281631358508365,-0.00022230110695025141]
Intercept: 154.65832707242427
numIterations: 1
objectiveHistory: [0.0]
+--------------------+
|           residuals|
+--------------------+
|  -2.414532814759224|
|  -5.089557675645483|
| -0.7232484443666358|
|  -1.845135885903102|
| -1.4914112160889257|
| -0.8144536411395116|
| -0.6710009430232056|
| -2.0182952210392155|
| -0.6894657681290202|
| -0.2561919221991502|
| -0.2558123675213775|
| -0.7658650095244299|
|0.005244036985800449|
|  -1.512199018520886|
|  0.4588409681097225|
| -0.5998135781151923|
| -0.8967630899122465|
| -0.5371355108228961|
|  0.2338266768408399|
|  0.2583216739254901|
+--------------------+
only showing top 20 rows

RMSE: 1.121484
r2: 0.397971


### Evaluating model

In [41]:
from pyspark.ml.evaluation import RegressionEvaluator

# NOTE(review): the name `eval` shadows Python's builtin eval(); it is kept
# unchanged here because other cells may refer to it.
eval = RegressionEvaluator(
    labelCol="User_Score",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the test-set predictions under each metric. The per-call param map
# overrides metricName for that call only, so `eval` itself stays on "rmse".
# Order: root mean square error, mean square error, mean absolute error,
# r2 (coefficient of determination).
rmse, mse, mae, r2 = (
    eval.evaluate(pred.predictions, {eval.metricName: metric})
    for metric in ("rmse", "mse", "mae", "r2")
)

print("RMSE: %.3f" % rmse)
print("MSE: %.3f" % mse)
print("MAE: %.3f" % mae)
print("r2: %.3f" % r2)

RMSE: 1.065
MSE: 1.134
MAE: 0.812
r2: 0.434
