# BigData Final Project | Steam
## <font color = 'blue'> Notebook1.2 | Clean Data_Part2</font>
### Team Member: Jim Fang, WooJong Choi, Han Jeon, Tam Nguyen

June 2020
___

### Import Libraries

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pyspark.sql.types as t
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import isnan, when, count, col, size
from pyspark.sql.functions import year, month, dayofmonth
from functools import reduce

import pandas as pd
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation


import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Get (or create) the Hive-enabled Spark session for this notebook, named 'CleanData'.
spark = SparkSession.builder.enableHiveSupport().appName('CleanData').getOrCreate()
# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
!hdfs dfs -ls /user/tamng/jwht/SteamData/steamData_new

---
### Function

In [4]:
def check_missing(df):
    '''Show, for every column of df, how many rows hold a null in that column.

    The result is printed with .show() as a one-row table; nothing is returned.
    Only nulls (isNull) are counted.
    '''
    null_counters = []
    for name in df.columns:
        # when() without otherwise() yields null for non-matching rows and
        # count() skips nulls, so only the rows where the column is null are tallied.
        is_missing = when(col(name).isNull(), name)
        null_counters.append(count(is_missing).alias(name))
    df.select(null_counters).show()

In [5]:
def rename_col(df, newColumns):
    '''Rename all columns of df positionally and return the new DataFrame.

    newColumns is a list of column names, one per existing column, given in
    the same order as df.schema.names.

    All names are replaced in a single toDF() call rather than a chain of
    withColumnRenamed() calls, so a new name that equals a not-yet-renamed
    old name (e.g. swapping two columns) cannot collide part-way through.

    Raises ValueError if the number of new names differs from the number of
    existing columns.
    '''
    oldColumns = df.schema.names
    if len(newColumns) != len(oldColumns):
        raise ValueError(
            'rename_col expected {} column names but got {}'.format(
                len(oldColumns), len(newColumns)))
    return df.toDF(*newColumns)

In [155]:
def basic_info(df):

    '''
        Print a basic description of a table, including:
        1. Total rows / observations
        2. Missing values by column
        3. The first 3 rows
        4. Summary statistics from describe()
        5. Distinct value count by column

        Everything is printed; nothing is returned.
    '''

    divider = '*-------------' * 5

    print('TOTAL ROWS:', df.count())
    print('\n')
    print(divider)
    print('\n')

    print('MISSING VALUE:')
    check_missing(df)
    print(divider)
    print('\n')

    print('PRINT OUT THE 1st 3 LINES:')
    df.show(3, truncate=True)
    print(divider)
    print('\n')

    print('TABLE BASIC DESCRIPTION:')
    df.describe().show(10, truncate=True)
    print(divider)

    # One distinct().count() per column, in column order.
    column_name = df.columns
    distinct_count = [df.select(col(name)).distinct().count() for name in column_name]

    print('DISTINCT COUNT BY COLUMN:')
    print('\n')
    summary = pd.DataFrame(zip(column_name, distinct_count))
    summary = summary.rename(columns={0: 'column_name', 1: 'distinct_count'})
    print(summary)

---
## Import Data

Clean table by table

### 1. game2_df

In [6]:
# Load the game2_df CSV from CleanData; the file has a header row and column types are inferred.
game2_df = spark.read.csv('/user/tamng/jwht/CleanData/game2_df.csv', inferSchema = True, header = True)

In [7]:
game2_df.printSchema()

root
 |-- steam_id: long (nullable = true)
 |-- app_id: integer (nullable = true)
 |-- playtime_2weeks: integer (nullable = true)
 |-- playtime_forever: integer (nullable = true)
 |-- dateretrieved: timestamp (nullable = true)



In [8]:
game2_df.show(3, truncate = True)

+-----------------+------+---------------+----------------+-------------------+
|         steam_id|app_id|playtime_2weeks|playtime_forever|      dateretrieved|
+-----------------+------+---------------+----------------+-------------------+
|76561197960265729|    10|              0|               0|2014-08-14 14:04:18|
|76561197960265729|    20|              0|               0|2014-08-14 14:04:18|
|76561197960265729|    30|              0|               0|2014-08-14 14:04:18|
+-----------------+------+---------------+----------------+-------------------+
only showing top 3 rows



In [20]:
game2_df.select('app_id').distinct().count()

4161

### 2. app_id_info

In [9]:
# Load the raw app-info CSV. It is read without a header row, so the columns
# come back as _c0 ... _c7 and are renamed in a later cell.
app_id_info = spark.read.csv('/user/tamng/jwht/SteamData/steamData_new/App_id_info_new.csv', inferSchema = True, header = False)

In [10]:
app_id_info.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)



In [15]:
app_id_info.head(2)

[Row(_c0=10, _c1='Counter-Strike', _c2='game', _c3='9.99', _c4='2000-11-01 00:00:00', _c5='88', _c6=0, _c7=1),
 Row(_c0=20, _c1='Team Fortress Classic', _c2='game', _c3='4.99', _c4='1999-04-01 00:00:00', _c5='-1', _c6=0, _c7=1)]

In [17]:
# Rename the header columns
# (positional: the 8 names below replace _c0 ... _c7 in order)
newColumns = ['app_id', 'title', 'type', 'price', 
              'releasedDate','rating','requiredAge', 'isMultiplayer']
app_id_info = rename_col(app_id_info, newColumns)

In [18]:
app_id_info.show(3, truncate = True)

+------+--------------------+----+-----+-------------------+------+-----------+-------------+
|app_id|               title|type|price|       releasedDate|rating|requiredAge|isMultiplayer|
+------+--------------------+----+-----+-------------------+------+-----------+-------------+
|    10|      Counter-Strike|game| 9.99|2000-11-01 00:00:00|    88|          0|            1|
|    20|Team Fortress Cla...|game| 4.99|1999-04-01 00:00:00|    -1|          0|            1|
|    30|       Day of Defeat|game| 4.99|2003-05-01 00:00:00|    79|          0|            1|
+------+--------------------+----+-----+-------------------+------+-----------+-------------+
only showing top 3 rows



In [19]:
app_id_info.count()

17783

In [21]:
# Save app_id_info table as CSV with a header row.
# NOTE(review): no write mode is specified - presumably re-running this cell fails
# once the output path already exists; confirm and set an explicit mode if re-runs are expected.
app_id_info.write.csv('/user/tamng/jwht/CleanData/app_id_info.csv', header = True)

### 3. gamesDeveloper

In [22]:
games_developer = spark.read.csv('/user/tamng/jwht/SteamData/steamData_new/Games_Developers_new.csv',\
                             inferSchema = True, header = False)

In [23]:
games_developer.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)



In [24]:
games_developer.show(2)

+---+-----+
|_c0|  _c1|
+---+-----+
| 10|Valve|
| 20|Valve|
+---+-----+
only showing top 2 rows



In [25]:
# Rename the header columns
newColumns = ['app_id', 'gamesDeveloper']
games_developer = rename_col(games_developer, newColumns)

In [26]:
games_developer.printSchema()

root
 |-- app_id: integer (nullable = true)
 |-- gamesDeveloper: string (nullable = true)



In [27]:
games_developer.count()

19333

In [28]:
games_developer.select('gamesDeveloper').distinct().count()

6582

In [29]:
# Save games_developer table
games_developer.write.csv('/user/tamng/jwht/CleanData/games_developer.csv', header = True)

### 4. gamesGenre

In [30]:
games_genres = spark.read.csv('/user/tamng/jwht/SteamData/steamData_new/Games_Genres_new.csv',\
                             inferSchema = True, header = False)

In [31]:
games_genres.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)



In [32]:
games_genres.show(2)

+---+------+
|_c0|   _c1|
+---+------+
| 10|Action|
| 20|Action|
+---+------+
only showing top 2 rows



In [33]:
# Rename the header columns
newColumns = ['app_id', 'gamesGenre']
games_genres = rename_col(games_genres, newColumns)

In [34]:
games_genres.printSchema()

root
 |-- app_id: integer (nullable = true)
 |-- gamesGenre: string (nullable = true)



In [35]:
games_genres.count()

39669

In [36]:
games_genres.select('gamesGenre').distinct().count()

22

In [58]:
games_genres.select('app_id').distinct().count()

17195

In [59]:
games_genres.select('app_id').distinct().count()

17195

In [37]:
# Save games_genres table
games_genres.write.csv('/user/tamng/jwht/CleanData/games_genres.csv', header = True)

### 5. gamesPublisher

In [38]:
games_publisher = spark.read.csv('/user/tamng/jwht/SteamData/steamData_new/Games_Publishers_new.csv',\
                             inferSchema = True, header = False)

In [39]:
games_publisher.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)



In [40]:
games_publisher.show(2)

+---+-----+
|_c0|  _c1|
+---+-----+
| 10|Valve|
| 20|Valve|
+---+-----+
only showing top 2 rows



In [41]:
# Rename the header columns
newColumns = ['app_id', 'gamesPublisher']
games_publisher = rename_col(games_publisher, newColumns)

In [42]:
games_publisher.printSchema()

root
 |-- app_id: integer (nullable = true)
 |-- gamesPublisher: string (nullable = true)



In [43]:
games_publisher.count()

18761

In [44]:
games_publisher.select('gamesPublisher').distinct().count()

4425

In [57]:
games_publisher.select('app_id').distinct().count()

17783

In [45]:
# Save games_publisher table
games_publisher.write.csv('/user/tamng/jwht/CleanData/games_publisher.csv', header = True)

### 6. Groups

In [46]:
groups = spark.read.csv('/user/tamng/jwht/SteamData/steamData_new/Groups.csv',\
                             inferSchema = True, header = False)

In [47]:
groups.printSchema()

root
 |-- _c0: long (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: timestamp (nullable = true)



In [48]:
groups.show(2)

+-----------------+---+-------------------+
|              _c0|_c1|                _c2|
+-----------------+---+-------------------+
|76561197960265730|  4|2013-05-06 02:09:28|
|76561197960265730|  5|2013-05-06 02:09:28|
+-----------------+---+-------------------+
only showing top 2 rows



In [49]:
# Rename the header columns
newColumns = ['steam_id', 'group_id', 'dateretrieved']
groups = rename_col(groups, newColumns)

In [51]:
groups.printSchema()

root
 |-- steam_id: long (nullable = true)
 |-- group_id: integer (nullable = true)
 |-- dateretrieved: timestamp (nullable = true)



In [50]:
groups.show(2)

+-----------------+--------+-------------------+
|         steam_id|group_id|      dateretrieved|
+-----------------+--------+-------------------+
|76561197960265730|       4|2013-05-06 02:09:28|
|76561197960265730|       5|2013-05-06 02:09:28|
+-----------------+--------+-------------------+
only showing top 2 rows



In [52]:
# Save groups table
groups.write.csv('/user/tamng/jwht/CleanData/groups.csv', header = True)

### 7. Friends

In [53]:
friends = spark.read.csv('/user/tamng/jwht/CleanData/friends.csv',\
                             inferSchema = True, header = True)

In [54]:
friends.printSchema()

root
 |-- steam_id_a: long (nullable = true)
 |-- steam_id_b: long (nullable = true)
 |-- relationship: string (nullable = true)
 |-- friend_since: timestamp (nullable = true)
 |-- dateretrieved: timestamp (nullable = true)



In [55]:
friends.show(3, truncate = True)

+-----------------+-----------------+------------+-------------------+-------------------+
|       steam_id_a|       steam_id_b|relationship|       friend_since|      dateretrieved|
+-----------------+-----------------+------------+-------------------+-------------------+
|76561197960265744|76561197960265731|      friend|1969-12-31 17:00:00|2013-05-05 01:52:30|
|76561197960265744|76561197960265733|      friend|1969-12-31 17:00:00|2013-05-06 15:14:15|
|76561197960265744|76561197960265738|      friend|1969-12-31 17:00:00|2013-05-06 15:56:24|
+-----------------+-----------------+------------+-------------------+-------------------+
only showing top 3 rows



### 8. Join games_developer, games_genres, and games_publisher

<font color = 'blue'> Join games_developer with games_genres </font>

In [60]:
# Rename the key on the genre table to app_id_1 so the joined result does not
# carry two columns called app_id; app_id_1 is dropped again after the join.
# Note: this rebinds games_genres, so from here on it no longer has an app_id column.
games_genres = games_genres.withColumnRenamed('app_id', 'app_id_1')
games_genres.show(2)

+--------+----------+
|app_id_1|gamesGenre|
+--------+----------+
|      10|    Action|
|      20|    Action|
+--------+----------+
only showing top 2 rows



In [90]:
# Left join keeps every games_developer row; an app with several genres
# yields one output row per (developer, genre) pair.
game_dgp = games_developer.join(
    broadcast(games_genres),
    on=games_developer["app_id"] == games_genres["app_id_1"],
    how='left',
)

In [92]:
# Drop the helper key that came from games_genres, then preview the joined table.
game_dgp= game_dgp.drop('app_id_1')
game_dgp.show(2)

+------+--------------+----------+
|app_id|gamesDeveloper|gamesGenre|
+------+--------------+----------+
|    10|         Valve|    Action|
|    20|         Valve|    Action|
+------+--------------+----------+
only showing top 2 rows



<font color = 'blue'> Join game_dgp (games_developer + games_genres) with games_publisher </font>

In [84]:
# Rename the key on the publisher table to app_id_1 so the joined result does not
# carry two columns called app_id; app_id_1 is dropped again after the join.
# Note: this rebinds games_publisher, so from here on it no longer has an app_id column.
games_publisher = games_publisher.withColumnRenamed('app_id', 'app_id_1')
games_publisher.show(2)

+--------+--------------+
|app_id_1|gamesPublisher|
+--------+--------------+
|      10|         Valve|
|      20|         Valve|
+--------+--------------+
only showing top 2 rows



In [93]:
# Left join keeps every row already in game_dgp and attaches the matching
# publisher(s); rows without a publisher match get a null gamesPublisher.
game_dgp = game_dgp.join(
    broadcast(games_publisher),
    on=game_dgp["app_id"] == games_publisher["app_id_1"],
    how='left',
)

In [95]:
# Drop the helper key that came from games_publisher, then preview the joined table.
game_dgp= game_dgp.drop('app_id_1')
game_dgp.show(2)

+------+--------------+----------+--------------+
|app_id|gamesDeveloper|gamesGenre|gamesPublisher|
+------+--------------+----------+--------------+
|    10|         Valve|    Action|         Valve|
|    20|         Valve|    Action|         Valve|
+------+--------------+----------+--------------+
only showing top 2 rows



In [96]:
game_dgp.count()

46404

In [97]:
check_missing(game_dgp)

+------+--------------+----------+--------------+
|app_id|gamesDeveloper|gamesGenre|gamesPublisher|
+------+--------------+----------+--------------+
|     0|             0|       580|          6102|
+------+--------------+----------+--------------+



In [98]:
# Save game_dgp table as CSV with a header row.
# The table is written as-is: gamesGenre and gamesPublisher still contain the
# nulls reported by check_missing(game_dgp) above.
game_dgp.write.csv('/user/tamng/jwht/CleanData/game_dgp.csv', header = True)

In [149]:
# Attach a display label to each DataFrame as a plain Python attribute;
# it is read back as tab.name when the per-table report is printed.
# NOTE(review): the attribute lives on this Python object only - presumably any
# transformation that returns a new DataFrame will not carry it over; confirm
# before relying on .name elsewhere.
game2_df.name = 'game2_df'
app_id_info.name = 'app_id_info'
game_dgp.name = 'game_dgp'
friends.name = 'friends'
groups.name = 'groups'

In [156]:
# In case we want to print out the basic information of all tables.
# basic_info() prints its report itself and returns nothing, so it is called
# directly; wrapping it in print() only added a stray "None" after each report.
tables = [game2_df, app_id_info, game_dgp, friends, groups]
for tab in tables:
    print('TABLE NAME:', tab.name)
    basic_info(tab)
    print('\n')

TABLE NAME: game2_df
TOTAL ROWS: 100000000


*-------------*-------------*-------------*-------------*-------------


MISSING VALUE:
+--------+------+---------------+----------------+-------------+
|steam_id|app_id|playtime_2weeks|playtime_forever|dateretrieved|
+--------+------+---------------+----------------+-------------+
|       0|     0|              0|               0|            0|
+--------+------+---------------+----------------+-------------+

*-------------*-------------*-------------*-------------*-------------


PRINT OUT THE 1st 3 LINES:
+-----------------+------+---------------+----------------+-------------------+
|         steam_id|app_id|playtime_2weeks|playtime_forever|      dateretrieved|
+-----------------+------+---------------+----------------+-------------------+
|76561197960265729|    10|              0|               0|2014-08-14 14:04:18|
|76561197960265729|    20|              0|               0|2014-08-14 14:04:18|
|76561197960265729|    30|              0

In [271]:
# Load the app-info table that also carries a positiveReviewPercent column
# (the file has a header row; column types are inferred).
positiveReviewPercent = spark.read.csv('/user/tamng/jwht/SteamData/positiveReviewPercent.csv',\
                             inferSchema = True, header = True)

In [272]:
positiveReviewPercent.show(3, truncate = True)

+------+--------------------+----+-----+--------------+------+--------------+-------------+---------------------+
|app_id|               title|type|price|   releaseDate|rating|ageRequirement|isMultiplayer|positiveReviewPercent|
+------+--------------------+----+-----+--------------+------+--------------+-------------+---------------------+
|    10|      Counter-Strike|game| 9.99|11/1/2000 0:00|    88|             0|            1|                   96|
|    20|Team Fortress Cla...|game| 4.99| 4/1/1999 0:00|    -1|             0|            1|                   82|
|    30|       Day of Defeat|game| 4.99| 5/1/2003 0:00|    79|             0|            1|                   86|
+------+--------------------+----+-----+--------------+------+--------------+-------------+---------------------+
only showing top 3 rows



In [273]:
check_missing(positiveReviewPercent)

+------+-----+----+-----+-----------+------+--------------+-------------+---------------------+
|app_id|title|type|price|releaseDate|rating|ageRequirement|isMultiplayer|positiveReviewPercent|
+------+-----+----+-----+-----------+------+--------------+-------------+---------------------+
|     0|    0|   0|    0|          0|     0|             0|            0|                    0|
+------+-----+----+-----+-----------+------+--------------+-------------+---------------------+



_Replace 'No review' with 999_

In [282]:
# Swap the textual placeholder 'No review' for the sentinel 999; every other
# value is kept unchanged. The column is not cast here - the later
# printSchema() cell still reports it as string.
positiveReviewPercent = positiveReviewPercent.withColumn(
    "positiveReviewPercent",
    when(col("positiveReviewPercent") == 'No review', 999)
    .otherwise(col("positiveReviewPercent")),
)

In [283]:
positiveReviewPercent.filter(col("rating") == -1).show(30)

+------+--------------------+----+-----+---------------+------+--------------+-------------+---------------------+
|app_id|               title|type|price|    releaseDate|rating|ageRequirement|isMultiplayer|positiveReviewPercent|
+------+--------------------+----+-----+---------------+------+--------------+-------------+---------------------+
|    20|Team Fortress Cla...|game| 4.99|  4/1/1999 0:00|    -1|             0|            1|                   82|
|    40|  Deathmatch Classic|game| 4.99|  6/1/2001 0:00|    -1|             0|            1|                   79|
|    50|Half-Life: Opposi...|game| 4.99| 11/1/1999 0:00|    -1|             0|            1|                   95|
|    60|            Ricochet|game| 4.99| 11/1/2000 0:00|    -1|             0|            1|                   78|
|   660|Portal 2 Sixense ...| dlc|    0|  1/1/1970 0:00|    -1|             0|            0|                  999|
|  3302|Bejeweled 2 Delux...|demo|    0|  1/1/1970 0:00|    -1|             0|  

In [284]:
positiveReviewPercent.filter(col("positiveReviewPercent") == 999).count()

8564

In [281]:
positiveReviewPercent.count()

17783

In [285]:
# Save the app-info table with the cleaned positiveReviewPercent column (CSV with a header row).
# NOTE(review): 'app_if_info' in the output path looks like a typo for 'app_id_info';
# confirm what downstream notebooks read before renaming the path.
positiveReviewPercent.write.csv('/user/tamng/jwht/CleanData/app_if_info_PosReview.csv', header = True)

In [288]:
positiveReviewPercent.printSchema()

root
 |-- app_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- ageRequirement: integer (nullable = true)
 |-- isMultiplayer: integer (nullable = true)
 |-- positiveReviewPercent: string (nullable = true)

