# PySpark Example: Data Preparation, Data Cleaning, Aggregation

## findspark

In [1]:
# findspark must be initialised before any `pyspark` import in the notebook.
# NOTE(review): presumably this locates the local Spark install and makes
# pyspark importable — confirm against the findspark documentation.
import findspark

findspark.init()
print('Done')

Done


## import

In [15]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

## Create SparkContext, SparkSession instance

In [3]:
# Start the SparkContext against the 'local' master under the application name 'app'.
sc = SparkContext(master='local', appName='app')

In [4]:
# Use the documented `SparkSession.builder` entry point instead of instantiating
# the nested Builder class by hand. getOrCreate() returns the already-active
# session if one exists, otherwise it creates a new one named 'app'.
spark = SparkSession.builder.appName('app').getOrCreate()

## Design Schema

In [44]:
# Explicit schema for the ratings file; every field is declared nullable.
schema_rate = (
    T.StructType()
    .add('userid', T.StringType(), True)
    .add('accoid', T.StringType(), True)
    .add('rating', T.FloatType(), True)
)

# Explicit schema for the accommodations file; every field is declared nullable.
schema_accommodation = (
    T.StructType()
    .add('id', T.StringType(), True)
    .add('title', T.StringType(), True)
    .add('location', T.StringType(), True)
    .add('price', T.IntegerType(), True)
    .add('rooms', T.IntegerType(), True)
    .add('rating', T.FloatType(), True)
    .add('type', T.StringType(), True)
)

## Read CSV file to Spark DataFrame

In [166]:
# Load the ratings CSV with the explicit schema_rate schema.
# `header` is passed as the boolean False (not the string 'False', which is a
# truthy value in Python and easy to misread), matching the accommodation read.
dfRate = spark.read.csv('recomend/rating.csv', header=False, schema=schema_rate)
print(dfRate.count())
dfRate.show(5)

1187
+------+------+------+
|userid|accoid|rating|
+------+------+------+
|    10|     1|   1.0|
|    18|     1|   2.0|
|    13|     1|   1.0|
|     7|     2|   2.0|
|     4|     2|   2.0|
+------+------+------+
only showing top 5 rows



In [192]:
# Load the accommodations CSV (no header row) using the explicit schema,
# then report how many rows were read.
dfAccos = (
    spark.read
    .schema(schema_accommodation)
    .option('header', False)
    .csv('recomend/accommodation.csv')
)
print(dfAccos.count())

110


## Find Duplicates

In [198]:
# Group on every column so that only fully identical rows share a group, then
# keep the groups that occur more than once.
# Rows are counted with F.count(F.lit(1)) rather than F.count('id'): counting a
# column skips nulls, so duplicated rows whose id is null would be counted as 0
# and slip past the `> 1` filter.
(dfAccos.groupBy(dfAccos.columns)
        .agg(F.count(F.lit(1)).alias('num_duplicates'))
        .where(F.col('num_duplicates') > 1)
        .show())

+---+------------------+---------+-----+-----+------+-------+--------------+
| id|             title| location|price|rooms|rating|   type|num_duplicates|
+---+------------------+---------+-----+-----+------+-------+--------------+
|  1|Comfy Quiet Chalet|Vancouver|   50|    3|   3.1|cottage|             2|
|  2|     Cozy Calm Hut|   London|   65|    2|   4.1|cottage|             3|
| 23|   Homy Calm House|    Paris|   70|    2|   2.0|cottage|             5|
+---+------------------+---------+-----+-----+------+-------+--------------+



## Find nulls

### Get count of null values in pyspark

In [194]:
# One aggregate per column: F.when produces a non-null value only where the
# column is null, and F.count counts only non-null values, so each result is
# the number of nulls in that column.
null_counts = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in dfAccos.columns
]
dfAccos.select(null_counts).show()

+---+-----+--------+-----+-----+------+----+
| id|title|location|price|rooms|rating|type|
+---+-----+--------+-----+-----+------+----+
|  0|    1|       2|    2|    2|     2|   3|
+---+-----+--------+-----+-----+------+----+



### Show rows with null values

In [195]:
# Peek at (up to) five rows whose 'type' column is null.
dfAccos.where(dfAccos['type'].isNull()).show(n=5)

+---+--------------------+--------+-----+-----+------+----+
| id|               title|location|price|rooms|rating|type|
+---+--------------------+--------+-----+-----+------+----+
|  1|                null|    null| null| null|  null|null|
|  3| Agreable Calm Place|  London|   65|    4|   4.8|null|
|  4|Colossal Quiet Ch...|    null| null| null|  null|null|
+---+--------------------+--------+-----+-----+------+----+



## Drop Duplicates

In [204]:
# Row count before de-duplication, plus every row recorded for id '23'.
rows_before_dedup = dfAccos.count()
print('Num Of Rows before Drop Dupliates: ', rows_before_dedup)
dfAccos.filter(F.col('id') == '23').show()

Num Of Rows before Drop Dupliates:  110
+---+---------------+--------+-----+-----+------+-------+
| id|          title|location|price|rooms|rating|   type|
+---+---------------+--------+-----+-----+------+-------+
| 23|Homy Calm House|   Paris|   70|    2|   2.0|cottage|
| 23|Homy Calm House|   Paris|   70|    2|   2.0|cottage|
| 23|Homy Calm House|   Paris|   70|    2|   2.0|cottage|
| 23|Homy Calm House|   Paris|   70|    2|   2.0|cottage|
| 23|Homy Calm House|   Paris|   70|    2|   2.0|cottage|
+---+---------------+--------+-----+-----+------+-------+



In [205]:
# dropDuplicates() without a column subset compares rows on every column.
dfAccos_dropDup = dfAccos.dropDuplicates()
rows_after_dedup = dfAccos_dropDup.count()
print('Num Of Rows After Drop Dupliates: ', rows_after_dedup)
dfAccos_dropDup.filter(F.col('id') == '23').show()

Num Of Rows After Drop Dupliates:  103
+---+---------------+--------+-----+-----+------+-------+
| id|          title|location|price|rooms|rating|   type|
+---+---------------+--------+-----+-----+------+-------+
| 23|Homy Calm House|   Paris|   70|    2|   2.0|cottage|
+---+---------------+--------+-----+-----+------+-------+



### Verify there are no duplicates in the DataFrame

In [207]:
# Re-run the duplicate check on the de-duplicated frame; an empty result means
# no fully identical rows remain.
# Rows are counted with F.count(F.lit(1)) rather than F.count('id'): counting a
# column skips nulls, so duplicated rows whose id is null would be counted as 0
# and the verification would pass wrongly.
(dfAccos_dropDup.groupBy(dfAccos_dropDup.columns)
                .agg(F.count(F.lit(1)).alias('num_duplicates'))
                .where(F.col('num_duplicates') > 1)
                .show())

+---+-----+--------+-----+-----+------+----+--------------+
| id|title|location|price|rooms|rating|type|num_duplicates|
+---+-----+--------+-----+-----+------+----+--------------+
+---+-----+--------+-----+-----+------+----+--------------+



## Drop rows with nulls in some columns

In [209]:
# Baseline row count, plus a peek at (up to) five rows whose 'type' is null.
rows_before_null_drop = dfAccos_dropDup.count()
print('Num Of Rows Before Drop rows with nulls: ', rows_before_null_drop)
dfAccos_dropDup.where(dfAccos_dropDup['type'].isNull()).show(n=5)

Num Of Rows Before Drop rows with nulls:  103
+---+--------------------+--------+-----+-----+------+----+
| id|               title|location|price|rooms|rating|type|
+---+--------------------+--------+-----+-----+------+----+
|  1|                null|    null| null| null|  null|null|
|  3| Agreable Calm Place|  London|   65|    4|   4.8|null|
|  4|Colossal Quiet Ch...|    null| null| null|  null|null|
+---+--------------------+--------+-----+-----+------+----+



### Drop rows if a specific column value is null

In [221]:
# Keep only rows that have a title. Nulls in other columns are left untouched,
# so the rows displayed below can still have a null 'type'.
dfAccos_drop_some_nulls = dfAccos_dropDup.where(F.col('title').isNotNull())

rows_with_title = dfAccos_drop_some_nulls.count()
print('Num Of Rows After Drop rows for specific column with null: ', rows_with_title)
dfAccos_drop_some_nulls.where(F.col('type').isNull()).show(5, truncate=False)

# The row whose 'title' was null is gone from the output.

Num Of Rows After Drop rows for specific column with null:  102
+---+----------------------+--------+-----+-----+------+----+
|id |title                 |location|price|rooms|rating|type|
+---+----------------------+--------+-----+-----+------+----+
|3  |Agreable Calm Place   |London  |65   |4    |4.8   |null|
|4  |Colossal Quiet Chateau|null    |null |null |null  |null|
+---+----------------------+--------+-----+-----+------+----+



### Drop rows that have a null in any column

In [210]:
# Discard every row that has a null in at least one column.
dfAccos_clean = dfAccos_dropDup.dropna(how='any')

rows_without_nulls = dfAccos_clean.count()
print('Num Of Rows After Drop rows with any nulls: ', rows_without_nulls)
dfAccos_clean.where(dfAccos_clean['type'].isNull()).show(n=5)

Num Of Rows After Drop rows with any nulls:  100
+---+-----+--------+-----+-----+------+----+
| id|title|location|price|rooms|rating|type|
+---+-----+--------+-----+-----+------+----+
+---+-----+--------+-----+-----+------+----+



### Verify there are no null values in the DataFrame

In [212]:
# Per-column null count on the cleaned frame; every value should be 0.
# F.when is non-null only where the column is null, and F.count counts only
# non-null values.
remaining_nulls = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in dfAccos_clean.columns
]
dfAccos_clean.select(remaining_nulls).show()

+---+-----+--------+-----+-----+------+----+
| id|title|location|price|rooms|rating|type|
+---+-----+--------+-----+-----+------+----+
|  0|    0|       0|    0|    0|     0|   0|
+---+-----+--------+-----+-----+------+----+



## Aggregations

### Total number of ratings, number of distinct users who rated, and the best, worst, and average rating

In [224]:
# Whole-table summary of the ratings. num_ratings counts rows with a non-null
# userid; num_users counts the distinct userid values.
rating_summary = [
    F.count('userid').alias('num_ratings'),
    F.countDistinct('userid').alias('num_users'),
    F.max('rating').alias('best_rating'),
    F.min('rating').alias('worst_rating'),
    F.avg('rating').alias('avg_rating'),
]
dfRate.agg(*rating_summary).show()

+-----------+---------+-----------+------------+------------------+
|num_ratings|num_users|best_rating|worst_rating|        avg_rating|
+-----------+---------+-----------+------------+------------------+
|       1187|       25|        5.0|         1.0|2.4667228306655433|
+-----------+---------+-----------+------------+------------------+



In [225]:
# Display the column names of the cleaned accommodations frame.
dfAccos_clean.columns

['id', 'title', 'location', 'price', 'rooms', 'rating', 'type']

### The five accommodations with the highest rating

In [274]:
# Order by rating, highest first, and display the first five rows.
# Rows with equal ratings have no secondary sort key, so their relative order
# is not pinned by this query.
dfAccos_clean.orderBy(F.col('rating').desc()).show(n=5)

+---+--------------------+--------+-----+-----+------+-------+
| id|               title|location|price|rooms|rating|   type|
+---+--------------------+--------+-----+-----+------+-------+
| 10|Sizable Calm Coun...|Auckland|  650|    9|   4.9|mansion|
| 21|  Big Peaceful Cabin| Seattle|   80|    2|   4.9|cottage|
| 85|  Nice Private Shack|Auckland|   55|    1|   4.9|cottage|
| 33| Pleasant Calm Place|   Tokyo|   30|    2|   4.8|  house|
|  3| Agreable Calm Place|  London|   65|    4|   4.8|  house|
+---+--------------------+--------+-----+-----+------+-------+
only showing top 5 rows



### Find the accommodations a specific user has, and has not, rated before

### Total number of accommodations

In [266]:
# Number of rows in the cleaned accommodations frame.
dfAccos_clean.count()

100

### Number of accommodations rated by a specific user

In [267]:
USER_ID = '1'

# Collect to the driver the ids of the accommodations that USER_ID has rated.
# The column is selected by name ('accoid') rather than by position, so the
# result does not depend on the column order of the ratings schema.
# NOTE: despite the "df" prefix this is a plain Python list of accoid strings;
# the name is kept because later cells refer to it.
dfUserRatings = [
    row['accoid']
    for row in dfRate.filter(F.col('userid') == USER_ID).select('accoid').collect()
]

# Report the user id from USER_ID instead of a hard-coded '1', so the message
# stays correct when USER_ID is changed.
print("The num of ratings for userid = '{}': ".format(USER_ID), len(dfUserRatings))

The num of ratings for userid = '1':  46


In [271]:
# Display the collected list of accommodation ids rated by USER_ID.
dfUserRatings

['2',
 '4',
 '5',
 '7',
 '8',
 '11',
 '14',
 '19',
 '23',
 '24',
 '25',
 '27',
 '34',
 '36',
 '37',
 '40',
 '42',
 '44',
 '45',
 '48',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '58',
 '60',
 '62',
 '67',
 '68',
 '71',
 '73',
 '77',
 '80',
 '81',
 '82',
 '87',
 '88',
 '89',
 '91',
 '92',
 '94',
 '95',
 '97',
 '98']

### Number of accommodations never rated by a specific user

In [268]:
# Build the lookup once as a set, so each membership test inside the filter is
# a hash lookup instead of a scan over the whole list of rated ids per row.
ratedAccoIds = set(dfUserRatings)

# Keep the accommodations whose id is not among the rated ones and pair each
# remaining id with USER_ID, giving an RDD of (userid, accoid) tuples.
pairsPotential = dfAccos_clean.rdd.filter(lambda row: row['id'] not in ratedAccoIds)\
                                  .map(lambda row: (USER_ID, row['id']))

print("The num of accommodations USER_ID never rated: ", pairsPotential.count())

The num of accommodations USER_ID never rated:  54


In [270]:
# Bring every (USER_ID, accommodation id) pair back to the driver for display.
pairsPotential.collect()

[('1', '1'),
 ('1', '3'),
 ('1', '6'),
 ('1', '9'),
 ('1', '10'),
 ('1', '12'),
 ('1', '13'),
 ('1', '15'),
 ('1', '16'),
 ('1', '17'),
 ('1', '18'),
 ('1', '20'),
 ('1', '21'),
 ('1', '22'),
 ('1', '26'),
 ('1', '28'),
 ('1', '29'),
 ('1', '30'),
 ('1', '31'),
 ('1', '32'),
 ('1', '33'),
 ('1', '35'),
 ('1', '38'),
 ('1', '39'),
 ('1', '41'),
 ('1', '43'),
 ('1', '46'),
 ('1', '47'),
 ('1', '49'),
 ('1', '50'),
 ('1', '57'),
 ('1', '59'),
 ('1', '61'),
 ('1', '63'),
 ('1', '64'),
 ('1', '65'),
 ('1', '66'),
 ('1', '69'),
 ('1', '70'),
 ('1', '72'),
 ('1', '74'),
 ('1', '75'),
 ('1', '76'),
 ('1', '78'),
 ('1', '79'),
 ('1', '83'),
 ('1', '84'),
 ('1', '85'),
 ('1', '86'),
 ('1', '90'),
 ('1', '93'),
 ('1', '96'),
 ('1', '99'),
 ('1', '100')]