Useful link : https://www.analyticsvidhya.com/blog/2022/05/data-preprocessing-using-pyspark-handling-missing-values/

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("local_spark").getOrCreate()

In [2]:
df = spark.read.csv('calendar.csv', header=True)
df.show()

+----------+----------+---------+------+
|listing_id|      date|available| price|
+----------+----------+---------+------+
|    241032|2016-01-04|        t|$85.00|
|    241032|2016-01-05|        t|$85.00|
|    241032|2016-01-06|        f|  null|
|    241032|2016-01-07|        f|  null|
|    241032|2016-01-08|        f|  null|
|    241032|2016-01-09|        f|  null|
|    241032|2016-01-10|        f|  null|
|    241032|2016-01-11|        f|  null|
|    241032|2016-01-12|        f|  null|
|    241032|2016-01-13|        t|$85.00|
|    241032|2016-01-14|        t|$85.00|
|    241032|2016-01-15|        f|  null|
|    241032|2016-01-16|        f|  null|
|    241032|2016-01-17|        f|  null|
|    241032|2016-01-18|        t|$85.00|
|    241032|2016-01-19|        t|$85.00|
|    241032|2016-01-20|        t|$85.00|
|    241032|2016-01-21|        f|  null|
|    241032|2016-01-22|        f|  null|
|    241032|2016-01-23|        f|  null|
+----------+----------+---------+------+
only showing top

In [3]:
#df = spark.read.format("csv").option("header", "true").load("C:/Users/awedo/SpartaGlobal_Projects/Missing Values/calendar.csv")

IMPORTANT: 
- printSchema() which works in the same way as the “describe” function of pandas.

In [4]:
# nullable = True means that there are some null values in that column
df.printSchema()

root
 |-- listing_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- available: string (nullable = true)
 |-- price: string (nullable = true)



Counting Null values for single columns

In [5]:
df.filter(" available is null").count()

1

In [6]:
df.filter(" price is null").count()

415489

In [7]:
df.filter(" date is null").count()

1

In [8]:
# Couting all the values in the columns 
from pyspark.sql.functions import count, col 
df.select([count(col(i)) for i in df.columns]).show()

+-----------------+-----------+----------------+------------+
|count(listing_id)|count(date)|count(available)|count(price)|
+-----------------+-----------+----------------+------------+
|          1246500|    1246499|         1246499|      831011|
+-----------------+-----------+----------------+------------+



In [9]:
# Counts all the null values with alias
from pyspark.sql.functions import count, col, when
df.select([count(when(col(i).isNull(),i)).alias(i) for i in df.columns]).show()

+----------+----+---------+------+
|listing_id|date|available| price|
+----------+----+---------+------+
|         0|   1|        1|415489|
+----------+----+---------+------+



Dropping the null values in price

In [11]:
df = df.dropna(subset=['price'],how='any')
df.show()

+----------+----------+---------+------+
|listing_id|      date|available| price|
+----------+----------+---------+------+
|    241032|2016-01-04|        t|$85.00|
|    241032|2016-01-05|        t|$85.00|
|    241032|2016-01-13|        t|$85.00|
|    241032|2016-01-14|        t|$85.00|
|    241032|2016-01-18|        t|$85.00|
|    241032|2016-01-19|        t|$85.00|
|    241032|2016-01-20|        t|$85.00|
|    241032|2016-01-24|        t|$85.00|
|    241032|2016-01-25|        t|$85.00|
|    241032|2016-01-26|        t|$85.00|
|    241032|2016-01-27|        t|$85.00|
|    241032|2016-01-28|        t|$85.00|
|    241032|2016-02-01|        t|$85.00|
|    241032|2016-02-02|        t|$85.00|
|    241032|2016-02-03|        t|$85.00|
|    241032|2016-02-04|        t|$85.00|
|    241032|2016-02-05|        t|$85.00|
|    241032|2016-02-06|        t|$85.00|
|    241032|2016-02-07|        t|$85.00|
|    241032|2016-02-08|        t|$85.00|
+----------+----------+---------+------+
only showing top

In [None]:
# drop = df.na.drop(how='any', subset=['price'])
# drop.show()

In [12]:
# Checking whether the null value for price has been dropped
df.filter(" price is null").count()

0

In [13]:
df.filter("date is null").count()

0

In [14]:
df.filter(col('price').isNull()).show()

+----------+----+---------+-----+
|listing_id|date|available|price|
+----------+----+---------+-----+
+----------+----+---------+-----+



#### Importing the listing dataFrame

In [15]:
df = spark.read.csv('listings.csv', header=True)
df.show(vertical=True)

-RECORD 0------------------------------------------------
 id                               | 241032               
 listing_url                      | https://www.airbn... 
 scrape_id                        | 20160104002432       
 last_scraped                     | 2016-01-04           
 name                             | Stylish Queen Ann... 
 summary                          | null                 
 space                            | Make your self at... 
 description                      | Make your self at... 
 experiences_offered              | none                 
 neighborhood_overview            | null                 
 notes                            | null                 
 transit                          | null                 
 thumbnail_url                    | null                 
 medium_url                       | null                 
 picture_url                      | https://a1.muscac... 
 xl_picture_url                   | null                 
 host_id      

In [None]:
#df = spark.read.format("csv").option("header", "true").load("C:/Users/awedo/SpartaGlobal_Projects/Missing Values/listings.csv")

In [16]:
# Counts all the null values with alias
from pyspark.sql.functions import count, col, when
df.select([count(when(col(i).isNull(),i)).alias(i) for i in df.columns]).show(vertical=True)

-RECORD 0--------------------------------
 id                               | 0    
 listing_url                      | 1247 
 scrape_id                        | 1742 
 last_scraped                     | 2038 
 name                             | 2247 
 summary                          | 2629 
 space                            | 3129 
 description                      | 2662 
 experiences_offered              | 2702 
 neighborhood_overview            | 3701 
 notes                            | 4207 
 transit                          | 3670 
 thumbnail_url                    | 3038 
 medium_url                       | 3042 
 picture_url                      | 2810 
 xl_picture_url                   | 3066 
 host_id                          | 2790 
 host_url                         | 2791 
 host_name                        | 2793 
 host_since                       | 2798 
 host_location                    | 2794 
 host_about                       | 3568 
 host_response_time               

In [17]:
df.dtypes

[('id', 'string'),
 ('listing_url', 'string'),
 ('scrape_id', 'string'),
 ('last_scraped', 'string'),
 ('name', 'string'),
 ('summary', 'string'),
 ('space', 'string'),
 ('description', 'string'),
 ('experiences_offered', 'string'),
 ('neighborhood_overview', 'string'),
 ('notes', 'string'),
 ('transit', 'string'),
 ('thumbnail_url', 'string'),
 ('medium_url', 'string'),
 ('picture_url', 'string'),
 ('xl_picture_url', 'string'),
 ('host_id', 'string'),
 ('host_url', 'string'),
 ('host_name', 'string'),
 ('host_since', 'string'),
 ('host_location', 'string'),
 ('host_about', 'string'),
 ('host_response_time', 'string'),
 ('host_response_rate', 'string'),
 ('host_acceptance_rate', 'string'),
 ('host_is_superhost', 'string'),
 ('host_thumbnail_url', 'string'),
 ('host_picture_url', 'string'),
 ('host_neighbourhood', 'string'),
 ('host_listings_count', 'string'),
 ('host_total_listings_count', 'string'),
 ('host_verifications', 'string'),
 ('host_has_profile_pic', 'string'),
 ('host_identi

In [None]:
# cols = ("firstname","middlename","lastname")

# df.drop(*cols) \
#    .printSchema()

In [18]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- name: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- space: string (nullable = true)
 |-- description: string (nullable = true)
 |-- experiences_offered: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- transit: string (nullable = true)
 |-- thumbnail_url: string (nullable = true)
 |-- medium_url: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- xl_picture_url: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate

In [None]:
# Separated the original data frame into 2 groups and assigned them new variable
numerical_value = df.columns[df.dtypes != 'object']
categorical_value = df.columns[df.dtypes == 'object']

In [None]:
print(numerical_value)
print(categorical_value)