In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import requests

In [8]:
# Download CSV file

def download_file(url, filename, timeout=30):
    """Download ``url`` with an HTTP GET and save the response body to ``filename``.

    Request failures are not raised: each ``requests`` error is caught and
    reported with a printed message. The function always returns ``None``.

    Args:
        url: Address to fetch.
        filename: Path of the local file, opened in binary write mode
            (an existing file is overwritten).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # The file is only opened once the full response has been received
        # and its status checked, so a failed request never truncates it.
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"File downloaded successfully and saved as {filename}")
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
    except requests.exceptions.Timeout as errt:
        # Checked before ConnectionError: a connect timeout is both a Timeout
        # and a ConnectionError, and should be reported as a timeout.
        print(f"Timeout Error: {errt}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.RequestException as err:
        print(f"OOps: Something Else: {err}")

In [9]:
# Google Drive download link for the CSV used in this notebook
url = 'https://drive.google.com/uc?id=1phaHg9objxK2MwaZmSUZAKQ8kVqlgng4&export=download'
# Local file the downloaded bytes are written to
filename = 'data.csv'

download_file(url, filename)

File downloaded successfully and saved as data.csv


In [10]:
from pyspark.sql import SparkSession

In [11]:
# Get the existing Spark session, or create one, with the application name 'DataFrame'
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)
spark  # last expression: display the session

In [13]:
# Load the CSV, treating its first row as the column names
df_pyspark = spark.read.csv('data.csv', header=True)
df_pyspark.show(n=5)

+-----+---------------+----------+---------+------+--------------------+--------------------+-------------+------------------+
|Index|        User Id|First Name|Last Name|   Sex|               Email|               Phone|Date of birth|         Job Title|
+-----+---------------+----------+---------+------+--------------------+--------------------+-------------+------------------+
|    1|88F7B33d2bcf9f5|    Shelby|  Terrell|  Male|elijah57@example.net|001-084-906-7849x...|   1945-10-26|   Games developer|
|    2|f90cD3E76f1A9b9|   Phillip|  Summers|Female|bethany14@example...|   214.112.6044x4913|   1910-03-24|    Phytotherapist|
|    3|DbeAb8CcdfeFC2c|  Kristine|   Travis|  Male|bthompson@example...|        277.609.7938|   1992-07-02|         Homeopath|
|    4|A31Bee3c201ef58|   Yesenia| Martinez|  Male|kaitlinkaiser@exa...|        584.094.6111|   2017-08-03| Market researcher|
|    5|1bA7A3dc874da3c|      Lori|     Todd|  Male|buchananmanuel@ex...|   689-207-3558x7233|   1938-12-01|Vete

In [14]:
# Print each column's name, type and nullability
df_pyspark.printSchema()

root
 |-- Index: string (nullable = true)
 |-- User Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Date of birth: string (nullable = true)
 |-- Job Title: string (nullable = true)



## Dropping Columns

In [18]:
# drop() returns a new DataFrame without the listed columns; df_pyspark itself is not modified
df_pyspark.drop('Sex', 'Phone').show(2)

+-----+---------------+----------+---------+--------------------+-------------+---------------+
|Index|        User Id|First Name|Last Name|               Email|Date of birth|      Job Title|
+-----+---------------+----------+---------+--------------------+-------------+---------------+
|    1|88F7B33d2bcf9f5|    Shelby|  Terrell|elijah57@example.net|   1945-10-26|Games developer|
|    2|f90cD3E76f1A9b9|   Phillip|  Summers|bethany14@example...|   1910-03-24| Phytotherapist|
+-----+---------------+----------+---------+--------------------+-------------+---------------+
only showing top 2 rows



## Deleting entire row with null value in any cell

The `.na.drop()` method removes rows that contain null values. It returns a new DataFrame (the original is left unchanged), so chain `.show()` to see the result.

In [22]:
# With no arguments, a row is dropped if any of its values is null
df_pyspark.na.drop() # that's it

DataFrame[Index: string, User Id: string, First Name: string, Last Name: string, Sex: string, Email: string, Phone: string, Date of birth: string, Job Title: string]

## Understanding .na.drop() Arguments

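Additionally, `.na.drop()` takes 3 optional arguments:
1. `how` — `'any'` or `'all'` (default `'any'`): drop a row if any / all of its values are null
2. `thresh` — minimum number of non-null values a row must have to be kept (when given, it overrides `how`)
3. `subset` — list of column names to check for nulls (default: all columns)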

In [24]:
# how="all": a row is dropped only if ALL of its values are null
df_pyspark.na.drop(how="all")

DataFrame[Index: string, User Id: string, First Name: string, Last Name: string, Sex: string, Email: string, Phone: string, Date of birth: string, Job Title: string]

In [25]:
# thresh=2 keeps only rows with AT LEAST 2 non-null values and drops the rest.
# `how` is not passed because it is ignored whenever thresh is given.
df_pyspark.na.drop(thresh=2)

DataFrame[Index: string, User Id: string, First Name: string, Last Name: string, Sex: string, Email: string, Phone: string, Date of birth: string, Job Title: string]

In [27]:
# Only the columns listed in subset are checked: drop a row if any of them is null
df_pyspark.na.drop(how="any", subset=['First Name'])

DataFrame[Index: string, User Id: string, First Name: string, Last Name: string, Sex: string, Email: string, Phone: string, Date of birth: string, Job Title: string]

## Filling the missing values

In [29]:
# Fill NULL values with 'Not Available'

df_pyspark.na.fill('Not Available')

DataFrame[Index: string, User Id: string, First Name: string, Last Name: string, Sex: string, Email: string, Phone: string, Date of birth: string, Job Title: string]

In [30]:
# Restrict the fill to the 'Sex' and 'Email' columns

df_pyspark.na.fill('Not Available', subset=['Sex', 'Email'])

DataFrame[Index: string, User Id: string, First Name: string, Last Name: string, Sex: string, Email: string, Phone: string, Date of birth: string, Job Title: string]

## Imputer Functions

Instead of a fixed value, we can fill null values with the mean, median, or mode of that column using `Imputer`.

In [33]:
from pyspark.ml.feature import Imputer

In [34]:
# Example usage (kept inside a string so it is not executed: these columns are not in this dataset)

"""
imputer = Imputer(
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=["{}_imputed".format(c) for c in ['age', 'Experience', 'Salary']]
    ).setStrategy("median")
"""

'\nimputer = Imputer(\n    inputCols=[\'age\', \'Experience\', \'Salary\'], \n    outputCols=["{}_imputed".format(c) for c in [\'age\', \'Experience\', \'Salary\']]\n    ).setStrategy("median")\n'