# Introduction to PySpark
<hr>

In [None]:
# Installation of pyspark package
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=a9b7d76395c9cd5c1d4d6c90b20028af1549a74187a544e6f7907b53e45ff75d
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
import pyspark

In [None]:
# Baseline: load the CSV with pandas first, before reading the same file with Spark
import pandas as pd

pandas_df = pd.read_csv("dataset.csv")
pandas_df.head(n=5)

Unnamed: 0,Gender,Age,Study_year,Living,Scholarship,Part_time_job,Transporting,Smoking,Drinks,Games_&_Hobbies,Cosmetics_&_Self-care,Monthly_Subscription,Monthly_expenses_$
0,Female,21,2.0,Home,No,No,No,No,No,No,Yes,No,150.0
1,Male,25,3.0,Hostel,No,Yes,Motorcycle,No,No,Yes,Yes,Yes,220.0
2,Male,23,2.0,Home,Yes,No,No,No,No,No,No,,180.0
3,Male,19,3.0,Hostel,No,No,Motorcycle,No,No,Yes,Yes,Yes,200.0
4,Female,19,2.0,Home,No,No,Motorcycle,No,No,No,Yes,No,300.0


In [None]:
# Inspect the dtype pandas inferred for each column
# (per the output: Age is int64, Study_year and Monthly_expenses_$ are float64, the rest are object)
pandas_df.dtypes

Gender                    object
Age                        int64
Study_year               float64
Living                    object
Scholarship               object
Part_time_job             object
Transporting              object
Smoking                   object
Drinks                    object
Games_&_Hobbies           object
Cosmetics_&_Self-care     object
Monthly_Subscription      object
Monthly_expenses_$       float64
dtype: object

In [None]:
# Build a Spark session named "Intro" (getOrCreate returns the existing one if present)
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Intro")
    .getOrCreate()
)
spark

In [None]:
# Read the same CSV with PySpark, passing no options
# (header handling and column types are left at Spark's defaults)
spark_df = spark.read.csv("dataset.csv")

In [None]:
# Evaluating the Spark DataFrame displays only its schema (column names and types), not its rows
spark_df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string]

In [None]:
# Print the leading rows of the Spark DataFrame.
# Note in the output that the real header line appears as a data row under
# auto-generated _c0.._c12 column names.
spark_df.show()

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+--------------------+--------------------+------------------+
|    _c0|_c1|       _c2|   _c3|        _c4|          _c5|         _c6|    _c7|   _c8|            _c9|                _c10|                _c11|              _c12|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+--------------------+--------------------+------------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-...|Monthly_Subscription|Monthly_expenses_$|
|Female | 21|         2|  Home|         No|           No|          No|     No|    No|             No|                 Yes|                  No|               150|
|  Male | 25|         3|Hostel|         No|          Yes|  Motorcycle|     No|    No|            Yes|                 Yes|                 Yes|               220|
|  Male | 23|         

In [None]:
# Column types after the option-free read: every column comes back as a string
spark_df.dtypes

[('_c0', 'string'),
 ('_c1', 'string'),
 ('_c2', 'string'),
 ('_c3', 'string'),
 ('_c4', 'string'),
 ('_c5', 'string'),
 ('_c6', 'string'),
 ('_c7', 'string'),
 ('_c8', 'string'),
 ('_c9', 'string'),
 ('_c10', 'string'),
 ('_c11', 'string'),
 ('_c12', 'string')]

In [None]:
# By default Spark neither treats the first line as column names nor infers column
# types, so re-read the file with both options switched on explicitly.
spark_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dataset.csv")
)
spark_df.show()

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses_$|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
|Female | 21|         2|  Home|         No|           No|          No|     No|    No|             No|                  Yes|                  No|               150|
|  Male | 25|         3|Hostel|         No|          Yes|  Motorcycle|     No|    No|            Yes|                  Yes|                 Yes|               220|
|  Male | 23|         2|  Home|        Yes|           No|          No|     No|    No|             No|                   No|                null|               180|
|  Male | 19|   

In [None]:
# Confirm that the columns now carry their real names and inferred types
# (per the output: Age, Study_year and Monthly_expenses_$ are int, the rest are string)
spark_df.dtypes

[('Gender', 'string'),
 ('Age', 'int'),
 ('Study_year', 'int'),
 ('Living', 'string'),
 ('Scholarship', 'string'),
 ('Part_time_job', 'string'),
 ('Transporting', 'string'),
 ('Smoking', 'string'),
 ('Drinks', 'string'),
 ('Games_&_Hobbies', 'string'),
 ('Cosmetics_&_Self-care', 'string'),
 ('Monthly_Subscription', 'string'),
 ('Monthly_expenses_$', 'int')]

### Let's compare the DataFrame object types created by pandas and PySpark

In [None]:
# Class of the DataFrame object created by pandas
type(pandas_df)


pandas.core.frame.DataFrame

In [None]:
# Class of the DataFrame object created by PySpark
type(spark_df)

pyspark.sql.dataframe.DataFrame