# Manipulating PySpark Dataframe
<hr>

In [None]:
# Installation of pyspark package
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 59.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=97c50a02ca710b2500bbc490c47e661bd1c0edd8f2bed17708e77443266ac826
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
# Start a Spark session named "Spark Dataframes", or reuse one that is already running
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Spark Dataframes")
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it
spark

<br>

### Reading the dataset


In [None]:
# Load dataset.csv into a Spark DataFrame: the first row supplies the
# column names and Spark infers each column's type from the data
spark_df = spark.read.csv(
    "dataset.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Print the first 5 rows for a quick look at the data
spark_df.show(n=5)

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses_$|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
|Female | 21|         2|  Home|         No|           No|          No|     No|    No|             No|                  Yes|                  No|               150|
|  Male | 25|         3|Hostel|         No|          Yes|  Motorcycle|     No|    No|            Yes|                  Yes|                 Yes|               220|
|  Male | 23|         2|  Home|        Yes|           No|          No|     No|    No|             No|                   No|                null|               180|
|  Male | 19|   

<br>

### Checking the Datatypes of the columns (Schema)


In [None]:
# Print the schema as a tree: one line per column with its type and nullability
spark_df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Study_year: integer (nullable = true)
 |-- Living: string (nullable = true)
 |-- Scholarship: string (nullable = true)
 |-- Part_time_job: string (nullable = true)
 |-- Transporting: string (nullable = true)
 |-- Smoking: string (nullable = true)
 |-- Drinks: string (nullable = true)
 |-- Games_&_Hobbies: string (nullable = true)
 |-- Cosmetics_&_Self-care: string (nullable = true)
 |-- Monthly_Subscription: string (nullable = true)
 |-- Monthly_expenses_$: integer (nullable = true)



<br>

### Columns selecting & Indexing


In [None]:
# Project only the Age column and print it
# (show() with no argument prints at most the first 20 rows)
(
    spark_df
    .select("Age")
    .show()
)

+---+
|Age|
+---+
| 21|
| 25|
| 23|
| 19|
| 19|
| 22|
| 21|
| 22|
| 18|
| 19|
| 22|
| 18|
| 18|
| 19|
| 22|
| 22|
| 17|
| 19|
| 23|
| 19|
+---+
only showing top 20 rows



In [None]:
# Project several columns at once by passing each column name to select()
(
    spark_df
    .select("Gender", "Age", "Study_year")
    .show()
)

+-------+---+----------+
| Gender|Age|Study_year|
+-------+---+----------+
|Female | 21|         2|
|  Male | 25|         3|
|  Male | 23|         2|
|  Male | 19|         3|
|Female | 19|         2|
|  Male | 22|         3|
|Female | 21|         2|
|  Male | 22|         3|
|Female | 18|         1|
|  Male | 19|         1|
|  Male | 22|         3|
|  Male | 18|         2|
|Female | 18|         2|
|  Male | 19|         2|
|Female | 22|         4|
|  Male | 22|         4|
|Female | 17|         1|
|Female | 19|         2|
|Female | 23|         4|
|Female | 19|         3|
+-------+---+----------+
only showing top 20 rows



<br>

### Checking out Describe method similar to pandas

In [None]:
# Summary statistics (count, mean, stddev, ...) for every column;
# string columns report null for the numeric statistics such as mean
spark_df.describe().show()

+-------+-------+------------------+------------------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
|summary| Gender|               Age|        Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses_$|
+-------+-------+------------------+------------------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
|  count|    105|               105|               101|   103|        105|          100|          91|     94|    92|            105|                   92|                  96|                99|
|   mean|   null| 20.17142857142857|2.6534653465346536|  null|       null|         null|        null|   null|  null|           null|                 null|                null|214.94949494949495|
| stddev|   null|1.898669

<br>

### Adding, Renaming & Dropping columns from dataframe


In [None]:
# Demo of adding a column: 'Age after study year' is Age + Study_year.
# withColumn returns a new DataFrame, so spark_df itself is left unchanged.
new_df = spark_df.withColumn(
    "Age after study year",
    spark_df["Age"] + spark_df["Study_year"],
)
new_df.show()

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+--------------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses_$|Age after study year|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+--------------------+
|Female | 21|         2|  Home|         No|           No|          No|     No|    No|             No|                  Yes|                  No|               150|                  23|
|  Male | 25|         3|Hostel|         No|          Yes|  Motorcycle|     No|    No|            Yes|                  Yes|                 Yes|               220|                  28|
|  Male | 23|         2|  Home|        Yes|           No|          No|     

In [None]:
# Rename the derived column from 'Age after study year' to 'Graduation Age'
new_df = new_df.withColumnRenamed(
    existing="Age after study year",
    new="Graduation Age",
)
new_df.show()

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+--------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses_$|Graduation Age|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+--------------+
|Female | 21|         2|  Home|         No|           No|          No|     No|    No|             No|                  Yes|                  No|               150|            23|
|  Male | 25|         3|Hostel|         No|          Yes|  Motorcycle|     No|    No|            Yes|                  Yes|                 Yes|               220|            28|
|  Male | 23|         2|  Home|        Yes|           No|          No|     No|    No|             No|    

In [None]:
# Remove the 'Graduation Age' column again; drop() returns a new DataFrame,
# so the result is kept under its own name and new_df is left as it was
df_after_drop = new_df.drop("Graduation Age")
df_after_drop.show()

+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
| Gender|Age|Study_year|Living|Scholarship|Part_time_job|Transporting|Smoking|Drinks|Games_&_Hobbies|Cosmetics_&_Self-care|Monthly_Subscription|Monthly_expenses_$|
+-------+---+----------+------+-----------+-------------+------------+-------+------+---------------+---------------------+--------------------+------------------+
|Female | 21|         2|  Home|         No|           No|          No|     No|    No|             No|                  Yes|                  No|               150|
|  Male | 25|         3|Hostel|         No|          Yes|  Motorcycle|     No|    No|            Yes|                  Yes|                 Yes|               220|
|  Male | 23|         2|  Home|        Yes|           No|          No|     No|    No|             No|                   No|                null|               180|
|  Male | 19|   