## Tutorial-1

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark session that the cells below read data through.
spark = (
    SparkSession.builder
    .appName('Intro')
    .getOrCreate()
)

In [3]:
# Display the session object to confirm it was created.
spark

In [4]:
# Read the CSV with default options: no header handling, so the columns are
# auto-named _c0.._c3 and the file's header line ends up as the first data row.
df = spark.read.csv('/content/test1.csv')

In [5]:
# A bare DataFrame expression shows only column names and types, not the rows.
df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string]

In [6]:
# show() prints the rows as a text table; without the header option the real
# column names (Name, age, ...) appear as an ordinary data row.
df.show()

+---------+---+----------+------+
|      _c0|_c1|       _c2|   _c3|
+---------+---+----------+------+
|     Name|age|Experience|Salary|
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [8]:
# Treat the first line of the file as column names instead of data.
df = (
    spark.read
    .option('header', 'true')
    .csv('/content/test1.csv')
)
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [9]:
# Inspect the Python type of the object returned by the reader.
type(df)

In [12]:
# Without schema inference every column, including the numeric ones,
# is read as a nullable string.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



## Tutorial-2

In [13]:
from pyspark.sql import SparkSession

In [14]:
# NOTE(review): getOrCreate() returns an already-running session when one
# exists. If the 'Intro' session from Tutorial-1 is still active in this
# kernel, it is presumably reused and the 'Part-2' app name may not take
# effect - confirm by inspecting `spark` below.
spark = SparkSession.builder.appName('Part-2').getOrCreate()

In [15]:
# Display the session object to inspect it.
spark

In [18]:
# Read with the header row and let Spark infer column types; both settings
# are passed through option() so the reader configuration is in one style.
df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('/content/test1.csv')
)
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [19]:
# With inferSchema enabled, age/Experience/Salary are integers; Name stays a string.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [20]:
# Same read expressed with csv() keyword arguments instead of option() calls.
df = spark.read.csv(
    '/content/test1.csv',
    header=True,
    inferSchema=True,
)
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [21]:
# Verify the inferred types: one string column and three integer columns.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [22]:
# Inspect the Python type of the object returned by the reader.
type(df)

In [23]:
# Column names as a plain Python list of strings.
df.columns

['Name', 'age', 'Experience', 'Salary']

In [26]:
# head(n) returns the first n rows as a list of Row objects, not as a DataFrame.
df.head(3)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [27]:
# select() returns a DataFrame, so the bare expression shows only its schema.
df.select('Name')

DataFrame[Name: string]

In [28]:
# Chain show() to print the rows of the selected column.
df.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [30]:
# Project two columns; select() takes the names as separate arguments too.
df.select('Name', 'Experience')

DataFrame[Name: string, Experience: int]

In [31]:
# Print the rows of the two-column projection.
df.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [32]:
# Indexing by name yields a Column object (an expression), not a DataFrame.
df['Name']

Column<'Name'>

In [33]:
# (column name, type name) pairs as a list of tuples.
df.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [34]:
# describe() returns a DataFrame of summary statistics; note that every
# column in the result, including the numeric ones, is typed as string.
df.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

In [35]:
# Prints count/mean/stddev/min/max per column. mean and stddev are NULL for
# the string column Name, while its min/max are still reported.
df.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  NULL|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  NULL| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [38]:
# Derive a new column from an existing one; the result is bound to df_new.
df_new = df.withColumn(
    'Experience After 2yrs',
    df['Experience'] + 2,
)

In [39]:
# The new column holds Experience + 2 for every row.
df_new.show()

+---------+---+----------+------+---------------------+
|     Name|age|Experience|Salary|Experience After 2yrs|
+---------+---+----------+------+---------------------+
|    Krish| 31|        10| 30000|                   12|
|Sudhanshu| 30|         8| 25000|                   10|
|    Sunny| 29|         4| 20000|                    6|
|     Paul| 24|         3| 20000|                    5|
|   Harsha| 21|         1| 15000|                    3|
|  Shubham| 23|         2| 18000|                    4|
+---------+---+----------+------+---------------------+



In [42]:
# drop() does not modify the DataFrame it is called on, so the result must be
# assigned to a variable to keep the change.
# (In pandas the same effect is available through the `inplace` parameter.)
df_new = df_new.drop('Experience After 2yrs')

In [43]:
# The added column is gone; df_new is back to the original four columns.
df_new.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [45]:
# Rename a single column ('Name' -> 'New Name'). The renamed DataFrame is
# only displayed here; it is not assigned to a variable.
df.withColumnRenamed('Name', 'New Name').show()

+---------+---+----------+------+
| New Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## Tutorial-3