### In this tutorial, you will learn how to:
- Start a Spark session
- Read a dataset
- Select, add, remove, and rename columns
    

In [1]:
from pyspark.sql import SparkSession



In [2]:
# Initialize the Spark session for this tutorial
spark = (
    SparkSession.builder
    .appName('Tutorial')
    .getOrCreate()
)
spark

In [3]:
# Read the dataset from test.csv, using the first row as the header
# and letting Spark infer each column's type
df_pyspark = spark.read.csv(
    'test.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Display the DataFrame object itself (its repr lists column names and types)
df_pyspark

DataFrame[Name: string, Age: int, Experience: int]

In [5]:
# Display the dataset's rows as a text table
df_pyspark.show()

+---------------+---+----------+
|           Name|Age|Experience|
+---------------+---+----------+
|Muhammad Waleed| 29|         3|
|  Shahroz Ahmad| 31|         2|
|   Zia Ul Islam| 30|         1|
+---------------+---+----------+



In [6]:
# Show the first row (returned as a Row object)
df_pyspark.head()

Row(Name='Muhammad Waleed', Age=29, Experience=3)

In [7]:
# Print the schema of the dataset: column names, types, and nullability
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [8]:
# List the column names in the dataset
df_pyspark.columns

['Name', 'Age', 'Experience']

## Data Preprocessing

In [9]:
# Select a single column and display it
df_pyspark.select('Age').show()

+---+
|Age|
+---+
| 29|
| 31|
| 30|
+---+



In [10]:
# Select multiple columns and display them
df_pyspark.select('Name', 'Experience').show()

+---------------+----------+
|           Name|Experience|
+---------------+----------+
|Muhammad Waleed|         3|
|  Shahroz Ahmad|         2|
|   Zia Ul Islam|         1|
+---------------+----------+



In [11]:
# Show each column's data type as (column name, type) pairs
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [12]:
# Display summary statistics (count, mean, stddev, min, max) for each column
df_pyspark.describe().show()

+-------+---------------+----+----------+
|summary|           Name| Age|Experience|
+-------+---------------+----+----------+
|  count|              3|   3|         3|
|   mean|           NULL|30.0|       2.0|
| stddev|           NULL| 1.0|       1.0|
|    min|Muhammad Waleed|  29|         1|
|    max|   Zia Ul Islam|  31|         3|
+-------+---------------+----+----------+



In [13]:
# Add a 'Current Experience' column holding each row's Experience plus 2
df_pyspark = df_pyspark.withColumn(
    'Current Experience',
    df_pyspark['Experience'] + 2,
)
df_pyspark.show()

+---------------+---+----------+------------------+
|           Name|Age|Experience|Current Experience|
+---------------+---+----------+------------------+
|Muhammad Waleed| 29|         3|                 5|
|  Shahroz Ahmad| 31|         2|                 4|
|   Zia Ul Islam| 30|         1|                 3|
+---------------+---+----------+------------------+



In [14]:
# Drop the 'Current Experience' column, reassigning the result to df_pyspark
df_pyspark = df_pyspark.drop('Current Experience')
df_pyspark.show()

+---------------+---+----------+
|           Name|Age|Experience|
+---------------+---+----------+
|Muhammad Waleed| 29|         3|
|  Shahroz Ahmad| 31|         2|
|   Zia Ul Islam| 30|         1|
+---------------+---+----------+



In [15]:
# Rename the 'Name' column to 'Employee Name', reassigning the result to df_pyspark
df_pyspark = df_pyspark.withColumnRenamed('Name', 'Employee Name')
df_pyspark.show()

+---------------+---+----------+
|  Employee Name|Age|Experience|
+---------------+---+----------+
|Muhammad Waleed| 29|         3|
|  Shahroz Ahmad| 31|         2|
|   Zia Ul Islam| 30|         1|
+---------------+---+----------+

