# Basics of PySpark compared with Pandas

In [136]:
from pyspark.sql import SparkSession
import pandas as pd

In [137]:
# Build the SparkSession used throughout this notebook, registered under the app name 'Practice'.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [138]:
# Evaluate the session object as the last expression so the notebook displays it.
spark

In [139]:
# Read the CSV with no reader options set: the columns come back as _c0.._c3,
# all typed as string, and the header line is treated as an ordinary data row.
df_spark = spark.read.csv(path='Department_Dataset.csv')

In [140]:
# Load the same file with pandas so the two libraries can be compared side by side.
df_pandas = pd.read_csv(filepath_or_buffer='Department_Dataset.csv')

In [141]:
# The repr of a Spark DataFrame lists only column names and types; no rows are shown.
df_spark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string]

In [142]:
# Print the first five rows; without the header option the CSV header line shows up as data.
df_spark.show(n=5)

+---+---------+---------+---------------+
|_c0|      _c1|      _c2|            _c3|
+---+---------+---------+---------------+
| ID|Dept_name| location|travel_required|
|  1|       HR|     Pune|            yes|
|  2|  Finance|Bangalore|             no|
|  3|  Finance|Bangalore|             no|
|  4|  Finance|     Pune|             no|
+---+---------+---------+---------------+
only showing top 5 rows



### To read first row as header

In [143]:
# pandas used the first CSV row as column names without being asked to.
df_pandas.head(n=5)

Unnamed: 0,ID,Dept_name,location,travel_required
0,1,HR,Pune,yes
1,2,Finance,Bangalore,no
2,3,Finance,Bangalore,no
3,4,Finance,Pune,no
4,5,Tech,Mumbai,no


In [144]:
# header=True makes Spark use the first CSV row as column names instead of data.
df_spark = spark.read.csv('Department_Dataset.csv', header=True)
# Show the resulting schema summary as the cell output.
df_spark

DataFrame[ID: string, Dept_name: string, location: string, travel_required: string]

In [145]:
# Print the first five rows now that the header row is used for column names.
df_spark.show(n=5)

+---+---------+---------+---------------+
| ID|Dept_name| location|travel_required|
+---+---------+---------+---------------+
|  1|       HR|     Pune|            yes|
|  2|  Finance|Bangalore|             no|
|  3|  Finance|Bangalore|             no|
|  4|  Finance|     Pune|             no|
|  5|     Tech|   Mumbai|             no|
+---+---------+---------+---------------+
only showing top 5 rows



### To read more information on dataset

In [146]:
# Print the pandas summary: index range, per-column non-null counts, dtypes and memory usage.
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               35 non-null     int64 
 1   Dept_name        35 non-null     object
 2   location         35 non-null     object
 3   travel_required  35 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.2+ KB


In [147]:
# Print the Spark schema tree (column name, type, nullability); every column is still a string here.
df_spark.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Dept_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- travel_required: string (nullable = true)



By default, Spark reads every column as a string because the `inferSchema` parameter defaults to `False`. Set `inferSchema=True` and Spark will infer each column's data type from the data automatically.

# Basic Operations on DataFrames

In [148]:
# Re-read the CSV using the first row as the header and letting Spark infer column types.
# The header could equally be set by chaining .option('header', 'true') before .csv(...).
df_spark = spark.read.csv(
    'Department_Dataset.csv',
    header=True,
    inferSchema=True,
)

In [149]:
# Print the schema again to confirm that ID is now read as an integer rather than a string.
df_spark.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Dept_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- travel_required: string (nullable = true)



In [150]:
# Preview the first five rows of each frame, one after the other.
print('-------------PySpark-------------')
df_spark.show(n=5)  # show() prints the table itself
print('-------------Pandas-------------')
df_pandas.head(n=5)  # last expression, so the notebook renders it as the cell output

-------------PySpark-------------
+---+---------+---------+---------------+
| ID|Dept_name| location|travel_required|
+---+---------+---------+---------------+
|  1|       HR|     Pune|            yes|
|  2|  Finance|Bangalore|             no|
|  3|  Finance|Bangalore|             no|
|  4|  Finance|     Pune|             no|
|  5|     Tech|   Mumbai|             no|
+---+---------+---------+---------------+
only showing top 5 rows

-------------Pandas-------------


Unnamed: 0,ID,Dept_name,location,travel_required
0,1,HR,Pune,yes
1,2,Finance,Bangalore,no
2,3,Finance,Bangalore,no
3,4,Finance,Pune,no
4,5,Tech,Mumbai,no


In [151]:
# Compare how each library reports structure: Spark's schema tree vs. pandas' info() summary.
print('-------------PySpark-------------')
df_spark.printSchema()
print('-------------Pandas-------------')
df_pandas.info()

-------------PySpark-------------
root
 |-- ID: integer (nullable = true)
 |-- Dept_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- travel_required: string (nullable = true)

-------------Pandas-------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               35 non-null     int64 
 1   Dept_name        35 non-null     object
 2   location         35 non-null     object
 3   travel_required  35 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.2+ KB


In [152]:
# Compare column listings: Spark prints a plain Python list, pandas prints an Index object.
print('-------------PySpark-------------')
print(df_spark.columns)
print('-------------Pandas-------------')
print(df_pandas.columns)

-------------PySpark-------------
['ID', 'Dept_name', 'location', 'travel_required']
-------------Pandas-------------
Index(['ID', 'Dept_name', 'location', 'travel_required'], dtype='object')


In [153]:
# Compare dtype listings: Spark prints a list of (column, type) tuples, pandas prints a Series of dtypes.
print('-------------PySpark-------------')
print(df_spark.dtypes)
print('-------------Pandas-------------')
print(df_pandas.dtypes)

-------------PySpark-------------
[('ID', 'int'), ('Dept_name', 'string'), ('location', 'string'), ('travel_required', 'string')]
-------------Pandas-------------
ID                  int64
Dept_name          object
location           object
travel_required    object
dtype: object


In [154]:
# Compare summary statistics from both libraries.
print('-------------PySpark-------------')
# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the table.
df_spark.describe().show()
print('-------------Pandas-------------')
# By default pandas summarises only the numeric column (ID) ...
print(df_pandas.describe())
# ... while include='all' also reports count/unique/top for the non-numeric columns.
print(df_pandas.describe(include='all'))

-------------PySpark-------------
+-------+------------------+---------+---------+---------------+
|summary|                ID|Dept_name| location|travel_required|
+-------+------------------+---------+---------+---------------+
|  count|                35|       35|       35|             35|
|   mean|              18.0|     null|     null|           null|
| stddev|10.246950765959598|     null|     null|           null|
|    min|                 1|  Finance|Bangalore|             no|
|    max|                35|     Tech|     Pune|            yes|
+-------+------------------+---------+---------+---------------+

None
-------------Pandas-------------
              ID
count  35.000000
mean   18.000000
std    10.246951
min     1.000000
25%     9.500000
50%    18.000000
75%    26.500000
max    35.000000
               ID Dept_name   location travel_required
count   35.000000        35         35              35
unique        NaN         3          3               2
top           NaN      T

In [155]:
# Select a single column in each library and preview its first five values.
print('-------------PySpark-------------')
# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the table.
df_spark.select('Dept_name').show(5)
print('-------------Pandas-------------')
print(df_pandas['Dept_name'].head(5))

-------------PySpark-------------
+---------+
|Dept_name|
+---------+
|       HR|
|  Finance|
|  Finance|
|  Finance|
|     Tech|
+---------+
only showing top 5 rows

None
-------------Pandas-------------
0         HR
1    Finance
2    Finance
3    Finance
4       Tech
Name: Dept_name, dtype: object


### Adding a column to the DataFrame

In [162]:
# Derive a new column ID_new = ID * 100 in each frame.
# withColumn returns a DataFrame, which is rebound to df_spark.
df_spark = df_spark.withColumn('ID_new', df_spark.ID * 100)
# pandas attaches the new column to the existing frame by assignment.
df_pandas['ID_new'] = df_pandas['ID'].mul(100)

In [163]:
# Preview both frames to confirm the ID_new column was added.
print('-------------PySpark-------------')
# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the table.
df_spark.show(5)
print('-------------Pandas-------------')
print(df_pandas.head(5))

-------------PySpark-------------
+---+---------+---------+---------------+------+
| ID|Dept_name| location|travel_required|ID_new|
+---+---------+---------+---------------+------+
|  1|       HR|     Pune|            yes|   100|
|  2|  Finance|Bangalore|             no|   200|
|  3|  Finance|Bangalore|             no|   300|
|  4|  Finance|     Pune|             no|   400|
|  5|     Tech|   Mumbai|             no|   500|
+---+---------+---------+---------------+------+
only showing top 5 rows

None
-------------Pandas-------------
   ID Dept_name   location travel_required  ID_new
0   1        HR       Pune             yes     100
1   2   Finance  Bangalore              no     200
2   3   Finance  Bangalore              no     300
3   4   Finance       Pune              no     400
4   5      Tech     Mumbai              no     500


### Dropping the columns

In [164]:
# Remove the ID_new column again; the result of each drop call is rebound to the same name.
df_spark = df_spark.drop('ID_new')
df_pandas = df_pandas.drop(columns=['ID_new'])

In [165]:
# Preview both frames to confirm the ID_new column is gone.
print('-------------PySpark-------------')
# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the table.
df_spark.show(5)
print('-------------Pandas-------------')
print(df_pandas.head(5))

-------------PySpark-------------
+---+---------+---------+---------------+
| ID|Dept_name| location|travel_required|
+---+---------+---------+---------------+
|  1|       HR|     Pune|            yes|
|  2|  Finance|Bangalore|             no|
|  3|  Finance|Bangalore|             no|
|  4|  Finance|     Pune|             no|
|  5|     Tech|   Mumbai|             no|
+---+---------+---------+---------------+
only showing top 5 rows

None
-------------Pandas-------------
   ID Dept_name   location travel_required
0   1        HR       Pune             yes
1   2   Finance  Bangalore              no
2   3   Finance  Bangalore              no
3   4   Finance       Pune              no
4   5      Tech     Mumbai              no


### Renaming a column

In [166]:
# Rename the 'Dept_name' column to 'Department_name' in both frames.
df_spark = df_spark.withColumnRenamed('Dept_name', 'Department_name')
# Rebind the renamed frame instead of using inplace=True, so the pandas side follows
# the same "assign the result" pattern as the Spark side and as the drop cell.
df_pandas = df_pandas.rename(columns={'Dept_name': 'Department_name'})

In [167]:
# Preview both frames to confirm the column now reads 'Department_name'.
print('-------------PySpark-------------')
# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the table.
df_spark.show(5)
print('-------------Pandas-------------')
print(df_pandas.head(5))

-------------PySpark-------------
+---+---------------+---------+---------------+
| ID|Department_name| location|travel_required|
+---+---------------+---------+---------------+
|  1|             HR|     Pune|            yes|
|  2|        Finance|Bangalore|             no|
|  3|        Finance|Bangalore|             no|
|  4|        Finance|     Pune|             no|
|  5|           Tech|   Mumbai|             no|
+---+---------------+---------+---------------+
only showing top 5 rows

None
-------------Pandas-------------
   ID Department_name   location travel_required
0   1              HR       Pune             yes
1   2         Finance  Bangalore              no
2   3         Finance  Bangalore              no
3   4         Finance       Pune              no
4   5            Tech     Mumbai              no
