In [1]:
spark.stop()

In [2]:
sc.stop()

In [3]:
from pyspark import SparkConf, SparkContext

config = SparkConf().setMaster('local[4]').setAppName("ETLSession")
sc = SparkContext(conf=config)

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ETL Pipeline').getOrCreate()

In [5]:
spark

In [6]:
sc

In [7]:
hremployeeDF = spark.read.format('jdbc')\
.option('url','jdbc:mysql://localhost:3306/hremployeeDB')\
.option('dbtable', 'HR_Employee')\
.option('user', 'root')\
.option('password', 'hadoop@123')\
.option('driver', 'com.mysql.cj.jdbc.Driver')\
.load()

In [9]:
hremployeeDF.count()

1469

In [10]:
# show physical plan of execution which is known as DAG
hremployeeDF.explain()

== Physical Plan ==
*(1) Scan JDBCRelation(HR_Employee) [numPartitions=1] [EmployeeID#0,Department#1,JobRole#2,Attrition#3,Gender#4,Age#5,MaritalStatus#6,Education#7,EducationField#8,BusinessTravel#9,JobInvolvement#10,JobLevel#11,JobSatisfaction#12,Hourlyrate#13,Income#14,Salaryhike#15,OverTime#16,Workex#17,YearsSinceLastPromotion#18,EmpSatisfaction#19,TrainingTimesLastYear#20,WorkLifeBalance#21,Performance_Rating#22] PushedFilters: [], ReadSchema: struct<EmployeeID:int,Department:string,JobRole:string,Attrition:string,Gender:string,Age:int,Mar...


#### Create materialized view

In [11]:
hremployeeDF.createOrReplaceTempView('hremployee')

#### 1. Display shape of hremployee table
- show number of rows number of columns

In [33]:
no_cols = len(hremployeeDF.columns)
# number of rows:
spark.sql(f"""
select count(*) as rows, {no_cols} as cols
from hremployee
""").show()

+----+----+
|rows|cols|
+----+----+
|1469|  23|
+----+----+



In [98]:
spark.sql(f"""
 SHOW COLUMNS IN hremployee
""").show()

+--------------------+
|            col_name|
+--------------------+
|          EmployeeID|
|          Department|
|             JobRole|
|           Attrition|
|              Gender|
|                 Age|
|       MaritalStatus|
|           Education|
|      EducationField|
|      BusinessTravel|
|      JobInvolvement|
|            JobLevel|
|     JobSatisfaction|
|          Hourlyrate|
|              Income|
|          Salaryhike|
|            OverTime|
|              Workex|
|YearsSinceLastPro...|
|     EmpSatisfaction|
+--------------------+
only showing top 20 rows



In [41]:
len(hremployeeDF.columns)

23

In [70]:
spark.sql('describe hremployee').show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|          EmployeeID|      int|   null|
|          Department|   string|   null|
|             JobRole|   string|   null|
|           Attrition|   string|   null|
|              Gender|   string|   null|
|                 Age|      int|   null|
|       MaritalStatus|   string|   null|
|           Education|   string|   null|
|      EducationField|   string|   null|
|      BusinessTravel|   string|   null|
|      JobInvolvement|   string|   null|
|            JobLevel|      int|   null|
|     JobSatisfaction|   string|   null|
|          Hourlyrate|      int|   null|
|              Income|      int|   null|
|          Salaryhike|      int|   null|
|            OverTime|   string|   null|
|              Workex|      int|   null|
|YearsSinceLastPro...|      int|   null|
|     EmpSatisfaction|   string|   null|
+--------------------+---------+-------+
only showing top

#### 2. write a query to show first three employee from each job role to join the company.

In [99]:
spark.sql("""
select *
from (
select 
    EmployeeID,
    JobRole,
    Workex,
    rank() over (partition by JobRole order by EmployeeID) as rnk
from hremployee
) as _
where rnk <= 3
""").show()

+----------+--------------------+------+---+
|EmployeeID|             JobRole|Workex|rnk|
+----------+--------------------+------+---+
|         1|     Sales Executive|     8|  1|
|        28|     Sales Executive|    10|  2|
|        40|     Sales Executive|    10|  3|
|         9|Manufacturing Dir...|    10|  1|
|        16|Manufacturing Dir...|    10|  2|
|        21|Manufacturing Dir...|     5|  3|
|         3|Laboratory Techni...|     7|  1|
|         5|Laboratory Techni...|     6|  2|
|         6|Laboratory Techni...|     8|  3|
|        22|Sales Representative|    10|  1|
|        34|Sales Representative|    19|  2|
|        37|Sales Representative|     3|  3|
|        10|Healthcare Repres...|    17|  1|
|        29|Healthcare Repres...|    24|  2|
|        32|Healthcare Repres...|     9|  3|
|         2|  Research Scientist|    10|  1|
|         4|  Research Scientist|     8|  2|
|        13|  Research Scientist|     5|  3|
|        19|             Manager|    31|  1|
|        2

#### 3. write a query to show top three employees from each job role earning highest salary

In [112]:
spark.sql("""
select *
from (
select 
    EmployeeID,
    JobRole,
    Income,
    rank() over (partition by JobRole order by Income desc) as rank
from hremployee
) as _
where rank <= 3
""").show()

+----------+--------------------+------+----+
|EmployeeID|             JobRole|Income|rank|
+----------+--------------------+------+----+
|        99|     Sales Executive| 13872|   1|
|       545|     Sales Executive| 13770|   2|
|       839|     Sales Executive| 13758|   3|
|       722|Manufacturing Dir...| 13973|   1|
|       628|Manufacturing Dir...| 13826|   2|
|       744|Manufacturing Dir...| 13726|   3|
|       678|Laboratory Techni...|  7403|   1|
|       817|Laboratory Techni...|  6782|   2|
|       945|Laboratory Techni...|  6674|   3|
|       565|Sales Representative|  6632|   1|
|      1308|Sales Representative|  5405|   2|
|      1220|Sales Representative|  4502|   3|
|      1181|Healthcare Repres...| 13966|   1|
|       317|Healthcare Repres...| 13964|   2|
|       190|Healthcare Repres...| 13734|   3|
|        68|  Research Scientist|  9724|   1|
|      1315|  Research Scientist|  6962|   2|
|      1305|  Research Scientist|  6854|   3|
|       191|             Manager| 

#### 4. Show top 3 highest package from overall job role

In [111]:
spark.sql("""
select EmployeeID, JobRole, Department, Income
from hremployee
order by Income desc
limit 3
""").show()

+----------+-----------------+--------------------+------+
|EmployeeID|          JobRole|          Department|Income|
+----------+-----------------+--------------------+------+
|       191|          Manager|Research & Develo...| 19999|
|       747|Research Director|Research & Develo...| 19973|
|       852|          Manager|Research & Develo...| 19943|
+----------+-----------------+--------------------+------+



#### 5.  write a SQL query to show employee in ascending order with respect to employee income compared with previous income for each job role

In [137]:
spark.sql("""
select
    *,
    prev_income - Income as income_diff
from (
select 
    EmployeeID,
    JobRole,
    Income,
    lag(Income, 1, 0) over (partition by JobRole order by EmployeeID desc) as prev_income
from hremployee
) as _
order by JobRole, income_diff

""").show()

+----------+--------------------+------+-----------+-----------+
|EmployeeID|             JobRole|Income|prev_income|income_diff|
+----------+--------------------+------+-----------+-----------+
|      1466|Healthcare Repres...|  9991|          0|      -9991|
|       269|Healthcare Repres...| 13496|       4741|      -8755|
|      1181|Healthcare Repres...| 13966|       6842|      -7124|
|      1138|Healthcare Repres...| 11245|       4148|      -7097|
|       190|Healthcare Repres...| 13734|       6673|      -7061|
|       675|Healthcare Repres...| 10552|       4014|      -6538|
|       376|Healthcare Repres...| 10965|       4522|      -6443|
|       814|Healthcare Repres...| 12169|       5731|      -6438|
|      1055|Healthcare Repres...| 10466|       4035|      -6431|
|       737|Healthcare Repres...| 10999|       4777|      -6222|
|       730|Healthcare Repres...| 10388|       4240|      -6148|
|      1093|Healthcare Repres...| 10124|       4069|      -6055|
|        65|Healthcare Re

In [139]:
spark.sql("""
select 
    EmployeeID,
    JobRole,
    Income,
    lag(Income, 1, 0) over (partition by JobRole order by EmployeeID desc)  - Income as income_diff
from hremployee
order by JobRole, income_diff

""").show()

+----------+--------------------+------+-----------+
|EmployeeID|             JobRole|Income|income_diff|
+----------+--------------------+------+-----------+
|      1466|Healthcare Repres...|  9991|      -9991|
|       269|Healthcare Repres...| 13496|      -8755|
|      1181|Healthcare Repres...| 13966|      -7124|
|      1138|Healthcare Repres...| 11245|      -7097|
|       190|Healthcare Repres...| 13734|      -7061|
|       675|Healthcare Repres...| 10552|      -6538|
|       376|Healthcare Repres...| 10965|      -6443|
|       814|Healthcare Repres...| 12169|      -6438|
|      1055|Healthcare Repres...| 10466|      -6431|
|       737|Healthcare Repres...| 10999|      -6222|
|       730|Healthcare Repres...| 10388|      -6148|
|      1093|Healthcare Repres...| 10124|      -6055|
|        65|Healthcare Repres...| 10096|      -5944|
|       472|Healthcare Repres...|  9824|      -5735|
|       915|Healthcare Repres...| 13577|      -5599|
|        94|Healthcare Repres...| 10673|      

In [124]:
spark.sql("""
select
    *
from hremployee
where Income = 2132

""").show()

+----------+--------------------+------------------+---------+------+---+-------------+---------+----------------+--------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|EmployeeID|          Department|           JobRole|Attrition|Gender|Age|MaritalStatus|Education|  EducationField|BusinessTravel|JobInvolvement|JobLevel|JobSatisfaction|Hourlyrate|Income|Salaryhike|OverTime|Workex|YearsSinceLastPromotion|EmpSatisfaction|TrainingTimesLastYear|WorkLifeBalance|Performance_Rating|
+----------+--------------------+------------------+---------+------+---+-------------+---------+----------------+--------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|       721|Research & Develo...|Research Scientist|      Yes|Fe

#### 6. lead()
- Row's next records

In [150]:
spark.sql("""
select EmployeeID, Department, JobRole, age, gender, Income, workex,
lead(Income, 2, 0) over(partition by JobRole order by EmployeeID) as next_income
from hremployee
""").show()

+----------+----------+---------------+---+------+------+------+-----------+
|EmployeeID|Department|        JobRole|age|gender|Income|workex|next_income|
+----------+----------+---------------+---+------+------+------+-----------+
|         1|     Sales|Sales Executive| 41|Female|  5993|     8|       5376|
|        28|     Sales|Sales Executive| 42|  Male|  6825|    10|       8726|
|        40|     Sales|Sales Executive| 33|Female|  5376|    10|       4568|
|        44|     Sales|Sales Executive| 27|  Male|  8726|     9|       5772|
|        47|     Sales|Sales Executive| 34|  Male|  4568|    10|       5454|
|        49|     Sales|Sales Executive| 46|  Male|  5772|    14|       4157|
|        53|     Sales|Sales Executive| 44|Female|  5454|     9|       9069|
|        55|     Sales|Sales Executive| 26|Female|  4157|     5|       7637|
|        57|     Sales|Sales Executive| 35|  Male|  9069|     9|       5473|
|        64|     Sales|Sales Executive| 59|Female|  7637|    28|       4312|

#### 7. NTILE():
- dividing records into a number of quartiles

In [152]:
spark.sql("""
select EmployeeID, Department, JobRole, age, gender, Income, workex,
ntile(4) over(order by Income) as income_tile
from hremployee
""").show()

+----------+--------------------+--------------------+---+------+------+------+-----------+
|EmployeeID|          Department|             JobRole|age|gender|Income|workex|income_tile|
+----------+--------------------+--------------------+---+------+------+------+-----------+
|       514|Research & Develo...|  Research Scientist| 20|  Male|  1009|     1|          1|
|       728|Research & Develo...|  Research Scientist| 18|  Male|  1051|     0|          1|
|       765|               Sales|Sales Representative| 28|  Male|  1052|     1|          1|
|      1338|               Sales|Sales Representative| 30|  Male|  1081|     1|          1|
|      1365|               Sales|Sales Representative| 29|  Male|  1091|     1|          1|
|       178|Research & Develo...|Laboratory Techni...| 19|  Male|  1102|     1|          1|
|       912|               Sales|Sales Representative| 25|  Male|  1118|     1|          1|
|      1402|Research & Develo...|Laboratory Techni...| 31|Female|  1129|     1| 

#### 8. find the number of employees in each percentile group 0-25th 25th-50th, 50th-75th, 75th-100th, using percent rank and case when to create category

In [168]:
spark.sql("""
select
    case when percentrank < 0.25 then '0th-25th'
        when percentrank < 0.50 then '25th-50th'
        when percentrank < 0.75 then '50th-75th'
        else '75th-100th'
    end as category,
    count(*) as count
from (
select
    EmployeeID,
    percent_rank() over(partition by Department order by Income) as percentrank
from hremployee
) as _
group by category
order by category
""").show()

+----------+-----+
|  category|count|
+----------+-----+
|  0th-25th|  367|
| 25th-50th|  366|
| 50th-75th|  367|
|75th-100th|  369|
+----------+-----+



#### Hive intergration with PySpark

In [1]:
# close pyspark and start again and start from here
spark.stop()

In [2]:
# make sure every service is running
!jps

21186 SecondaryNameNode
21346 ResourceManager
21511 NodeManager
20760 NameNode
20955 DataNode
22588 SparkSubmit
22671 Jps


In [3]:
# Spark integration with Hive Warehouse
# config name for Hive Integration "spark.sql.warehouse.dir" value=/usr/hive/warehouse
spark = (SparkSession.builder.appName("pyspak-Hive-Integration")
        .config("spark.sql.warehouse.dir", "/usr/hive/warehouse")
        .enableHiveSupport().getOrCreate())

In [4]:
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



In [5]:
spark

In [6]:
spark.sql("""
create database if not exists airlines
""")

DataFrame[]

In [7]:
spark.sql("""
use airlines
""")

DataFrame[]

In [9]:
spark.sql("""
show databases
""").show()

+------------+
|databaseName|
+------------+
|    airlines|
|     default|
+------------+



In [11]:
spark.sql("""
show tables
""").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [30]:

sql_statement = """
CREATE TABLE IF NOT EXISTS flights (
    DayofMonth INT,
    DayofWeek INT,
    Carrier STRING,
    OriginAirportId INT,
    DestAirportId INT,
    DepDelay INT,
    ArrDelay INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
TBLPROPERTIES ('skip.header.line.count'='1')
"""

spark.sql(sql_statement)

DataFrame[]

In [31]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|airlines| airports|      false|
|airlines|  flights|      false|
+--------+---------+-----------+



In [32]:
spark.sql("""
load data local inpath '/home/hadoop/Downloads/raw_flight_data.csv' overwrite into table flights
""")

DataFrame[]

In [33]:

sql_statement = """
CREATE TABLE IF NOT EXISTS airports(
   airport_id int,
   city varchar(50),
   state varchar(50),
   name varchar(50)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
TBLPROPERTIES ('skip.header.line.count'='1')
"""

spark.sql(sql_statement)

DataFrame[]

In [34]:
spark.sql("""
load data local inpath '/home/hadoop/Downloads/airports.csv' overwrite into table airports
""")

DataFrame[]

In [35]:
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|airlines| airports|      false|
|airlines|  flights|      false|
+--------+---------+-----------+



In [36]:
spark.sql("""
select count(*) from flights
""").show()

+--------+
|count(1)|
+--------+
| 1048575|
+--------+



In [37]:
spark.sql("""
select count(*) from airports
""").show()

+--------+
|count(1)|
+--------+
|     365|
+--------+



In [38]:
spark.sql("""
select * from airports
""").show(5)

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
|     10304|      Aniak|   AK|       Aniak Airport|
|     10754|     Barrow|   AK|Wiley Post/Will R...|
|     10551|     Bethel|   AK|      Bethel Airport|
+----------+-----------+-----+--------------------+
only showing top 5 rows



In [39]:
spark.sql("""
select * from flights
""").show(5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayofWeek|Carrier|OriginAirportId|DestAirportId|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



#### 1. Extract

In [41]:
flights_df = spark.table("airlines.flights")
airports_df = spark.table("airlines.airports")

In [42]:
airports_df.show(5)

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
|     10304|      Aniak|   AK|       Aniak Airport|
|     10754|     Barrow|   AK|Wiley Post/Will R...|
|     10551|     Bethel|   AK|      Bethel Airport|
+----------+-----------+-----+--------------------+
only showing top 5 rows



In [43]:
flights_df.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayofWeek|Carrier|OriginAirportId|DestAirportId|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



#### 2. Transformation

In [46]:
flights_join = flights_df.join(
    airports_df, 
    on=flights_df.OriginAirportId == airports_df.airport_id, 
    how='inner')

In [47]:
flights_join.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|DayofMonth|DayofWeek|Carrier|OriginAirportId|DestAirportId|DepDelay|ArrDelay|airport_id|          city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|       Detroit|   MI|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|Salt Lake City|   UT|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|      Portland|   OR|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|     St. Louis|   MO|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        12892|      -6|     -1

#### 3. Load

In [50]:
flights_join = flights_join.repartition(4)

In [55]:
# write it into local as parquet
flights_join.write.parquet('file:///home/hadoop/Downloads/flights')

In [52]:
# load data from local parquet file into datafile
flights_parquet_df = spark.read.parquet('file:///home/hadoop/Downloads/flights')

In [53]:
flights_parquet_df.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+----------+---------+-----+--------------------+
|DayofMonth|DayofWeek|Carrier|OriginAirportId|DestAirportId|DepDelay|ArrDelay|airport_id|     city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+---------+-----+--------------------+
|        20|        1|     WN|          14908|        13796|      -1|     -10|     14908|Santa Ana|   CA|John Wayne Airpor...|
|         9|        7|     WN|          14831|        13891|      -1|      -3|     14831| San Jose|   CA|Norman Y. Mineta ...|
|         8|        1|     EV|          13931|        12266|      -5|     -24|     13931|  Norfolk|   VA|Norfolk Internati...|
|        31|        5|     UA|          11618|        10721|      36|      25|     11618|   Newark|   NJ|Newark Liberty In...|
|        13|        6|     WN|          13871|        12889|      -3|      -9|     13871|    Omaha|   NE|     E

In [57]:
# write data as parquet into hdfs
flights_join.write.parquet('/flights1')

In [58]:
# write data as parquet into hdfs by partitioning it by carriers
flights_join.write.partitionBy('Carrier').parquet('/flights2')

In [60]:
flights_join.show(1)

+----------+---------+-------+---------------+-------------+--------+--------+----------+---------+-----+--------------------+
|DayofMonth|DayofWeek|Carrier|OriginAirportId|DestAirportId|DepDelay|ArrDelay|airport_id|     city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+---------+-----+--------------------+
|        20|        1|     WN|          14908|        13796|      -1|     -10|     14908|Santa Ana|   CA|John Wayne Airpor...|
+----------+---------+-------+---------------+-------------+--------+--------+----------+---------+-----+--------------------+
only showing top 1 row



In [65]:
flights_join.write.bucketBy(col='state', numBuckets = 50).format('csv').saveAsTable('bucketed_table')

In [66]:
flights_join.write.partitionBy('Carrier').bucketBy(col='state', numBuckets = 50).format('parquet').saveAsTable('part_bucket_table')

In [68]:
spark.sql('select carrier, count(*) from part_bucket_table group by carrier').show()

+-------+--------+
|carrier|count(1)|
+-------+--------+
|     UA|  122443|
|     AA|  124037|
|     EV|   46563|
|     B6|   51381|
|     DL|  134724|
|     OO|   69785|
|     F9|    9811|
|     YV|   14612|
|     US|  100668|
|     MQ|   45926|
|     HA|    4962|
|     AS|   28796|
|     FL|   28053|
|     VX|   14683|
|     WN|  216101|
|     9E|   36030|
+-------+--------+



In [72]:
!hdfs dfs -find / -name bucketed_table

/usr/hive/warehouse/airlines.db/bucketed_table


#### Load on MySQL

In [75]:
connection_properties = {
    'user': 'root',
    'password': 'hadoop@123',
    'driver': 'com.mysql.cj.jdbc.Driver'
}

flights_join.write.jdbc(
    url='jdbc:mysql://localhost:3306/flights', 
    table='airlines', 
    mode='overwrite',
    properties=connection_properties
)

In [1]:
spark.stop()

In [2]:
sc.stop()