In [51]:
sc

In [52]:
spark

In [53]:
people_df = spark.read.json("file:///home/hadoop/Downloads/People.json")

In [54]:
people_df.show(5)

+---------+-----------+----------+------+---+---------+------+
|     city|    country|first_name|gender| id|last_name|salary|
+---------+-----------+----------+------+---+---------+------+
|Mulyosari|  Indonesia|     Valma|Female|  1|     Sans|983107|
|  Niihama|      Japan|     Paolo|  Male|  2|   Kiddie|649173|
|Dū Qal‘ah|Afghanistan|    Miltie|  Male|  3| De Zuani|352898|
|   Iberia|       Peru|    Jarrid|  Male|  4| Dalziell|170398|
| La Ronge|     Canada| Reinaldos|  Male|  5|   Keeffe|440989|
+---------+-----------+----------+------+---+---------+------+
only showing top 5 rows



In [55]:
people_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: long (nullable = true)



#### 1. create a user defined schema for fields of DataFrame

In [56]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType, StructField, LongType

In [57]:
# Explicit schema for People.json. Compared with the inferred schema this
# fixes the column order and reads `id` as a 32-bit integer instead of a long.
# Every field stays nullable.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", LongType(), True)
    .add("city", StringType(), True)
    .add("country", StringType(), True)
)

In [58]:
people_df = spark.read.schema(schema).json("file:///home/hadoop/Downloads/People.json")

In [59]:
people_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)



In [60]:
# creating new columns using existing columns
# people_df.withColumn()

In [61]:
people_df.show(5)

+---+----------+---------+------+------+---------+-----------+
| id|first_name|last_name|gender|salary|     city|    country|
+---+----------+---------+------+------+---------+-----------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898|Dū Qal‘ah|Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398|   Iberia|       Peru|
|  5| Reinaldos|   Keeffe|  Male|440989| La Ronge|     Canada|
+---+----------+---------+------+------+---------+-----------+
only showing top 5 rows



In [62]:
# By default the JSON reader expects one record per line. When the source is a
# JSON array spread over several lines, the multiLine option must be enabled.
bank_json_reader = spark.read.option('multiLine', True)
bank_data = bank_json_reader.json('file:///home/hadoop/Downloads/bank_edited.json')
bank_data.show(5)

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan|marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no|married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no| single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes|married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no|married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown|  no| si

In [63]:
bank_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



#### 2. Typecasting any column

In [64]:
# Column-wise transformations are expressed with withColumn(); here `age` is
# cast from long to integer using IntegerType().
# withColumn() returns a new DataFrame. The result is only displayed, not
# assigned, so `bank_data` itself keeps its original schema.
age_as_int = col('age').cast(IntegerType())
bank_data.withColumn('age', age_as_int)

DataFrame[age: int, balance: bigint, campaign: bigint, contact: string, day: bigint, default: string, duration: bigint, education: string, housing: string, job: string, loan: string, marital: string, month: string, pdays: bigint, poutcome: string, previous: bigint, y: string]

#### 3. Creating a new column from two existing string columns

In [65]:
from pyspark.sql.functions import concat

# Build "<first_name> <last_name>" as a column expression, then attach it as
# `full_name`. lit(' ') supplies the literal space between the two names.
full_name_expr = concat(col('first_name'), lit(' '), col('last_name'))
people_df.withColumn('full_name', full_name_expr).show(5)

+---+----------+---------+------+------+---------+-----------+----------------+
| id|first_name|last_name|gender|salary|     city|    country|       full_name|
+---+----------+---------+------+------+---------+-----------+----------------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|  Indonesia|      Valma Sans|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|      Japan|    Paolo Kiddie|
|  3|    Miltie| De Zuani|  Male|352898|Dū Qal‘ah|Afghanistan| Miltie De Zuani|
|  4|    Jarrid| Dalziell|  Male|170398|   Iberia|       Peru| Jarrid Dalziell|
|  5| Reinaldos|   Keeffe|  Male|440989| La Ronge|     Canada|Reinaldos Keeffe|
+---+----------+---------+------+------+---------+-----------+----------------+
only showing top 5 rows



#### 4. Renaming existing column

In [66]:
# Rename `salary` to `income` and rebind `people_df` to the renamed DataFrame;
# the cells that follow refer to the column by its new name.
people_df = people_df.withColumnRenamed('salary', 'income')
people_df.show(5)

+---+----------+---------+------+------+---------+-----------+
| id|first_name|last_name|gender|income|     city|    country|
+---+----------+---------+------+------+---------+-----------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898|Dū Qal‘ah|Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398|   Iberia|       Peru|
|  5| Reinaldos|   Keeffe|  Male|440989| La Ronge|     Canada|
+---+----------+---------+------+------+---------+-----------+
only showing top 5 rows



#### 5. limit()

In [67]:
people_df.limit(5).show()

+---+----------+---------+------+------+---------+-----------+
| id|first_name|last_name|gender|income|     city|    country|
+---+----------+---------+------+------+---------+-----------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898|Dū Qal‘ah|Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398|   Iberia|       Peru|
|  5| Reinaldos|   Keeffe|  Male|440989| La Ronge|     Canada|
+---+----------+---------+------+------+---------+-----------+



#### 6. orderBy():
- arrange data in ascending or descending order

In [68]:
# Sort by income, lowest first. The sort direction keyword is `ascending`;
# orderBy() does not recognise `asc` and silently ignores it, so the direction
# would only be ascending by default rather than by request.
people_df.orderBy(['income'], ascending=True).show(5)

+---+----------+---------+------+------+------------+---------+
| id|first_name|last_name|gender|income|        city|  country|
+---+----------+---------+------+------+------------+---------+
| 93|      Cory|    Prigg|  Male| 12876|     Gondang|Indonesia|
|590|      Flem| Tumielli|  Male| 13347| Debre Zeyit| Ethiopia|
|192|       Odo|  Conyers|  Male| 15555|  Raffingora| Zimbabwe|
|407|  Barbabas|Ballingal|  Male| 18598|Beringinjaya|Indonesia|
|297|     Daron|   Melato|Female| 19881|      Phayao| Thailand|
+---+----------+---------+------+------+------------+---------+
only showing top 5 rows



In [69]:
# Primary key: country name, A to Z.
# Secondary key: within each country, income from highest to lowest.
by_country = col('country').asc()
by_income_desc = col('income').desc()

people_df.orderBy(by_country, by_income_desc).show()

+---+----------+------------+------+------+------------------+--------------+
| id|first_name|   last_name|gender|income|              city|       country|
+---+----------+------------+------+------+------------------+--------------+
|490|  Cathlene|    Gatfield|Female|981605|           Mīrābād|   Afghanistan|
|448|      Yuri|     Duggary|  Male|414107|     Sang-e Māshah|   Afghanistan|
|  3|    Miltie|    De Zuani|  Male|352898|         Dū Qal‘ah|   Afghanistan|
|155|    Guntar|    Langmuir|  Male|290613|             Khōst|   Afghanistan|
|983|      Tiff|     Dreakin|Female|208548|             Āsmār|   Afghanistan|
|290|     Myles|      Britch|  Male|191508|         Dū Laīnah|   Afghanistan|
|419|   Ezekiel|   Fleetwood|  Male|163113|      Barakī Barak|   Afghanistan|
|701|    Gerrie|      Heigho|  Male|503327|             Föglö| Aland Islands|
|674|    Ludwig|    Bothwell|  Male|825171|         Martanesh|       Albania|
|421|    Hamnet|     Maruska|  Male|129628|           Hoçisht|  

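#### Temporary View
- createOrReplaceTempView() registers the DataFrame under a name so it can be queried with spark.sql(); the view is temporary and is not materialized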

In [82]:
bank_data.createOrReplaceTempView('bankdata')

In [84]:
spark.sql('select * from bankdata').show(5)

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan|marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no|married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no| single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes|married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no|married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown|  no| si

In [86]:
spark.sql('select count(*) from bankdata').show()

+--------+
|count(1)|
+--------+
|   45211|
+--------+



In [110]:
#### Maximum balance for each of the 10 youngest ages
#### (the table has no salary column; `balance` is the amount being compared).
#### GROUP BY yields one row per age, so the secondary `max(balance) desc`
#### sort key never has ties to break.
spark.sql('select age, max(balance) from bankdata group by age order by age asc, max(balance) desc').show(10)

+---+------------+
|age|max(balance)|
+---+------------+
| 18|        1944|
| 19|        5368|
| 20|        8860|
| 21|        8278|
| 22|       10971|
| 23|       19690|
| 24|       23878|
| 25|       16874|
| 26|       24299|
| 27|       24025|
+---+------------+
only showing top 10 rows



In [98]:
#### Individual rows ordered youngest first and, within the same age, highest
#### balance first; only the first 10 rows are shown. No aggregation is done,
#### so the same age can appear on several rows.
spark.sql('select age, balance from bankdata order by age asc, balance desc').show(10)

+---+-------+
|age|balance|
+---+-------+
| 18|   1944|
| 18|    608|
| 18|    608|
| 18|    438|
| 18|    348|
| 18|    156|
| 18|    108|
| 18|    108|
| 18|    108|
| 18|     35|
+---+-------+
only showing top 10 rows



In [99]:
### Show the 5 job types with the lowest balance,
### ranked by average balance per job (lowest average first).
spark.sql('select job, avg(balance) from bankdata group by job order by avg(balance) asc').show(5)

+-----------+------------------+
|        job|      avg(balance)|
+-----------+------------------+
|   services| 997.0881078478575|
|blue-collar|1078.8266543362104|
|     admin.| 1135.838909301876|
| technician|1252.6320916151112|
|    student|1388.0607675906183|
+-----------+------------------+
only showing top 5 rows



In [108]:
### Show the 5 job types with the lowest balance,
### ranked by the minimum balance seen in each job (lowest minimum first).
spark.sql('select job,min(balance) from bankdata group by job order by min(balance) asc').show(5)

+-------------+------------+
|          job|min(balance)|
+-------------+------------+
|  blue-collar|       -8019|
|   management|       -6847|
|self-employed|       -3313|
|   technician|       -2827|
|     services|       -2122|
+-------------+------------+
only showing top 5 rows

