In [2]:
from pyspark.sql import SparkSession

spark = SparkSession\
.builder\
.master("yarn")\
.appName("assignment-6")\
.enableHiveSupport()\
.config("spark.sql.warehouse.dir","/user/itv009490/warehouse")\
.getOrCreate()

spark

In [4]:
my_list=[("Spring",12.3),("Summer",10.5),("Autumn",8.2),("Winter",15.1)]

In [5]:
df = spark.createDataFrame(my_list).toDF("season","windspeed")
df.show()

+------+---------+
|season|windspeed|
+------+---------+
|Spring|     12.3|
|Summer|     10.5|
|Autumn|      8.2|
|Winter|     15.1|
+------+---------+



In [6]:
df.printSchema()

root
 |-- season: string (nullable = true)
 |-- windspeed: double (nullable = true)



In [7]:
!hdfs dfs -head /public/trendytech/datasets/library_data.json

{"library_name": "Central Library","location": "City Center","books": [{"book_id": "B001","book_name": "The Great Gatsby","author": "F. Scott Fitzgerald","copies_available": 5},{"book_id": "B002","book_name": "To Kill a Mockingbird","author": "Harper Lee","copies_available": 3}],"members": [{"member_id": "M001","member_name": "John Smith","age": 28,"books_borrowed": ["B001"]},{"member_id": "M002","member_name": "Emma Johnson","age": 35,"books_borrowed": []}]},
{"library_name": "Community Library","location": "Suburb","books": [{"book_id": "B003","book_name": "1984","author": "George Orwell","copies_available": 2},{"book_id": "B004","book_name": "Pride and Prejudice","author": "Jane Austen","copies_available": 4}],"members": [{"member_id": "M003","member_name": "Michael Brown","age": 42,"books_borrowed": ["B003","B004"]},{"member_id": "M004","member_name": "Sophia Davis","age": 31,"books_borrowed": ["B004"]}]}


In [22]:
from pyspark.sql.types import *
struct_schema = StructType([
    StructField("library_name",StringType()),
    StructField("location",StringType()),
    StructField("books",ArrayType(StructType([
        StructField("book_id",StringType()),
        StructField("book_name",StringType()),
        StructField("author",StringType()),
        StructField("copies_available",IntegerType())
    ])
                                 )),
    StructField("members",ArrayType(StructType([
        StructField("member_id",StringType()),
        StructField("member_name",StringType()),
        StructField("age",IntegerType()),
        StructField("books_borrowed",ArrayType(StringType()))
    ])
                                   ))
    
])


In [23]:
lib_df = spark.read.format("json").schema(struct_schema).load("/public/trendytech/datasets/library_data.json")

In [26]:
lib_df.show(truncate=0)

+-----------------+-----------+------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------+
|library_name     |location   |books                                                                                           |members                                                                    |
+-----------------+-----------+------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------+
|Central Library  |City Center|[{B001, The Great Gatsby, F. Scott Fitzgerald, 5}, {B002, To Kill a Mockingbird, Harper Lee, 3}]|[{M001, John Smith, 28, [B001]}, {M002, Emma Johnson, 35, []}]             |
|Community Library|Suburb     |[{B003, 1984, George Orwell, 2}, {B004, Pride and Prejudice, Jane Austen, 4}]                   |[{M003, Michael Brown, 42, [B003, B004]}, {M004, Sop

In [27]:
!hdfs dfs -head /public/trendytech/datasets/train.csv

train_number,train_name,seats_available,passenger_name,age,ticket_number,seat_number
123,Express,100,John,25,T123,A1
123,Express,100,Emma,30,T124,B2
456,Superfast,150,Michael,35,T125,C3
456,Superfast,150,Sophia,40,T126,D4
789,Local,50,William,28,T127,E5
789,Local,50,Sophia,32,T128,F6
789,Local,50,Oliver,45,T129,G7


In [28]:
train_df = spark.read.format("csv").\
option("inferSchema","true").\
option("header","true").\
load("/public/trendytech/datasets/train.csv")

train_df.show()

+------------+----------+---------------+--------------+---+-------------+-----------+
|train_number|train_name|seats_available|passenger_name|age|ticket_number|seat_number|
+------------+----------+---------------+--------------+---+-------------+-----------+
|         123|   Express|            100|          John| 25|         T123|         A1|
|         123|   Express|            100|          Emma| 30|         T124|         B2|
|         456| Superfast|            150|       Michael| 35|         T125|         C3|
|         456| Superfast|            150|        Sophia| 40|         T126|         D4|
|         789|     Local|             50|       William| 28|         T127|         E5|
|         789|     Local|             50|        Sophia| 32|         T128|         F6|
|         789|     Local|             50|        Oliver| 45|         T129|         G7|
+------------+----------+---------------+--------------+---+-------------+-----------+



In [29]:
train_df.printSchema()

root
 |-- train_number: integer (nullable = true)
 |-- train_name: string (nullable = true)
 |-- seats_available: integer (nullable = true)
 |-- passenger_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- ticket_number: string (nullable = true)
 |-- seat_number: string (nullable = true)



In [31]:
df1 = train_df.drop("passenger_name","age")
df1.show()

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         123|   Express|            100|         T123|         A1|
|         123|   Express|            100|         T124|         B2|
|         456| Superfast|            150|         T125|         C3|
|         456| Superfast|            150|         T126|         D4|
|         789|     Local|             50|         T127|         E5|
|         789|     Local|             50|         T128|         F6|
|         789|     Local|             50|         T129|         G7|
+------------+----------+---------------+-------------+-----------+



In [32]:
df1.count()

7

In [33]:
df2 = df1.dropDuplicates(["train_number","ticket_number"])
df2.count()

7

In [34]:
df2.show()

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         789|     Local|             50|         T128|         F6|
|         123|   Express|            100|         T124|         B2|
|         123|   Express|            100|         T123|         A1|
|         456| Superfast|            150|         T126|         D4|
|         456| Superfast|            150|         T125|         C3|
|         789|     Local|             50|         T127|         E5|
|         789|     Local|             50|         T129|         G7|
+------------+----------+---------------+-------------+-----------+



In [40]:
df1.select("train_name").distinct().count()

3

In [41]:
!hdfs dfs -head /public/trendytech/datasets/sales_data.json

{"store_id": 1, "product": "Apple", "quantity": 10, "revenue": 100.0}
{"store_id": 2, "product": "Banana", "quantity": 15, "revenue": 75.0}
{"store_id": 3, "product": "Orange", "quantity": 12, "revenue": 90.0}
{"store_id": 4, "product": "Mango", "quantity": 8, "revenue": 120.0}
{"store_id": 5, "product": "Grape", "quantity": 20, "revenue": 150.0}
{"store_id": 6, "product": "Watermelon", "quantity": 5, "revenue": 50.0}
{"store_id": 7, "product": "Strawberry", "quantity": 18, "revenue": 108.0}
{"store_id": 8, "product": "Pineapple", "quantity": 14, "revenue": 140.0}
{"store_id": 9, "product": "Cherry", "quantity": 7, "revenue": 105.0}
{"store_id": 10, "product": "Pear", "quantity": 9, "revenue": 81.0}
{"store_id": 11, "product": "Blueberry", "quantity": 11, "revenue": 88.0}
{"store_id": 12, "product": "Kiwi", "quantity": 16, "revenue": 128.0}
{"store_id": 13, "product": "Peach", "quantity": 13, "revenue": 91.0}
{"store_id": 14, "product": "Plum", "quantity": 6, "revenue": 54.0}
{"store_i

In [47]:
schema1 = 'store_id integer, product string, quantity integer, revenue double'

In [52]:
df1 = spark.read.format("csv").\
schema(schema1).\
option("mode","permissive").\
load("/public/trendytech/datasets/sales_data.json")



In [55]:
df1.count()

22

In [64]:
df_dropmalformed=spark.read.option("mode","dropmalformed").schema(schema1).json("/public/trendytech/datasets/sales_data.json")

In [65]:
df_dropmalformed.count()

21

In [60]:
df_dropmalformed.show()

+--------+----------+--------+-------+
|store_id|   product|quantity|revenue|
+--------+----------+--------+-------+
|       1|     Apple|      10|  100.0|
|       2|    Banana|      15|   75.0|
|       3|    Orange|      12|   90.0|
|       4|     Mango|       8|  120.0|
|       5|     Grape|      20|  150.0|
|       6|Watermelon|       5|   50.0|
|       7|Strawberry|      18|  108.0|
|       8| Pineapple|      14|  140.0|
|       9|    Cherry|       7|  105.0|
|      10|      Pear|       9|   81.0|
|      11| Blueberry|      11|   88.0|
|      12|      Kiwi|      16|  128.0|
|      13|     Peach|      13|   91.0|
|      14|      Plum|       6|   54.0|
|      15|     Lemon|      10|   70.0|
|      16| Raspberry|      17|  136.0|
|      17|   Coconut|       4|   80.0|
|      18|   Avocado|      11|   99.0|
|      19|Blackberry|       8|   64.0|
+--------+----------+--------+-------+



In [66]:
df_failfast=spark.read.option("mode","failfast").schema(schema1).json("/public/trendytech/datasets/sales_data.json")

In [67]:
!hdfs dfs -head /public/trendytech/datasets/hospital.csv

patient_id,admission_date,discharge_date,diagnosis,doctor_id,total_cost
1,01-01-2022,2022-01-10,Pneumonia,101,5000.00
2,02-05-2022,2022-02-09,Appendicitis,102,7000.00
3,03-12-2022,2022-03-18,Fractured Arm,103,3500.00
4,04-02-2022,2022-04-08,Heart Attack,104,15000.00
5,05-05-2022,2022-05-07,Influenza,105,2500.00
6,06-10-2022,2022-06-15,Appendicitis,106,8000.00
7,07-20-2022,2022-07-25,Pneumonia,107,5500.00
8,08-25-2022,2022-09-01,Heart Attack,108,20000.00
9,09-15-2022,2022-09-22,Fractured Leg,109,6000.00
10,10-05-2022,2022-10-10,Appendicitis,110,7500.00
11,11-02-2022,2022-11-05,Influenza,111,2800.00
12,12-10-2022,2022-12-18,Pneumonia,112,6000.00
13,01-02-2023,2023-01-09,Heart Attack,113,18000.00
14,02-14-2023,2023-02-18,Appendicitis,114,7200.00
15,03-20-2023,2023-03-28,Fractured Arm,115,3800.00
16,04-05-2023,2023-04-11,Influenza,116,2700.00
17,05-08-2023,2023-05-11,Heart Attack,117,16000.00
18,06-15-2023,2023-06-20,Pneumonia,118,4800.00
19,07-22-2023,2023-07-27,Fractured Leg,119,6500.00


In [68]:
schema1="patient_id integer, admission_date date, discharge_date date, diagnosis string,doctor_id integer, total_cost  float"

In [75]:
df = spark.read.format("csv").option("header","true").option("dateFormat","mm-dd-yyyy").schema(schema1).load("/public/trendytech/datasets/hospital.csv")

In [76]:
df.show()

+----------+--------------+--------------+-------------+---------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|
+----------+--------------+--------------+-------------+---------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|    5000.0|
|         2|    2022-01-05|    2022-02-09| Appendicitis|      102|    7000.0|
|         3|    2022-01-12|    2022-03-18|Fractured Arm|      103|    3500.0|
|         4|    2022-01-02|    2022-04-08| Heart Attack|      104|   15000.0|
|         5|    2022-01-05|    2022-05-07|    Influenza|      105|    2500.0|
|         6|    2022-01-10|    2022-06-15| Appendicitis|      106|    8000.0|
|         7|    2022-01-20|    2022-07-25|    Pneumonia|      107|    5500.0|
|         8|    2022-01-25|    2022-09-01| Heart Attack|      108|   20000.0|
|         9|    2022-01-15|    2022-09-22|Fractured Leg|      109|    6000.0|
|        10|    2022-01-05|    2022-10-10| Appendicitis|      11

In [77]:
df1 = df.drop("doctor_id")
df1.show()

+----------+--------------+--------------+-------------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|total_cost|
+----------+--------------+--------------+-------------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|    5000.0|
|         2|    2022-01-05|    2022-02-09| Appendicitis|    7000.0|
|         3|    2022-01-12|    2022-03-18|Fractured Arm|    3500.0|
|         4|    2022-01-02|    2022-04-08| Heart Attack|   15000.0|
|         5|    2022-01-05|    2022-05-07|    Influenza|    2500.0|
|         6|    2022-01-10|    2022-06-15| Appendicitis|    8000.0|
|         7|    2022-01-20|    2022-07-25|    Pneumonia|    5500.0|
|         8|    2022-01-25|    2022-09-01| Heart Attack|   20000.0|
|         9|    2022-01-15|    2022-09-22|Fractured Leg|    6000.0|
|        10|    2022-01-05|    2022-10-10| Appendicitis|    7500.0|
|        11|    2022-01-02|    2022-11-05|    Influenza|    2800.0|
|        12|    2022-01-10|    2022-12-18|    Pn

In [79]:
df2 = df1.withColumnRenamed("total_cost","hospital_bill")
df2.show()

+----------+--------------+--------------+-------------+-------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|
+----------+--------------+--------------+-------------+-------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|       5000.0|
|         2|    2022-01-05|    2022-02-09| Appendicitis|       7000.0|
|         3|    2022-01-12|    2022-03-18|Fractured Arm|       3500.0|
|         4|    2022-01-02|    2022-04-08| Heart Attack|      15000.0|
|         5|    2022-01-05|    2022-05-07|    Influenza|       2500.0|
|         6|    2022-01-10|    2022-06-15| Appendicitis|       8000.0|
|         7|    2022-01-20|    2022-07-25|    Pneumonia|       5500.0|
|         8|    2022-01-25|    2022-09-01| Heart Attack|      20000.0|
|         9|    2022-01-15|    2022-09-22|Fractured Leg|       6000.0|
|        10|    2022-01-05|    2022-10-10| Appendicitis|       7500.0|
|        11|    2022-01-02|    2022-11-05|    Influenza|       2800.0|
|     

In [81]:
from pyspark.sql.functions import *
df3 = df2.withColumn("duration_of_stay",expr("discharge_date -admission_date "))
df3.show()

+----------+--------------+--------------+-------------+-------------+----------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|duration_of_stay|
+----------+--------------+--------------+-------------+-------------+----------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|       5000.0|          9 days|
|         2|    2022-01-05|    2022-02-09| Appendicitis|       7000.0| 1 months 4 days|
|         3|    2022-01-12|    2022-03-18|Fractured Arm|       3500.0| 2 months 6 days|
|         4|    2022-01-02|    2022-04-08| Heart Attack|      15000.0| 3 months 6 days|
|         5|    2022-01-05|    2022-05-07|    Influenza|       2500.0| 4 months 2 days|
|         6|    2022-01-10|    2022-06-15| Appendicitis|       8000.0| 5 months 5 days|
|         7|    2022-01-20|    2022-07-25|    Pneumonia|       5500.0| 6 months 5 days|
|         8|    2022-01-25|    2022-09-01| Heart Attack|      20000.0| 7 months 7 days|
|         9|    2022-01-15|    2

In [82]:
df4 = df3.withColumn("adjusted_total_cost",expr("""case when diagnosis like '%Heart Attack%' then hospital_bill*1.5
when diagnosis like '%Appendicitis%' then hospital_bill*1.2
else hospital_bill end
"""))
df4.show()

+----------+--------------+--------------+-------------+-------------+----------------+-------------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|duration_of_stay|adjusted_total_cost|
+----------+--------------+--------------+-------------+-------------+----------------+-------------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|       5000.0|          9 days|             5000.0|
|         2|    2022-01-05|    2022-02-09| Appendicitis|       7000.0| 1 months 4 days|             8400.0|
|         3|    2022-01-12|    2022-03-18|Fractured Arm|       3500.0| 2 months 6 days|             3500.0|
|         4|    2022-01-02|    2022-04-08| Heart Attack|      15000.0| 3 months 6 days|            22500.0|
|         5|    2022-01-05|    2022-05-07|    Influenza|       2500.0| 4 months 2 days|             2500.0|
|         6|    2022-01-10|    2022-06-15| Appendicitis|       8000.0| 5 months 5 days|             9600.0|
|         7|    2022-01-20| 

In [84]:
df5=df4.select("patient_id","diagnosis","hospital_bill","adjusted_total_cost")
df5.show()

+----------+-------------+-------------+-------------------+
|patient_id|    diagnosis|hospital_bill|adjusted_total_cost|
+----------+-------------+-------------+-------------------+
|         1|    Pneumonia|       5000.0|             5000.0|
|         2| Appendicitis|       7000.0|             8400.0|
|         3|Fractured Arm|       3500.0|             3500.0|
|         4| Heart Attack|      15000.0|            22500.0|
|         5|    Influenza|       2500.0|             2500.0|
|         6| Appendicitis|       8000.0|             9600.0|
|         7|    Pneumonia|       5500.0|             5500.0|
|         8| Heart Attack|      20000.0|            30000.0|
|         9|Fractured Leg|       6000.0|             6000.0|
|        10| Appendicitis|       7500.0|             9000.0|
|        11|    Influenza|       2800.0|             2800.0|
|        12|    Pneumonia|       6000.0|             6000.0|
|        13| Heart Attack|      18000.0|            27000.0|
|        14| Appendiciti