In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window
from pyspark.sql.types import *

# Build (or reuse) the notebook's Hive-enabled Spark session on YARN.
# NOTE(review): spark.shuffle.useOldFetchProtocol is presumably needed for
# compatibility with this cluster's shuffle service -- confirm.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("assignment")
    .enableHiveSupport()
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare expression so the session summary is rendered as the cell output.
spark

In [2]:
# Peek at the first lines of one raw users JSON file to see the record layout
# (one JSON object per line, with a phone-number array and a nested address).
!hdfs dfs -cat /public/sms/users/users_10.json | head

{"user_id":900001,"user_first_name":"Vassily","user_last_name":"MacGovern","user_email":"vmacgovern0@newyorker.com","user_gender":"Male","user_phone_numbers":["9419157301","8624563940","2105781515"],"user_address":{"street":"100 Towne Drive","city":"Raleigh","state":"North Carolina","postal_code":"27610"}}
{"user_id":900002,"user_first_name":"Helga","user_last_name":"Scottrell","user_email":"hscottrell1@mac.com","user_gender":"Female","user_phone_numbers":["8648414243","3124971105"],"user_address":{"street":"7261 Rusk Avenue","city":"Denver","state":"Colorado","postal_code":"80217"}}
{"user_id":900003,"user_first_name":"Dacie","user_last_name":"Robertz","user_email":"drobertz2@yahoo.co.jp","user_gender":"Female","user_phone_numbers":["5034695507"],"user_address":{"street":"7057 Kinsman Avenue","city":"West Palm Beach","state":"Florida","postal_code":"33405"}}
{"user_id":900004,"user_first_name":"Tannie","user_last_name":"Bewshaw","user_email":"tbewshaw3@com.com","user_gender":"Male","u

In [2]:
# Explicit schema for the users JSON records.
# NOTE(review): Spark file readers generally relax user-declared
# nullable=False to nullable -- verify with df.printSchema() before relying on it.

# Nested struct for the `user_address` object: four required string fields.
_address_type = StructType([
    StructField(field_name, StringType(), nullable=False)
    for field_name in ("street", "city", "state", "postal_code")
])

users_schema = StructType(
    [StructField("user_id", IntegerType(), nullable=False)]
    + [
        StructField(field_name, StringType(), nullable=False)
        for field_name in (
            "user_first_name",
            "user_last_name",
            "user_email",
            "user_gender",
        )
    ]
    + [
        # The phone list is the only field declared optional.
        StructField("user_phone_numbers", ArrayType(StringType()), nullable=True),
        StructField("user_address", _address_type, nullable=False),
    ]
)


In [3]:
# Load every users JSON file under the directory with the explicit schema,
# so Spark does not need to infer column types from the data.
df = spark.read.schema(users_schema).json("/public/sms/users/")

In [4]:
# Preview the loaded users; show() prints a truncated table of the first rows.
df.show()

+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+
|user_id|user_first_name|user_last_name|          user_email|user_gender|  user_phone_numbers|        user_address|
+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+
| 200001|         Eirena|     Cutsforth|ecutsforth0@wisc.edu|     Female|[4197404036, 9173...|{8 Warrior Drive,...|
| 200002|          Marja|      Shopcott|mshopcott1@hexun.com|     Female|[9542037028, 2128...|{66 Prairieview T...|
| 200003|           Dawn|       Tointon|  dtointon2@ucsd.edu|     Female|[9523035647, 2134...|{18 Ronald Regan ...|
| 200004|          Goldi|        Leaman|     gleaman3@360.cn|     Female|[2027069459, 7042...|{7696 Calypso Jun...|
| 200005|       Brewster|      Hallagan|bhallagan4@livejo...|       Male|[8134746319, 2152...|{942 Emmet Park, ...|
| 200006|       Florence|       Glashby|fglashby5@deviant...|     Female

In [23]:
# How many partitions the users DataFrame is split into.
df.rdd.getNumPartitions()

3

In [24]:
# Total number of user records loaded.
df.count()

1000000

In [5]:
# Flatten the nested address struct into top-level columns and derive the
# number of phone numbers per user. All original columns are kept, in order,
# followed by street, city, state, postal_code and num_phn_numbers.
#
# size() returns -1 (not NULL or 0) for a NULL array under Spark's default
# settings, so a missing phone list is mapped to 0 explicitly; otherwise users
# without phone numbers would carry a negative count.
phone_numbers = col("user_phone_numbers")

df2 = df.select(
    "*",
    col("user_address.street").alias("street"),
    col("user_address.city").alias("city"),
    col("user_address.state").alias("state"),
    col("user_address.postal_code").alias("postal_code"),
    when(phone_numbers.isNull(), lit(0))
    .otherwise(size(phone_numbers))
    .alias("num_phn_numbers"),
)

df2.show()

+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+--------------------+-----------------+----------+-----------+---------------+
|user_id|user_first_name|user_last_name|          user_email|user_gender|  user_phone_numbers|        user_address|              street|             city|     state|postal_code|num_phn_numbers|
+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+--------------------+-----------------+----------+-----------+---------------+
| 200001|         Eirena|     Cutsforth|ecutsforth0@wisc.edu|     Female|[4197404036, 9173...|{8 Warrior Drive,...|     8 Warrior Drive|           Dallas|     Texas|      75358|              4|
| 200002|          Marja|      Shopcott|mshopcott1@hexun.com|     Female|[9542037028, 2128...|{66 Prairieview T...|66 Prairieview Te...|           Joliet|  Illinois|      60435|              5|
| 200003|           Dawn|     

In [6]:
# Register the flattened users DataFrame as the temp view `users` so it can be
# queried with spark.sql in the following cells.
df2.createOrReplaceTempView("users")

In [34]:
# SQL version: number of users whose state is New York.
spark.sql("select count(*) from users where state = 'New York'").show()

+--------+
|count(1)|
+--------+
|   49576|
+--------+



In [35]:
# DataFrame-API version of the New York user count (SQL-expression filter).
df2.filter("state='New York'").count()

49576

In [36]:
# Per-state count of rows with a non-null postal_code, largest first.
# NOTE(review): count(postal_code) counts rows, not distinct postal codes --
# if the question is "which state has the most postal codes", this probably
# needs count(distinct postal_code); confirm intent.
spark.sql("""select state,count(postal_code) as cnt from users group by state order by cnt desc""").show()

+--------------------+-----+
|               state|  cnt|
+--------------------+-----+
|          California|97836|
|               Texas|97236|
|             Florida|73380|
|            New York|49576|
|                Ohio|32561|
|            Virginia|31456|
|        Pennsylvania|28507|
|District of Columbia|28504|
|             Georgia|26036|
|            Illinois|22445|
|      North Carolina|21805|
|            Colorado|20253|
|             Indiana|19280|
|             Arizona|18949|
|            Missouri|18854|
|           Minnesota|18621|
|             Alabama|18485|
|           Tennessee|17645|
|          Washington|17567|
|           Louisiana|17337|
+--------------------+-----+
only showing top 20 rows



In [39]:
# City with the most users: rank non-null cities by user count (descending)
# and print only the top row.
spark.sql("""select city,count(user_id) as cnt from users where city is not null group by city order by cnt desc""").show(1)

+----------+-----+
|      city|  cnt|
+----------+-----+
|Washington|28504|
+----------+-----+
only showing top 1 row



In [40]:
# Number of users whose email ends with 'bizjournals.com' (SQL-expression filter).
df2.where("user_email like '%bizjournals.com'").count()

2015

In [43]:
# Same 'bizjournals.com' email count, written with the Column API.
df2.filter(col("user_email").like("%bizjournals.com")).count()

2015

In [45]:
# Number of users that have exactly four phone numbers.
has_four_numbers = col("num_phn_numbers") == 4
df2.where(has_four_numbers).count()

179041

In [51]:
# Preview the first five rows of the flattened users DataFrame.
df2.show(5)

+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+--------------------+---------------+----------+-----------+---------------+
|user_id|user_first_name|user_last_name|          user_email|user_gender|  user_phone_numbers|        user_address|              street|           city|     state|postal_code|num_phn_numbers|
+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+--------------------+---------------+----------+-----------+---------------+
| 200001|         Eirena|     Cutsforth|ecutsforth0@wisc.edu|     Female|[4197404036, 9173...|{8 Warrior Drive,...|     8 Warrior Drive|         Dallas|     Texas|      75358|              4|
| 200002|          Marja|      Shopcott|mshopcott1@hexun.com|     Female|[9542037028, 2128...|{66 Prairieview T...|66 Prairieview Te...|         Joliet|  Illinois|      60435|              5|
| 200003|           Dawn|       Tointon|

In [52]:
# Partition count of the flattened DataFrame.
df2.rdd.getNumPartitions()

3

In [53]:
# Persist the flattened users DataFrame to the relative path `users_data`,
# replacing any previous output. The format is pinned to Parquet explicitly
# instead of relying on the session's spark.sql.sources.default setting.
df2.write.format("parquet").mode("overwrite").save("users_data")

In [54]:
# List the files Spark wrote to `users_data`.
!hdfs dfs -ls users_data/

Found 4 items
-rw-r--r--   3 itv009490 supergroup          0 2023-12-07 06:38 users_data/_SUCCESS
-rw-r--r--   3 itv009490 supergroup   32947815 2023-12-07 06:38 users_data/part-00000-674ac476-c010-4393-aa4b-00553afd9165-c000.snappy.parquet
-rw-r--r--   3 itv009490 supergroup   32945627 2023-12-07 06:38 users_data/part-00001-674ac476-c010-4393-aa4b-00553afd9165-c000.snappy.parquet
-rw-r--r--   3 itv009490 supergroup   16701974 2023-12-07 06:38 users_data/part-00002-674ac476-c010-4393-aa4b-00553afd9165-c000.snappy.parquet


In [55]:
# Preview the `users` temp view through SQL.
spark.sql("""select * from users""").show()

+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+--------------------+-----------------+----------+-----------+---------------+
|user_id|user_first_name|user_last_name|          user_email|user_gender|  user_phone_numbers|        user_address|              street|             city|     state|postal_code|num_phn_numbers|
+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+--------------------+-----------------+----------+-----------+---------------+
| 200001|         Eirena|     Cutsforth|ecutsforth0@wisc.edu|     Female|[4197404036, 9173...|{8 Warrior Drive,...|     8 Warrior Drive|           Dallas|     Texas|      75358|              4|
| 200002|          Marja|      Shopcott|mshopcott1@hexun.com|     Female|[9542037028, 2128...|{66 Prairieview T...|66 Prairieview Te...|           Joliet|  Illinois|      60435|              5|
| 200003|           Dawn|     

In [7]:
# Per-state user counts split by gender: one row per state with `Males` and
# `Females` columns, ordered by state name.
#
# Only rows with a non-null state and a non-null phone-number list are counted.
# A single conditional aggregation replaces the earlier nested query that
# grouped by (state, gender) and then summed per state; count(case ...) only
# counts matching rows, so a state with no rows for a gender shows 0 rather
# than NULL.
spark.sql("""
select state,
       count(case when user_gender = 'Male' then 1 end) as Males,
       count(case when user_gender = 'Female' then 1 end) as Females
from users
where state is not null and user_phone_numbers is not null
group by state
order by state
""").show()

+--------------------+-----+-------+
|               state|Males|Females|
+--------------------+-----+-------+
|             Alabama| 9307|   9178|
|              Alaska| 1882|   1938|
|             Arizona| 9406|   9543|
|            Arkansas| 2420|   2416|
|          California|49120|  48716|
|            Colorado|10128|  10125|
|         Connecticut| 5797|   5917|
|            Delaware| 1651|   1654|
|District of Columbia|14212|  14292|
|             Florida|36692|  36688|
|             Georgia|13008|  13028|
|              Hawaii| 2172|   2062|
|               Idaho| 2058|   2101|
|            Illinois|11178|  11267|
|             Indiana| 9604|   9676|
|                Iowa| 4706|   4726|
|              Kansas| 5962|   5776|
|            Kentucky| 6216|   6108|
|           Louisiana| 8706|   8631|
|               Maine|  225|    228|
+--------------------+-----+-------+
only showing top 20 rows



In [35]:
# List the pivot result directory with human-readable sizes.
# NOTE(review): the cell that writes this directory is not visible in this
# notebook -- confirm where the output comes from.
!hdfs dfs -ls -h /user/itv009490/pivot_assignment_result/

Found 52 items
-rw-r--r--   3 itv009490 supergroup          0 2023-12-07 08:57 /user/itv009490/pivot_assignment_result/_SUCCESS
-rw-r--r--   3 itv009490 supergroup        958 2023-12-07 08:57 /user/itv009490/pivot_assignment_result/part-00000-7920b1c3-c773-496b-ab24-ae723af4d155-c000.snappy.parquet
-rw-r--r--   3 itv009490 supergroup        949 2023-12-07 08:57 /user/itv009490/pivot_assignment_result/part-00001-7920b1c3-c773-496b-ab24-ae723af4d155-c000.snappy.parquet
-rw-r--r--   3 itv009490 supergroup        958 2023-12-07 08:57 /user/itv009490/pivot_assignment_result/part-00002-7920b1c3-c773-496b-ab24-ae723af4d155-c000.snappy.parquet
-rw-r--r--   3 itv009490 supergroup        967 2023-12-07 08:57 /user/itv009490/pivot_assignment_result/part-00003-7920b1c3-c773-496b-ab24-ae723af4d155-c000.snappy.parquet
-rw-r--r--   3 itv009490 supergroup        985 2023-12-07 08:57 /user/itv009490/pivot_assignment_result/part-00004-7920b1c3-c773-496b-ab24-ae723af4d155-c000.snappy.parquet
-rw-r--r--  

In [21]:
# Show the last entries of the airlines dataset directory with
# human-readable file sizes.
!hdfs dfs -ls -h /public/airlines_all/airlines | tail

-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 11:16 /public/airlines_all/airlines/part-01910
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 11:15 /public/airlines_all/airlines/part-01911
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:03 /public/airlines_all/airlines/part-01912
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:02 /public/airlines_all/airlines/part-01913
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:51 /public/airlines_all/airlines/part-01914
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 07:46 /public/airlines_all/airlines/part-01915
-rw-r--r--   2 hdfs supergroup       64 M 2021-01-28 09:33 /public/airlines_all/airlines/part-01916
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 10:48 /public/airlines_all/airlines/part-01917
-rw-r--r--   2 hdfs supergroup     64.0 M 2021-01-28 08:18 /public/airlines_all/airlines/part-01918
-rw-r--r--   2 hdfs supergroup     47.4 M 2021-01-28 08:13 /public/airlines_all/airlines/part-01919


In [18]:
# Read the airlines CSV files with default reader options (no header, no
# schema), so columns get the generated names _c0, _c1, ...
# NOTE(review): this rebinds df2, which earlier held the flattened users data.
df2 = spark.read.csv("/public/airlines_all/airlines/")

In [19]:
# Preview the first rows of the airlines data.
df2.show()

+----+---+---+---+----+----+----+----+---+----+------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
| _c0|_c1|_c2|_c3| _c4| _c5| _c6| _c7|_c8| _c9|  _c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|
+----+---+---+---+----+----+----+----+---+----+------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|2001|  8|  2|  4|  NA|1047|  NA|1222| AA|1056|�NKNO�|  NA|  95|  NA|  NA|  NA| MCI| ORD| 403|   0|   0|   1|  NA|   0|  NA|  NA|  NA|  NA|  NA| YES| YES|
|2001|  8|  3|  5|1048|1047|1210|1222| AA|1056|N274A1|  82|  95|  66| -12|   1| MCI| ORD| 403|   6|  10|   0|  NA|   0|  NA|  NA|  NA|  NA|  NA|  NO| YES|
|2001|  8|  4|  6|1043|1047|1159|1222| AA|1056|N513A1|  76|  95|  61| -23|  -4| MCI| ORD| 403|   4|  11|   0|  NA|   0|  NA|  NA|  NA|  NA|  NA|  NO|  NO|
|2001|  8|  5|  7|1043|1047|1203|1222| AA|1056|N532A1|  80|  95|  65| 

In [20]:
# Partition count for the airlines data with the session's current settings.
df2.rdd.getNumPartitions()

1919

In [22]:
# Stop the current session; the next cell builds a new one with an extra
# spark.sql.files.maxPartitionBytes setting.
spark.stop()

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window
from pyspark.sql.types import *

# Rebuild the session with a larger maximum input-partition size.
# 146800640 bytes = 140 MiB.
# NOTE(review): presumably chosen so that two of the 64 MB airline part files
# are packed into one partition -- confirm.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("assignment")
    .config("spark.sql.files.maxPartitionBytes", "146800640")
    .enableHiveSupport()
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare expression so the session summary is rendered as the cell output.
spark

In [24]:
# Re-read the airlines CSV files in the rebuilt session (default reader
# options, generated column names) to observe the new partitioning.
df2 = spark.read.csv("/public/airlines_all/airlines/")

In [25]:
# Partition count for the airlines data after the session was rebuilt with
# spark.sql.files.maxPartitionBytes set.
df2.rdd.getNumPartitions()

960

In [29]:
# Remove any previous pivot result directory. -f keeps the command from
# failing when the directory does not exist (e.g. on a fresh run).
!hadoop fs -rm -f -R /user/itv009490/pivot_assignment_result

2023-12-07 08:52:59,786 INFO fs.TrashPolicyDefault: Moved: 'hdfs://m01.itversity.com:9000/user/itv009490/pivot_assignment_result' to trash at: hdfs://m01.itversity.com:9000/user/itv009490/.Trash/Current/user/itv009490/pivot_assignment_result


In [30]:
# Stop the Spark session.
spark.stop()

In [31]:
# Create the pivot result directory. -p makes the command succeed when the
# directory already exists, so the cell can be re-run safely.
!hadoop fs -mkdir -p /user/itv009490/pivot_assignment_result