In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Assigment4').getOrCreate()

In [2]:
# Load the new hotels file
base_df = spark.read.csv('Hotels_data_Changed.csv',inferSchema=True,header=True)

In [3]:
base_df.show(2)

+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+--------------------+-----------+-------+------------+-------------+-------+
|Snapshot ID|      Snapshot Date|       Checkin Date|Days|Original Price|Discount Price|Discount Code|Available Rooms|          Hotel Name|Hotel Stars|WeekDay|DiscountDiff| DiscountPerc|DayDiff|
+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+--------------------+-----------+-------+------------+-------------+-------+
|          1|2015-07-17 00:00:00|2015-08-12 00:00:00|   5|          1178|          1040|            1|              6|Best Western Plus...|          3|    Wed|         138| 11.714770798|     26|
|          1|2015-07-17 00:00:00|2015-08-19 00:00:00|   5|          1113|           982|            1|              8|Best Western Plus...|          3|    Wed|         131|11.7699910153|     33|
+-----------+------------

In [4]:
from pyspark.sql.functions import col

# Get 150 Hotels that have the most rows in data
tophotels = base_df.groupBy("Hotel Name").count().sort(col("count").desc()).head(150)
tophotels

[Row(Hotel Name='Newark Liberty International Airport Marriott', count=5346),
 Row(Hotel Name='Hilton Garden Inn Times Square', count=4892),
 Row(Hotel Name='Residence Inn Newark Elizabeth Liberty International Airport', count=4314),
 Row(Hotel Name='Westin New York at Times Square', count=3792),
 Row(Hotel Name='Loews Regency New York Hotel', count=3617),
 Row(Hotel Name='Viceroy New York', count=3565),
 Row(Hotel Name='Four Seasons Hotel New York', count=3243),
 Row(Hotel Name='Langham Place New York Fifth Avenue', count=3203),
 Row(Hotel Name='The Carlyle A Rosewood Hotel', count=3078),
 Row(Hotel Name='DoubleTree by Hilton Metropolitan - New York City', count=2866),
 Row(Hotel Name='Magnuson Convention Center Hotel', count=2862),
 Row(Hotel Name='Hilton Garden Inn New York West 35th Street', count=2822),
 Row(Hotel Name='Hilton Garden Inn New York-Times Square Central', count=2772),
 Row(Hotel Name='Conrad New York', count=2677),
 Row(Hotel Name='Wyndham Garden Brooklyn Sunset Park

In [5]:
# convert the name + count list to dataframe and create a view for it
top_hotel_names_df = spark.sparkContext.parallelize(tophotels).toDF(['hotel_name','COUNT'])
top_hotel_names_df.createOrReplaceTempView("topHotelNames")

# Change a column name to handle in query
base_df.withColumnRenamed('Hotel Name','Hotel_Name').createOrReplaceTempView("base_df")

# Reduced the rows listed to only the ones that are among top 150 names
top_hotels_filtered_base_df = spark.sql("SELECT * FROM base_df WHERE Hotel_Name IN (SELECT hotel_name FROM topHotelNames)")
top_hotels_filtered_base_df.show(2)

+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+-------------+-----------+-------+------------+-------------+-------+
|Snapshot ID|      Snapshot Date|       Checkin Date|Days|Original Price|Discount Price|Discount Code|Available Rooms|   Hotel_Name|Hotel Stars|WeekDay|DiscountDiff| DiscountPerc|DayDiff|
+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+-------------+-----------+-------+------------+-------------+-------+
|        101|2015-08-16 00:00:00|2015-08-17 00:00:00|   5|          2055|          1989|            1|              1|Bentley Hotel|          4|    Mon|          66|3.21167883212|      1|
|        101|2015-08-16 00:00:00|2015-09-06 00:00:00|   5|          1409|          1348|            2|              3|Bentley Hotel|          4|    Sun|          61|4.32931156849|     21|
+-----------+-------------------+-------------------+----+--

In [6]:
# Get 40 Checkin dates that have the most rows in top 150 hotels data
top_checkin_dates = top_hotels_filtered_base_df.groupBy("Checkin Date").count().sort(col("count").desc()).head(40)

# convert the checkin date + count list to dataframe and create a view for it
top_checkin_dates_df = spark.sparkContext.parallelize(top_checkin_dates).toDF()
top_checkin_dates_df.show(40)

+-------------------+-----+
|       Checkin Date|count|
+-------------------+-----+
|2015-11-11 00:00:00| 2302|
|2015-10-14 00:00:00| 1887|
|2015-11-04 00:00:00| 1885|
|2015-08-19 00:00:00| 1883|
|2015-10-28 00:00:00| 1861|
|2015-10-21 00:00:00| 1817|
|2015-11-06 00:00:00| 1808|
|2015-08-12 00:00:00| 1765|
|2015-11-05 00:00:00| 1684|
|2015-10-22 00:00:00| 1662|
|2015-11-12 00:00:00| 1649|
|2015-09-10 00:00:00| 1623|
|2015-10-29 00:00:00| 1623|
|2015-09-09 00:00:00| 1616|
|2015-11-18 00:00:00| 1582|
|2015-08-26 00:00:00| 1559|
|2015-11-10 00:00:00| 1548|
|2015-11-13 00:00:00| 1547|
|2015-10-15 00:00:00| 1473|
|2015-11-21 00:00:00| 1469|
|2015-09-30 00:00:00| 1464|
|2015-10-30 00:00:00| 1412|
|2015-09-16 00:00:00| 1407|
|2015-09-17 00:00:00| 1402|
|2015-11-28 00:00:00| 1383|
|2015-10-01 00:00:00| 1373|
|2015-11-26 00:00:00| 1356|
|2015-09-11 00:00:00| 1332|
|2015-09-18 00:00:00| 1326|
|2015-10-16 00:00:00| 1309|
|2015-11-27 00:00:00| 1306|
|2015-10-02 00:00:00| 1280|
|2015-10-07 00:00:00

In [7]:
# Change a column name to handle in query
top_checkin_dates_df = top_checkin_dates_df.withColumnRenamed('Checkin Date','Checkin_Date')
top_checkin_dates_df.show(3)

+-------------------+-----+
|       Checkin_Date|count|
+-------------------+-----+
|2015-11-11 00:00:00| 2302|
|2015-10-14 00:00:00| 1887|
|2015-11-04 00:00:00| 1885|
+-------------------+-----+
only showing top 3 rows



In [8]:
# Create filtered data for top hotels with top dates

top_checkin_dates_df.createOrReplaceTempView("topCheckinDates")

# Change a column name to handle in query
top_hotels_filtered_base_df.withColumnRenamed('Checkin Date','Checkin_Date').withColumnRenamed('Discount Price','Discount_Price').withColumnRenamed('Discount Code','Discount_Code').createOrReplaceTempView("top_hotels_filtered_base")

# Reduced the rows listed to only the ones that are among top 40 checkin dates
hotel_rows_for_top_dates = spark.sql("SELECT Hotel_Name, Checkin_Date, Discount_Code, Discount_Price FROM top_hotels_filtered_base WHERE Checkin_Date IN (SELECT Checkin_Date FROM topCheckinDates)")

hotel_rows_for_top_dates.show()

+--------------------+-------------------+-------------+--------------+
|          Hotel_Name|       Checkin_Date|Discount_Code|Discount_Price|
+--------------------+-------------------+-------------+--------------+
|Westin New York a...|2015-11-26 00:00:00|            2|          1845|
|Westin New York a...|2015-11-26 00:00:00|            3|          1696|
|Westin New York a...|2015-11-26 00:00:00|            4|          1646|
|Westin New York a...|2015-11-26 00:00:00|            2|          1845|
|Westin New York a...|2015-11-26 00:00:00|            3|          1696|
|Westin New York a...|2015-11-26 00:00:00|            4|          1646|
|Westin New York a...|2015-11-26 00:00:00|            2|          1845|
|Westin New York a...|2015-11-26 00:00:00|            3|          1696|
|Westin New York a...|2015-11-26 00:00:00|            4|          1646|
|Westin New York a...|2015-11-26 00:00:00|            2|          1845|
|Westin New York a...|2015-11-26 00:00:00|            3|        

In [9]:
# Creates dataframe for combination of hotels with checkin dates
only_hotel_names_df = top_hotel_names_df.drop("COUNT")
only_checkin_dates_df = top_checkin_dates_df.drop("COUNT")
joint_df = only_hotel_names_df.crossJoin(only_checkin_dates_df)

In [10]:
joint_df.show(2)

+--------------------+-------------------+
|          hotel_name|       Checkin_Date|
+--------------------+-------------------+
|Newark Liberty In...|2015-11-11 00:00:00|
|Newark Liberty In...|2015-10-14 00:00:00|
+--------------------+-------------------+
only showing top 2 rows



In [11]:
# Creates dataframe for combination of hotels with checkin dates and discount codes
discount_codes_df = spark.range(1,5).withColumnRenamed("id","discount_code")
joint_with_price_codes_df = joint_df.crossJoin(discount_codes_df)

In [12]:
joint_with_price_codes_df.show(5)
joint_with_price_codes_df.count()

+--------------------+-------------------+-------------+
|          hotel_name|       Checkin_Date|discount_code|
+--------------------+-------------------+-------------+
|Newark Liberty In...|2015-11-11 00:00:00|            1|
|Newark Liberty In...|2015-11-11 00:00:00|            2|
|Newark Liberty In...|2015-11-11 00:00:00|            3|
|Newark Liberty In...|2015-11-11 00:00:00|            4|
|Newark Liberty In...|2015-10-14 00:00:00|            1|
+--------------------+-------------------+-------------+
only showing top 5 rows



24000

In [13]:
# Filter (top hotels filtered) base df to leave us with only listings
# of same combinations with minimum price
grouped_df = hotel_rows_for_top_dates.groupBy("Hotel_Name", "Checkin_Date", "Discount_Code").min("Discount_Price")

In [14]:
grouped_renamed_df = grouped_df.withColumnRenamed("Hotel_Name", "hotel_name").withColumnRenamed("Checkin_Date", "checkin_date").withColumnRenamed("Discount_Code", "discount_code").withColumnRenamed("min(Discount_Price)","discount_price")

In [15]:
grouped_renamed_df.show(20)

+--------------------+-------------------+-------------+--------------+
|          hotel_name|       checkin_date|discount_code|discount_price|
+--------------------+-------------------+-------------+--------------+
|Westin New York a...|2015-09-11 00:00:00|            3|          1759|
|Homewood Suites b...|2015-08-12 00:00:00|            1|          1195|
|New York Marriott...|2015-08-12 00:00:00|            2|          1275|
|    Viceroy New York|2015-11-13 00:00:00|            2|          1822|
|Omni Berkshire Place|2015-08-27 00:00:00|            2|          1156|
|     The Plaza Hotel|2015-08-27 00:00:00|            3|          2969|
|Hampton Inn Manha...|2015-11-03 00:00:00|            3|          1255|
|     The Plaza Hotel|2015-11-03 00:00:00|            4|          4088|
| The Kitano New York|2015-09-30 00:00:00|            2|          1893|
|Smyth A Thompson ...|2015-09-30 00:00:00|            3|          2525|
|Hilton Garden Inn...|2015-09-30 00:00:00|            1|        

In [16]:
final_columns_df = top_checkin_dates_df.select("checkin_date").crossJoin(discount_codes_df.select("discount_code"))

In [17]:
final_columns_df.show(2)
final_columns_df.count()

+-------------------+-------------+
|       checkin_date|discount_code|
+-------------------+-------------+
|2015-11-11 00:00:00|            1|
|2015-11-11 00:00:00|            2|
+-------------------+-------------+
only showing top 2 rows



160

In [18]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
w = Window().orderBy("checkin_date", "discount_code")
indexed_final_columns_df = final_columns_df.select(row_number().over(w).alias("ID"), col("*"))

indexed_final_columns_df.show(180)

+---+-------------------+-------------+
| ID|       checkin_date|discount_code|
+---+-------------------+-------------+
|  1|2015-08-12 00:00:00|            1|
|  2|2015-08-12 00:00:00|            2|
|  3|2015-08-12 00:00:00|            3|
|  4|2015-08-12 00:00:00|            4|
|  5|2015-08-13 00:00:00|            1|
|  6|2015-08-13 00:00:00|            2|
|  7|2015-08-13 00:00:00|            3|
|  8|2015-08-13 00:00:00|            4|
|  9|2015-08-19 00:00:00|            1|
| 10|2015-08-19 00:00:00|            2|
| 11|2015-08-19 00:00:00|            3|
| 12|2015-08-19 00:00:00|            4|
| 13|2015-08-26 00:00:00|            1|
| 14|2015-08-26 00:00:00|            2|
| 15|2015-08-26 00:00:00|            3|
| 16|2015-08-26 00:00:00|            4|
| 17|2015-08-27 00:00:00|            1|
| 18|2015-08-27 00:00:00|            2|
| 19|2015-08-27 00:00:00|            3|
| 20|2015-08-27 00:00:00|            4|
| 21|2015-08-28 00:00:00|            1|
| 22|2015-08-28 00:00:00|            2|


In [19]:
w = Window().orderBy("hotel_name")
indexed_hotels_df = only_hotel_names_df.select(row_number().over(w).alias("ID"), col("*"))

indexed_hotels_df.show(170)

+---+--------------------+
| ID|          hotel_name|
+---+--------------------+
|  1|        Aloft Harlem|
|  2|Andaz 5th Avenue ...|
|  3|Andaz Wall Street...|
|  4|Baccarat Hotel an...|
|  5|       Bentley Hotel|
|  6|Best Western Bays...|
|  7|Best Western Bowe...|
|  8|Best Western Plus...|
|  9|Best Western Plus...|
| 10|    Blakely New York|
| 11|Cassa Hotel 45th ...|
| 12|         Chelsea Inn|
| 13|Comfort Inn Times...|
| 14|     Conrad New York|
| 15|Courtyard New Yor...|
| 16|Courtyard Newark ...|
| 17|Courtyard by Marr...|
| 18|Courtyard by Marr...|
| 19|Courtyard by Marr...|
| 20|Courtyard by Marr...|
| 21|Courtyard by Marr...|
| 22|Crowne Plaza Time...|
| 23|Days Inn Bronx Ne...|
| 24|DoubleTree Suites...|
| 25|DoubleTree by Hil...|
| 26|DoubleTree by Hil...|
| 27|DoubleTree by Hil...|
| 28|DoubleTree by Hil...|
| 29|DoubleTree by Hil...|
| 30|Dumont NYC-an Aff...|
| 31|Embassy Suites Ne...|
| 32|Eventi Hotel a Ki...|
| 33|Fairfield Inn by ...|
| 34|Four Seasons Hote...|
|

In [20]:
def initMat(x,y):
        return -1
finalMat = [[initMat(x,y) for x in range(161)] for y in range(150)]

In [21]:
grouped_renamed_df.count()

8227

In [22]:
hotel_dict = {}
for row in indexed_hotels_df.collect():
    hotel_dict[row.hotel_name] = row.ID

In [23]:
combinations_dict = {}
for row in indexed_final_columns_df.collect():
    combinations_dict[row.checkin_date, row.discount_code] = row.ID

In [24]:
for row in grouped_renamed_df.collect():
    column_to_update = combinations_dict[row.checkin_date, row.discount_code] - 1
    row_to_update = hotel_dict[row.hotel_name] - 1
    print(row_to_update,column_to_update)
    finalMat[row_to_update][column_to_update] = row.discount_price

143 34
66 0
89 1
137 133
92 17
126 18
40 102
126 103
117 49
109 50
57 48
142 24
35 24
146 98
138 98
127 77
36 158
116 28
131 65
36 67
116 72
126 81
35 80
55 7
75 45
63 128
101 60
53 69
78 68
35 34
95 122
136 132
113 102
82 102
87 9
134 118
99 26
122 25
2 26
129 144
56 147
87 105
4 86
4 98
126 98
91 88
42 158
44 156
96 157
43 28
33 66
0 74
76 75
93 137
129 137
116 80
123 81
42 153
24 44
56 46
92 129
23 113
102 13
86 38
85 20
36 34
92 2
52 122
106 121
92 135
126 132
85 134
53 17
33 100
13 103
60 50
31 9
10 8
21 8
99 126
54 125
131 118
65 27
129 147
79 145
129 94
91 76
12 79
54 157
13 29
60 30
68 29
26 65
52 67
2 65
2 66
75 142
41 140
95 143
93 80
78 80
76 155
149 5
16 6
72 7
91 45
97 45
27 130
121 131
128 113
63 61
24 69
57 70
83 36
63 34
91 32
117 0
79 133
42 55
126 51
49 50
52 126
62 126
148 119
75 94
8 90
96 90
85 156
99 31
98 73
57 72
53 57
75 57
41 82
60 155
143 131
45 129
145 113
41 60
65 62
10 150
28 14
80 70
85 39
85 32
28 33
25 34
79 111
81 120
126 123
57 134
117 52
95 19
102 18

75 100
36 102
29 48
145 50
6 48
37 8
57 9
128 116
50 117
17 86
96 84
103 94
104 92
27 96
60 96
12 76
33 91
131 88
145 88
107 90
99 157
141 67
55 65
46 66
56 75
46 72
29 56
54 56
107 57
49 83
110 153
84 153
105 153
85 153
137 154
24 47
13 47
80 44
82 44
99 129
149 112
140 112
97 114
93 61
50 15
129 39
79 21
102 22
81 0
55 3
132 0
35 121
113 120
110 135
49 54
140 54
118 18
10 16
83 103
106 103
143 50
141 116
86 118
33 105
44 105
113 106
129 86
80 94
63 79
130 77
51 78
57 77
44 88
143 157
131 66
21 72
19 136
89 136
127 136
55 58
10 40
116 142
44 82
56 83
26 154
51 154
59 5
50 44
82 45
3 128
57 131
95 112
88 151
91 150
86 69
56 71
57 68
124 38
50 39
80 23
50 22
44 33
149 1
68 133
55 135
83 55
54 54
108 18
63 101
129 101
135 50
45 126
140 127
137 126
39 117
13 26
132 104
131 84
148 94
145 94
147 77
126 90
80 88
44 91
108 158
33 31
25 31
56 67
53 41
73 43
8 143
21 80
74 153
13 155
129 154
91 7
93 46
87 45
62 44
2 44
98 129
72 112
5 149
81 68
64 20
66 21
51 22
110 35
119 109
134 120
92 54
87 

101 125
2 124
103 147
52 146
137 144
33 85
42 86
53 92
140 94
113 96
51 159
10 31
57 29
92 65
26 74
13 72
5 73
141 74
123 74
99 137
92 6
52 45
13 46
81 129
101 62
126 70
31 39
62 22
50 0
69 133
122 49
41 49
132 127
143 116
92 116
143 146
17 144
123 106
29 85
67 85
85 86
126 92
145 97
59 77
147 156
2 157
118 74
17 136
76 139
1 143
84 141
118 83
143 155
98 153
66 45
110 129
116 114
27 114
72 60
41 21
56 22
95 34
56 111
41 3
140 2
141 121
62 134
75 17
36 17
83 50
132 8
127 124
81 117
67 116
65 26
55 24
81 105
137 105
81 84
55 78
42 159
25 29
42 65
42 73
78 139
113 57
138 40
8 81
74 152
137 47
125 130
36 130
12 114
55 113
145 71
91 38
24 35
23 109
129 108
149 108
27 2
91 121
85 132
82 133
33 16
142 10
83 8
85 9
94 8
136 125
72 124
26 118
142 118
59 116
84 118
92 27
122 146
141 146
126 105
99 105
141 78
134 74
87 139
88 137
21 138
59 59
57 59
108 42
110 82
132 82
90 131
41 129
41 131
19 115
90 14
126 69
92 39
5 38
118 111
35 1
72 0
79 134
33 54
86 54
129 17
79 17
137 16
10 102
62 25
24 105


106 142
147 153
5 152
83 128
55 62
99 151
71 150
137 151
14 12
59 68
99 38
131 109
95 108
122 122
145 121
95 123
97 132
36 134
141 19
25 17
59 51
91 8
91 10
45 124
40 116
27 26
129 146
63 106
141 92
84 92
145 99
43 159
88 29
88 65
95 66
115 65
83 137
13 137
78 138
136 138
78 56
26 141
104 141
48 143
54 143
54 141
10 140
84 140
72 140
17 82
32 6
43 45
56 114
109 12
79 13
44 70
59 37
114 20
80 35
144 134
45 135
32 54
15 19
132 103
1 49
18 9
54 11
149 8
126 127
35 26
122 27
54 26
44 25
94 27
24 144
75 147
81 85
80 84
44 86
131 95
137 94
65 98
85 97
113 78
89 90
56 91
31 31
41 64
81 72
140 75
42 56
82 58
41 141
68 141
61 80
40 46
57 46
32 130
42 130
72 130
101 61
84 61
135 15
64 12
76 12
80 37
59 22
136 23
27 110
93 3
31 132
13 133
82 134
10 19
78 16
80 49
129 125
13 116
143 105
92 104
23 105
72 104
140 97
107 89
148 158
76 156
55 67
107 137
28 59
75 41
114 41
25 40
62 82
57 82
92 154
2 153
72 152
113 5
52 130
58 112
27 113
75 114
145 114
56 150
132 21
109 34
84 35
24 1
66 3
40 122
24 18
1

56 33
101 1
141 0
26 132
81 134
142 135
66 55
24 127
56 127
33 26
91 24
76 25
88 92
50 95
126 97
55 98
106 98
64 159
45 157
147 66
83 73
79 58
81 142
143 83
26 80
69 154
94 154
136 155
44 113
86 114
55 114
137 113
54 62
123 68
143 108
26 121
134 123
40 135
57 55
92 18
85 103
99 49
109 48
10 11
94 9
33 126
83 85
140 86
63 95
92 31
79 29
63 65
36 66
80 73
91 138
92 58
82 59
33 43
129 40
104 140
9 140
90 82
33 153
65 153
13 7
126 114
80 112
41 114
85 112
59 62
138 61
91 151
84 12
13 22
10 21
138 33
130 108
19 0
28 1
58 135
78 52
137 54
28 53
137 17
28 16
26 101
108 102
63 48
129 10
137 116
88 26
51 145
123 90
62 64
25 65
88 57
40 41
129 43
93 141
48 140
96 143
26 82
42 82
52 153
29 45
59 130
103 114
60 114
78 63
92 14
87 14
51 15
65 21
33 108
44 120
23 134
55 55
124 100
132 100
56 51
135 26
18 26
4 145
116 145
129 87
76 86
137 92
118 76
133 89
55 91
45 158
72 158
143 29
145 65
65 66
9 65
42 64
39 66
40 137
46 137
137 58
63 141
93 6
141 129
116 113
136 112
26 63
143 150
10 12
85 12
105 71


In [25]:
for line in indexed_hotels_df.collect():
    finalMat[line.ID - 1][160] = line.hotel_name

In [26]:
mat_df = spark.createDataFrame(finalMat)

In [27]:
mat_df.show()

+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+---+----+----+----+----+----+----+----+----+----+----+----+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+--------------------+
|  _1|  _2|  _3|  _4|  _5|  _6|  _7|  _8|  _9| _10| _11| _12| _13| _14| _15| _16| _17| _18| _19| _20| _21| _22| _23|_24| _25| _26| _27| _28| _29| _30| _31| _32| _33| _34| _35|_36|_

In [28]:
for row in range(0,150):
    curr_row_max_value = 0
    curr_row_min_value = 100000
    for col in range(0,160):
        if ((finalMat[row][col] > -1) & (finalMat[row][col] > curr_row_max_value)):
            curr_row_max_value = finalMat[row][col]
        if ((finalMat[row][col] > -1) & (finalMat[row][col] < curr_row_min_value)):
            curr_row_min_value = finalMat[row][col]
    for col in range(0,160):
        if (finalMat[row][col] > -1):
            if (curr_row_min_value < curr_row_max_value):
                finalMat[row][col] = 100 * (finalMat[row][col] - curr_row_min_value) / (curr_row_max_value - curr_row_min_value)
            else:
                finalMat[row][col] = 0

In [29]:
for row in range(0,150):
    for col in range(0,160):
        print(finalMat[row][col])

-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
9.47176684881603
10.018214936247723
0.0
-1
-1
-1
-1
14.025500910746812
14.025500910746812
14.025500910746812
14.025500910746812
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
24.043715846994534
32.05828779599271
-1
-1
-1
-1
-1
-1
-1
51.7304189435337
61.56648451730419
71.5846994535519
-1
-1
53.916211293260474
-1
71.5846994535519
-1
32.05828779599271
-1
-1
-1
-1
-1
-1
-1
45.90163934426229
-1
34.06193078324226
-1
47.540983606557376
-1
-1
-1
43.169398907103826
-1
-1
42.25865209471767
-1
-1
100.0
-1
-1
-1
-1
-1
-1
-1
82.87795992714025
-1
-1
-1
21.493624772313296
-1
-1
-1
59.92714025500911
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
20.31152647975078
-1
-1
-1
-1
0.0
-1
-1
-1
42.92834890965732
-1
-1
-1
-1
-1
-1
19.87538940809969
-1
-1
-1
-1
-1
-1
-1
99.4392523364486
-1
62.679127725856695
-1
-1
-1
52.08722

-1
-1
-1
-1
-1
-1
-1
-1
-1
62.893081761006286
-1
-1
-1
-1
-1
-1
-1
45.283018867924525
-1
-1
-1
-1
35.84905660377358
24.528301886792452
-1
-1
24.528301886792452
-1
-1
-1
-1
-1
-1
17.61006289308176
-1
-1
-1
-1
-1
-1
-1
100.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
16.352201257861637
27.67295597484277
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
0.0
-1
-1
-1
-1
-1
20.754716981132077
-1
-1
8.80503144654088
20.12578616352201
-1
-1
-1
18.238993710691823
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
0.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
86.48409893992932
81.00706713780919
64.48763250883393
86.48409893992932
64.48763250883393
75.53003533568905
81.00706713780919
64.48763250883393
77.91519434628975
77.82685512367492
73.0565371024735
77.82685512367492
82.15547703180212
82.15547703180212
100.0
82.15547703180212
-1
-1
-1
-1
82.15547703180212
82.15547703180212
82.155

-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
25.095785440613028
29.693486590038315
-1
-1
-1
-1
-1
-1
6.98051948051948
8.279220779220779
-1
-1
31.25
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
84.17207792207792
67.53246753246754
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
25.243506493506494
28.733766233766232
31.16883116883117
-1
-1
-1
-1
74.35064935064935
100.0
-1
-1
-1
90.17857142857143
-1
-1
-1
-1
-1
-1
19.074675324675326
21.59090909090909
23.863636363636363
-1
-1
-1
44.48051948051948
-1
84.41558441558442
86.12012987012987
82.71103896103897
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
28.246753246753247
-1
-1
27.191558441558442
57.87337662337662
17.37012987012987
-1
15.016233766233766
52.67857142857143
-1
-1
7.954545454545454
51.70454545454545
-1
-1
15.340909090909092
55.925324675324674
-1
-1
55.60064935064935
56.81818181818182
-1
-1
61.4448051948052
27.02922077922078
-1
-1
45.37337662337662
-1
-1
-1
32.79220779220779
-1
-1
-1
26.704545454545453
-1
18.668

-1
-1
-1
68.52459016393442
59.01639344262295
-1
39.01639344262295
-1
76.22950819672131
-1
-1
59.83606557377049
-1
-1
-1
82.37704918032787
72.45901639344262
-1
40.49180327868852
-1
-1
54.59016393442623
54.59016393442623
-1
50.16393442622951
56.721311475409834
71.06557377049181
69.59016393442623
57.950819672131146
-1
61.14754098360656
-1
53.9344262295082
-1
72.62295081967213
-1
-1
83.52459016393442
85.0
-1
79.91803278688525
-1
-1
100.0
77.78688524590164
-1
52.131147540983605
52.868852459016395
-1
61.22950819672131
63.442622950819676
61.721311475409834
37.950819672131146
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
44.91803278688525
50.65573770491803
43.278688524590166
-1
-1
57.459016393442624
-1
-1
-1
-1
-1
-1
-1
41.31147540983606
36.885245901639344
43.442622950819676
41.22950819672131
35.0
34.91803278688525
33.85245901639344
30.901639344262296
43.442622950819676
43.442622950819676
48.85245901639344
-1
-1
64.50819672131148
-1
-1
-1
58.68852459016394
39.75409836065574
69.59016393442623
33

-1
-1
20.92391304347826
23.980978260869566
26.42663043478261
28.26086956521739
-1
29.279891304347824
29.279891304347824
17.866847826086957
-1
28.668478260869566
25.067934782608695
-1
36.41304347826087
28.464673913043477
-1
-1
-1
-1
-1
-1
18.953804347826086
19.633152173913043
22.82608695652174
20.78804347826087
-1
-1
21.331521739130434
-1
-1
-1
19.565217391304348
19.565217391304348
-1
23.845108695652176
-1
-1
0.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
27.241847826086957
27.309782608695652
-1
22.35054347826087
-1
28.174603174603174
31.944444444444443
-1
13.78968253968254
15.575396825396826
-1
-1
-1
-1
31.944444444444443
-1
14.087301587301587
12.202380952380953
15.972222222222221
-1
-1
10.317460317460318
8.432539682539682
-1
16.071428571428573
8.432539682539682
-1
-1
82.53968253968254
62.00396825396825
53.07539682539682
-1
-1
-1
-1
-1
96.13095238095238
-1
-1
-1
-1
65.57539682539682
75.39682539682539
97.12301587301587
56.54761904761905
59.32539682539682
81.64682539682539
84.325396825

-1
-1
29.018961253091508
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
7.254740313272877
-1
-1
-1
-1
-1
3.462489694971146
-1
-1
100.0
-1
-1
-1
-1
50.45342126957956
53.751030502885406
-1
52.926628194558944
49.62901896125309
-1
-1
73.94888705688376
-1
65.37510305028854
-1
-1
-1
-1
-1
-1
49.62901896125309
-1
62.57213520197857
63.64385820280297
59.52184666117065
-1
52.926628194558944
51.27782357790602
30.667765869744436
29.018961253091508
-1
40.560593569662
38.911788953009065
52.10222588623248
-1
-1
-1
-1
-1
-1
52.926628194558944
-1
-1
-1
-1
17.394888705688377
-1
-1
-1
-1
66.11706512778235
52.51442704039572
-1
-1
-1
-1
-1
47.568013190436936
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
20.774938169826875
21.434460016488046
-1
-1
18.46661170651278
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1.8136850783182192
-1
-1
-1
-1
-1
-1
-1
0.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
11.494252873563218
24.597701149425287
15.32567049808429
24.36781609195402

-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
29.06574394463668
-1
-1
-1
-1
100.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
3.9792387543252596
-1
-1
27.335640138408305
-1
-1
-1
20.761245674740483
17.647058823529413
28.37370242214533
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
6.57439446366782
22.14532871972318
20.41522491349481
-1
-1
-1
-1
-1
11.418685121107266
0.0
35.46712802768166
49.134948096885815
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
22.580645161290324
41.935483870967744
41.935483870967744
-1
-1
-1
41.935483870967744
22.580645161290324
1.2903225806451613
41.935483870967744
31.29032258064516
-1
22.580645161290324
41.935483870967744
41.935483870967744
-1
-1
-1
41.935483870967744
22.580645161290324
54.83870967741935
-1
51.61290322580645
-1
9.67741935483871
32.25806451612903

-1
0.0
6.132075471698113
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
16.037735849056602
-1
-1
0.0
5.660377358490566
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
100.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
61.796042617960424
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
37.29071537290715
41.856925418569254
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
29.22374429223744
-1
21.004566210045663
-1
14.459665144596652
0.0
12.1765601217656
-1
9.58904109589041
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
19.025875190258752
-1
-1
-1
-1
-1
-1
-1
-1
-1
18.07531380753138
29.790794979079497
-1
-1
-1
-1
-1
20.502092050209207
10.543933054393305
8.870292887029288
-1
45.02092050209205
2.092050209205021
0.0
2.2594142259414225
-1
25.523012552301257
4.351464435146443
15.648535564853557
2.092050209205021
23.430962343096233
1

-1
-1
80.22598870056497
72.88135593220339
88.70056497175142
88.70056497175142
43.50282485875706
48.0225988700565
57.06214689265537
88.13559322033899
29.943502824858758
45.76271186440678
50.282485875706215
-1
-1
29.37853107344633
-1
-1
49.152542372881356
42.3728813559322
52.54237288135593
54.23728813559322
38.983050847457626
70.62146892655367
37.85310734463277
-1
-1
49.152542372881356
42.3728813559322
51.41242937853107
42.3728813559322
35.59322033898305
50.282485875706215
49.717514124293785
29.37853107344633
-1
37.85310734463277
-1
-1
-1
-1
-1
-1
36.72316384180791
51.41242937853107
66.10169491525424
36.21468926553672
44.067796610169495
51.41242937853107
54.23728813559322
31.1864406779661
31.638418079096045
41.24293785310734
-1
-1
-1
-1
-1
28.192090395480225
42.3728813559322
38.983050847457626
45.19774011299435
-1
37.28813559322034
41.24293785310734
-1
-1
31.638418079096045
-1
-1
30.508474576271187
-1
-1
-1
-1
-1
-1
-1
21.016949152542374
22.65536723163842
-1
29.6045197740113
0.0
0.847457

9.090909090909092
8.211143695014663
-1
-1
7.331378299120234
2.932551319648094
0.0
-1
21.407624633431084
27.85923753665689
-1
17.59530791788856
21.994134897360702
28.299120234604107
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
53.96341463414634
-1
-1
-1
53.65853658536585
-1
-1
-1
60.97560975609756
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
59.146341463414636
59.146341463414636
59.146341463414636
59.146341463414636
-1
77.7439024390244
-1
-1
-1
-1
-1
-1
-1
55.386178861788615
-1
-1
-1
-1
-1
-1
-1
71.64634146341463
-1
-1
-1
71.23983739837398
-1
-1
83.73983739837398
74.6951219512195
93.90243902439025
100.0
-1
-1
67.6829268292683
61.99186991869919
-1
-1
57.01219512195122
-1
-1
59.451219512195124
62.703252032520325
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
31.910569105691057
31.910569105691057
31.910569105691057
31.910569105691057
-1
-1
25.101626016260163
-1
29.878048780487806
10.77

29.470802919708028
33.85036496350365
32.20802919708029
-1
-1
-1
100.0
-1
-1
68.43065693430657
59.03284671532847
-1
-1
62.043795620437955
-1
-1
-1
-1
-1
-1
61.222627737226276
69.52554744525547
59.762773722627735
-1
-1
42.42700729927007
-1
-1
-1
-1
70.89416058394161
-1
-1
36.13138686131387
-1
-1
-1
-1
59.12408759124087
-1
-1
49.36131386861314
-1
-1
-1
-1
41.24087591240876
-1
-1
22.71897810218978
49.27007299270073
-1
-1
32.93795620437956
-1
-1
11.313868613138686
-1
19.89051094890511
-1
-1
-1
-1
-1
-1
35.49270072992701
31.478102189781023
-1
20.437956204379564
23.357664233576642
-1
-1
-1
-1
-1
-1
48.996350364963504
31.934306569343065
30.2007299270073
23.175182481751825
-1
-1
-1
-1
0.0
20.072992700729927
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
63.04744525547445
-1
-1
85.21897810218978
64.32481751824818
-1
15.746124031007753
26.69573643410853
29.45736434108527
26.69573643410853
-1
-1
-1
-1
12.645348837209303
15.69767441860465
17.635658914728683
26.69573643410853
9.641472868217054
8.91472868217054

-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
0.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
58.783120706575076
76.44749754661433
64.6712463199215
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
100.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
70.55937193326791
-1
-1
-1
-1
-1
-1
-1
-1
62.610402355250244
-1
-1
47.20314033366045
48.773307163886166
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
24.043179587831208
30.912659470068697
-1
-1
-1
-1
-1
-1
-1
-1
12.26692836113837
4.5142296368989205
-1
-1
-1
-1
17.17369970559372
15.89793915603533
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
14.785553047404063
11.963882618510159
-1
-1
13.37471783295711
13.37471783295711
-1
-1
14.785553047404063
11.963882618510159
-1
-1
14.785553047404063
11.963882618510159
-1
-1
13.37471783295711
13.37471783295711
-1
-1
11.963882618510159
14.

-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
100.0
-1
-1
-1
-1
-1
-1
88.46153846153847
70.1923076923077
-1
-1
88.46153846153847
-1
-1
-1
-1
88.46153846153847
-1
-1
68.26923076923077
86.53846153846153
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
80.76923076923077
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
0.0
-1
88.46153846153847
-1
-1
57.69230769230769
75.96153846153847
-1
25.0
44.23076923076923
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1.7035775127768313
12.49290176036343
3.691084611016468
6.814310051107325
0.0
9.994321408290745
31.175468483816015
35.77512776831346
11.357183418512209
17.035775127768314
2.271436683702442
3.975014196479273
9.085746734809767
54.5144804088586
50.53946621237933
55.08233957978421
-1
-1
27.825099375354913
31.232254400908573
-1
33.50369108461101
29.528676888131745
-1
-1
27.825099375354913
-1
-1
48.26802952867689
51.67518455423055
-1
-1
-1
-1
-1
-1
-1
-1
24.41794434980125
27.825099375354

13.073005093378608
-1
-1
4.4142614601018675
18.845500848896435
-1
1.867572156196944
0.0
4.4142614601018675
-1
-1
30.39049235993209
33.276740237691
-1
-1
-1
2.037351443123939
-1
-1
0.0
4.4142614601018675
-1
-1
0.0
24.617996604414262
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
100.0
72.23796033994334
76.91218130311614
100.0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
97.73371104815864
69.8300283286119
-1
75.77903682719547
65.15580736543909
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
43.05949008498584
47.1671388101983
-1
-1
-1
-1
-1
0.0
7.36543909348442
9.206798866855523
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
38.95184135977337
49.71671388101983
-1
55.80736543909349
57.79036827195468
-1
-1
-1
44.91279069767442
-1
-1
-1
-1
-1
-1
-1
28.63372093023256
-1
-1
-1
0.

In [30]:
for row in range(0,150):
    for col in range(0,160):
        finalMat[row][col] = float(finalMat[row][col])

In [31]:
normalized_mat_df = spark.createDataFrame(finalMat)

In [32]:
normalized_mat_df.show(6)

+------------------+------------------+------------------+----+----+----+----+----+----+----+----+-----------------+----+----+----+----+------------------+----+-----------------+----+----+----------------+------------------+----+----+-----+-----------------+----+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+----+----+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+-----+-----------------+------------------+-----------------+-----------------+------------------+----------------+-----------------+-----------------+-----------------+----------------+------------------+----+-----------------+-----------------+-----------------+----+-----------------+-----------------+-----------------+-----------------+

In [33]:
# Export to file
normalized_mat_df.toPandas().to_csv('task4.csv', index=False, header=False)