In [1]:
import time
import pyspark.sql.functions as f

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import *
from pyspark.sql.functions import desc



In [2]:
# Build (or reuse) the single SparkSession used by every cell below.
# The master URL is the bare string local[*] -- it must not contain embedded
# quote characters, otherwise Spark cannot parse it when a new session has to
# be started. getOrCreate() hands back an already-active session if one
# exists, so one builder chain is sufficient.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Python Spark Assignment")
    .getOrCreate()
)

In [3]:
# Load the flights CSV with the reader's default options (columns are
# auto-named _c0, _c1, ...), mark the frame for caching and preview two rows.
df_flights = spark.read.csv('/data/lsml/4-5-spark/flights.csv').cache()
df_flights.show(2)

+----+------+--------------------+--------------------+---+---+---------+---+----+----+
| _c0|   _c1|                 _c2|                 _c3|_c4|_c5|      _c6|_c7| _c8| _c9|
+----+------+--------------------+--------------------+---+---+---------+---+----+----+
|1185|PG0134|2017-09-10 09:50:...|2017-09-10 14:55:...|DME|BTK|Scheduled|319|null|null|
|3979|PG0052|2017-08-25 14:50:...|2017-08-25 17:35:...|VKO|HMA|Scheduled|CR2|null|null|
+----+------+--------------------+--------------------+---+---+---------+---+----+----+
only showing top 2 rows



In [4]:
# Load the ticket_flights CSV with the reader's default options (columns are
# auto-named _c0, _c1, ...), mark the frame for caching and preview two rows.
df_tickets = spark.read.csv('/data/lsml/4-5-spark/ticket_flights.csv').cache()
df_tickets.show(2)

+-------------+-----+--------+--------+
|          _c0|  _c1|     _c2|     _c3|
+-------------+-----+--------+--------+
|0005432159776|30625|Business|42100.00|
|0005435212351|30625|Business|42100.00|
+-------------+-----+--------+--------+
only showing top 2 rows



In [8]:
# Total number of rows in df_tickets.
df_tickets.count()

1045726

In [5]:
# Add epoch-second versions of columns _c2 and _c8 plus their difference
# (_c8 - _c2, in seconds).
# NOTE(review): _c2 / _c8 look like scheduled vs. actual departure times --
# confirm against the dataset schema.
#
# No format pattern is passed to unix_timestamp: the argument is already cast
# to a timestamp, and a pattern only matters when the input is a string.
df_flights = (
    df_flights
    .withColumn('_c2_unix', unix_timestamp(f.col('_c2').cast(TimestampType())))
    .withColumn('_c8_unix', unix_timestamp(f.col('_c8').cast(TimestampType())))
    .withColumn('_c8-c2_unix', f.col('_c8_unix') - f.col('_c2_unix'))
)

df_flights.show(5)

+----+------+--------------------+--------------------+---+---+---------+---+----+----+----------+--------+-----------+
| _c0|   _c1|                 _c2|                 _c3|_c4|_c5|      _c6|_c7| _c8| _c9|  _c2_unix|_c8_unix|_c8-c2_unix|
+----+------+--------------------+--------------------+---+---+---------+---+----+----+----------+--------+-----------+
|1185|PG0134|2017-09-10 09:50:...|2017-09-10 14:55:...|DME|BTK|Scheduled|319|null|null|1505026200|    null|       null|
|3979|PG0052|2017-08-25 14:50:...|2017-08-25 17:35:...|VKO|HMA|Scheduled|CR2|null|null|1503661800|    null|       null|
|4739|PG0561|2017-09-05 12:30:...|2017-09-05 14:15:...|VKO|AER|Scheduled|763|null|null|1504603800|    null|       null|
|5502|PG0529|2017-09-12 09:50:...|2017-09-12 11:20:...|SVO|UFA|Scheduled|763|null|null|1505199000|    null|       null|
|6938|PG0461|2017-09-04 12:25:...|2017-09-04 13:20:...|SVO|ULV|Scheduled|SU9|null|null|1504517100|    null|       null|
+----+------+--------------------+------

In [6]:
# Total number of rows in df_flights.
df_flights.count()

33121

In [26]:
# Count ticket rows per value of _c1 and cache the small aggregate.
# The key is renamed to _c1_tmp -- presumably so it does not collide with the
# flights frame's own _c1 column once the two are joined.
df_tickets_grouped = (
    df_tickets
    .groupby('_c1')
    .count()
    .withColumnRenamed('_c1', '_c1_tmp')
    .cache()
)
df_tickets_grouped.show(5)

+-------+-----+
|_c1_tmp|count|
+-------+-----+
|  29573|    7|
|   1090|   22|
|   2294|   48|
|   4821|   10|
|   6731|   50|
+-------+-----+
only showing top 5 rows



In [27]:
# Number of groups, i.e. distinct _c1 values present in df_tickets.
df_tickets_grouped.count()

22226

In [29]:
# Inner-join flights to the per-flight ticket counts (flights._c0 matches the
# renamed ticket key), then report how many flights survived the join.
flight_key_matches = df_flights['_c0'] == df_tickets_grouped['_c1_tmp']
df_flights_join = df_flights.join(df_tickets_grouped, on=flight_key_matches, how='inner')
df_flights_join.count()

22226

In [30]:
# Preview the first five rows of the joined flights / ticket-count frame.
df_flights_join.show(5)

+----+------+--------------------+--------------------+---+---+---------+---+----+----+----------+--------+-----------+-------+-----+
| _c0|   _c1|                 _c2|                 _c3|_c4|_c5|      _c6|_c7| _c8| _c9|  _c2_unix|_c8_unix|_c8-c2_unix|_c1_tmp|count|
+----+------+--------------------+--------------------+---+---+---------+---+----+----+----------+--------+-----------+-------+-----+
|1185|PG0134|2017-09-10 09:50:...|2017-09-10 14:55:...|DME|BTK|Scheduled|319|null|null|1505026200|    null|       null|   1185|    2|
|3979|PG0052|2017-08-25 14:50:...|2017-08-25 17:35:...|VKO|HMA|Scheduled|CR2|null|null|1503661800|    null|       null|   3979|   28|
|4739|PG0561|2017-09-05 12:30:...|2017-09-05 14:15:...|VKO|AER|Scheduled|763|null|null|1504603800|    null|       null|   4739|   41|
|5502|PG0529|2017-09-12 09:50:...|2017-09-12 11:20:...|SVO|UFA|Scheduled|763|null|null|1505199000|    null|       null|   5502|    9|
|6938|PG0461|2017-09-04 12:25:...|2017-09-04 13:20:...|SVO|ULV

In [31]:
# Per flight number (_c1): mean of the positive (_c8 - _c2) differences, in
# seconds, restricted to rows whose status column _c6 equals 'Arrived'.
has_positive_delta = df_flights_join['_c8-c2_unix'] > 0
has_arrived = df_flights_join['_c6'] == 'Arrived'

df_avg_of_departure_time = (
    df_flights_join
    .filter(has_positive_delta & has_arrived)
    .groupby('_c1')
    .avg('_c8-c2_unix')
)
df_avg_of_departure_time.show()

+------+------------------+
|   _c1|  avg(_c8-c2_unix)|
+------+------------------+
|PG0144|193.84615384615384|
|PG0325|             160.0|
|PG0503| 614.4827586206897|
|PG0278|            1894.0|
|PG0088|             280.0|
|PG0254|             894.0|
|PG0383|             216.0|
|PG0700| 589.2857142857143|
|PG0211| 543.8709677419355|
|PG0513|             546.0|
|PG0690|2262.8571428571427|
|PG0412|             906.0|
|PG0539|             210.0|
|PG0230| 589.2857142857143|
|PG0521|            1628.0|
|PG0399|1357.7777777777778|
|PG0108|             160.0|
|PG0059|             784.0|
|PG0380|            2985.0|
|PG0595|            1890.0|
+------+------------------+
only showing top 20 rows



In [32]:
# Keep only the flight id / flight number pair, under descriptive names, to
# serve as a lookup table from flight id to flight number.
df_flights_sample = df_flights_join.select(
    df_flights_join['_c0'].alias('_flight_id'),
    df_flights_join['_c1'].alias('_flight_no'),
)

df_flights_sample.show()

+----------+----------+
|_flight_id|_flight_no|
+----------+----------+
|      1185|    PG0134|
|      3979|    PG0052|
|      4739|    PG0561|
|      5502|    PG0529|
|      6938|    PG0461|
|      9478|    PG0360|
|     11085|    PG0569|
|     11847|    PG0498|
|     12012|    PG0621|
|     23609|    PG0648|
|     23695|    PG0388|
|     24705|    PG0632|
|     27580|    PG0483|
|     29272|    PG0334|
|     29440|    PG0065|
|     32658|    PG0674|
|         1|    PG0405|
|         2|    PG0404|
|         3|    PG0405|
|         5|    PG0405|
+----------+----------+
only showing top 20 rows



In [33]:
# Attach the flight number to every ticket row. A left join keeps all ticket
# rows even if a flight id has no match in the lookup table.
ticket_matches_flight = df_tickets['_c1'] == df_flights_sample['_flight_id']
df_tickets_join = df_tickets.join(
    df_flights_sample,
    on=ticket_matches_flight,
    how='left',
)
df_tickets_join.show()

+-------------+-----+--------+--------+----------+----------+
|          _c0|  _c1|     _c2|     _c3|_flight_id|_flight_no|
+-------------+-----+--------+--------+----------+----------+
|0005432159776|30625|Business|42100.00|     30625|    PG0013|
|0005435212351|30625|Business|42100.00|     30625|    PG0013|
|0005435212386|30625|Business|42100.00|     30625|    PG0013|
|0005435212381|30625|Business|42100.00|     30625|    PG0013|
|0005432211370|30625|Business|42100.00|     30625|    PG0013|
|0005435212357|30625| Comfort|23900.00|     30625|    PG0013|
|0005435212360|30625| Comfort|23900.00|     30625|    PG0013|
|0005435212393|30625| Comfort|23900.00|     30625|    PG0013|
|0005435212374|30625| Comfort|23900.00|     30625|    PG0013|
|0005435212365|30625| Comfort|23900.00|     30625|    PG0013|
|0005435212378|30625| Comfort|23900.00|     30625|    PG0013|
|0005435212362|30625| Comfort|23900.00|     30625|    PG0013|
|0005435212334|30625| Comfort|23900.00|     30625|    PG0013|
|0005435

In [34]:
# Row count after the left join with the flight-number lookup.
df_tickets_join.count()

1045726

In [35]:
# _c3 is read from the CSV as text; add a single-precision float copy
# (_c3_float) so it can be aggregated numerically.
df_tickets_join = df_tickets_join.withColumn('_c3_float', f.col('_c3').cast('float'))
df_tickets_join.show()

+-------------+-----+--------+--------+----------+----------+---------+
|          _c0|  _c1|     _c2|     _c3|_flight_id|_flight_no|_c3_float|
+-------------+-----+--------+--------+----------+----------+---------+
|0005434877632|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434878408|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434878496|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434878162|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434877976|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434878636|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434878414|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434878149|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434877987|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434877842|10096|Business|49700.00|     10096|    PG0186|  49700.0|
|0005434877972|10096|Business|49700.00|     10096|    PG0186|  4

In [36]:
# Lowest _c3_float value among 'Business' rows, per flight number. Both output
# columns carry a _bs suffix so they stay distinguishable after later joins.
min_price_for_business = (
    df_tickets_join
    .filter(df_tickets_join['_c2'] == 'Business')
    .groupby('_flight_no')
    .agg(f.min('_c3_float').alias('min(_c3_float)_bs'))
    .withColumnRenamed('_flight_no', '_flight_no_bs')
)

min_price_for_business.show()

+-------------+-----------------+
|_flight_no_bs|min(_c3_float)_bs|
+-------------+-----------------+
|       PG0144|         203300.0|
|       PG0325|          91700.0|
|       PG0278|          84000.0|
|       PG0503|          67200.0|
|       PG0088|          92200.0|
|       PG0211|          38900.0|
|       PG0383|          10300.0|
|       PG0690|          52300.0|
|       PG0412|          35000.0|
|       PG0539|          88300.0|
|       PG0521|          40300.0|
|       PG0230|          18800.0|
|       PG0108|         201300.0|
|       PG0311|          61300.0|
|       PG0588|           9500.0|
|       PG0115|          18000.0|
|       PG0397|          26900.0|
|       PG0217|          24500.0|
|       PG0571|          26600.0|
|       PG0186|          49700.0|
+-------------+-----------------+
only showing top 20 rows



In [37]:
# Lowest _c3_float value among 'Economy' rows, per flight number. Both output
# columns carry an _ec suffix so they stay distinguishable after later joins.
min_price_for_economy = (
    df_tickets_join
    .filter(df_tickets_join['_c2'] == 'Economy')
    .groupby('_flight_no')
    .agg(f.min('_c3_float').alias('min(_c3_float)_ec'))
    .withColumnRenamed('_flight_no', '_flight_no_ec')
)

min_price_for_economy.show()

+-------------+-----------------+
|_flight_no_ec|min(_c3_float)_ec|
+-------------+-----------------+
|       PG0144|          67800.0|
|       PG0325|          30600.0|
|       PG0254|           3100.0|
|       PG0503|          22400.0|
|       PG0278|          28000.0|
|       PG0088|          30700.0|
|       PG0700|           8100.0|
|       PG0211|          13000.0|
|       PG0383|           3400.0|
|       PG0513|           5800.0|
|       PG0690|          17400.0|
|       PG0539|          29400.0|
|       PG0412|          11700.0|
|       PG0263|           6600.0|
|       PG0399|           5900.0|
|       PG0521|          13400.0|
|       PG0230|           6300.0|
|       PG0108|          67100.0|
|       PG0059|           7100.0|
|       PG0380|          17100.0|
+-------------+-----------------+
only showing top 20 rows



In [38]:
# Number of 'Business' ticket rows per flight number.
business_rows = df_tickets_join.where(df_tickets_join['_c2'] == 'Business')
num_seats_in_business = business_rows.groupby('_flight_no').count()
num_seats_in_business.show()

+----------+-----+
|_flight_no|count|
+----------+-----+
|    PG0144|  222|
|    PG0325|   11|
|    PG0278|  986|
|    PG0503|  291|
|    PG0088|   41|
|    PG0211|  443|
|    PG0383|  181|
|    PG0690|  316|
|    PG0539|  126|
|    PG0412|  996|
|    PG0521|  532|
|    PG0230| 1127|
|    PG0108|   34|
|    PG0311|  532|
|    PG0588|  452|
|    PG0115|  261|
|    PG0397|  437|
|    PG0217|  875|
|    PG0571|  347|
|    PG0186|  862|
+----------+-----+
only showing top 20 rows



In [39]:
# Assemble the per-flight feature table: start from the average time
# difference and left-join each lookup frame on the flight number, so flights
# missing from a lookup keep their row (with nulls in that lookup's columns).
result = df_avg_of_departure_time
for lookup_df, lookup_key in (
    (min_price_for_business, '_flight_no_bs'),
    (min_price_for_economy, '_flight_no_ec'),
    (num_seats_in_business, '_flight_no'),
):
    result = result.join(lookup_df, result['_c1'] == lookup_df[lookup_key], how='left')

result.show()

+------+------------------+-------------+-----------------+-------------+-----------------+----------+-----+
|   _c1|  avg(_c8-c2_unix)|_flight_no_bs|min(_c3_float)_bs|_flight_no_ec|min(_c3_float)_ec|_flight_no|count|
+------+------------------+-------------+-----------------+-------------+-----------------+----------+-----+
|PG0144|193.84615384615384|       PG0144|         203300.0|       PG0144|          67800.0|    PG0144|  222|
|PG0325|             160.0|       PG0325|          91700.0|       PG0325|          30600.0|    PG0325|   11|
|PG0088|             280.0|       PG0088|          92200.0|       PG0088|          30700.0|    PG0088|   41|
|PG0254|             894.0|         null|             null|       PG0254|           3100.0|      null| null|
|PG0278|            1894.0|       PG0278|          84000.0|       PG0278|          28000.0|    PG0278|  986|
|PG0503| 614.4827586206897|       PG0503|          67200.0|       PG0503|          22400.0|    PG0503|  291|
|PG0211| 543.870967

In [40]:
# The join keys brought in by the lookup frames duplicate _c1; remove them.
result = result.drop('_flight_no_bs', '_flight_no_ec', '_flight_no')

result.show()

+------+------------------+-----------------+-----------------+-----+
|   _c1|  avg(_c8-c2_unix)|min(_c3_float)_bs|min(_c3_float)_ec|count|
+------+------------------+-----------------+-----------------+-----+
|PG0144|193.84615384615384|         203300.0|          67800.0|  222|
|PG0325|             160.0|          91700.0|          30600.0|   11|
|PG0088|             280.0|          92200.0|          30700.0|   41|
|PG0254|             894.0|             null|           3100.0| null|
|PG0278|            1894.0|          84000.0|          28000.0|  986|
|PG0503| 614.4827586206897|          67200.0|          22400.0|  291|
|PG0211| 543.8709677419355|          38900.0|          13000.0|  443|
|PG0383|             216.0|          10300.0|           3400.0|  181|
|PG0700| 589.2857142857143|             null|           8100.0| null|
|PG0513|             546.0|             null|           5800.0| null|
|PG0690|2262.8571428571427|          52300.0|          17400.0|  316|
|PG0412|            

In [41]:
# Replace nulls left behind by the left joins with 0 so that the score
# arithmetic below never propagates a null.
result_na = result.fillna(0)
result_na.show()

+------+------------------+-----------------+-----------------+-----+
|   _c1|  avg(_c8-c2_unix)|min(_c3_float)_bs|min(_c3_float)_ec|count|
+------+------------------+-----------------+-----------------+-----+
|PG0144|193.84615384615384|         203300.0|          67800.0|  222|
|PG0325|             160.0|          91700.0|          30600.0|   11|
|PG0088|             280.0|          92200.0|          30700.0|   41|
|PG0254|             894.0|              0.0|           3100.0|    0|
|PG0278|            1894.0|          84000.0|          28000.0|  986|
|PG0503| 614.4827586206897|          67200.0|          22400.0|  291|
|PG0211| 543.8709677419355|          38900.0|          13000.0|  443|
|PG0383|             216.0|          10300.0|           3400.0|  181|
|PG0700| 589.2857142857143|              0.0|           8100.0|    0|
|PG0513|             546.0|              0.0|           5800.0|    0|
|PG0690|2262.8571428571427|          52300.0|          17400.0|  316|
|PG0412|            

In [42]:
# score = 1 / avg_delta + 0.0003 * (min business price + min economy price)
#         + 0.07 * business row count
# The three terms are built separately and summed left to right.
price_weight = 3 / 10000
seat_weight = 7 / 100

inverse_avg_delta = 1 / result_na['avg(_c8-c2_unix)']
price_term = price_weight * (result_na['min(_c3_float)_bs'] + result_na['min(_c3_float)_ec'])
seat_term = seat_weight * result_na['count']

final = result_na.withColumn('score', inverse_avg_delta + price_term + seat_term)
final.show()

+------+------------------+-----------------+-----------------+-----+------------------+
|   _c1|  avg(_c8-c2_unix)|min(_c3_float)_bs|min(_c3_float)_ec|count|             score|
+------+------------------+-----------------+-----------------+-----+------------------+
|PG0144|193.84615384615384|         203300.0|          67800.0|  222| 96.87515873015873|
|PG0325|             160.0|          91700.0|          30600.0|   11|          37.46625|
|PG0088|             280.0|          92200.0|          30700.0|   41| 39.74357142857142|
|PG0254|             894.0|              0.0|           3100.0|    0|0.9311185682326621|
|PG0278|            1894.0|          84000.0|          28000.0|  986|102.62052798310455|
|PG0503| 614.4827586206897|          67200.0|          22400.0|  291| 47.25162738496071|
|PG0211| 543.8709677419355|          38900.0|          13000.0|  443|46.581838671411624|
|PG0383|             216.0|          10300.0|           3400.0|  181| 16.78462962962963|
|PG0700| 589.28571428

In [43]:
# Only the flight number and its score are needed from here on; discard the
# intermediate feature columns in a single call.
final = final.drop(
    'avg(_c8-c2_unix)',
    'min(_c3_float)_bs',
    'min(_c3_float)_ec',
    'count',
)

final.show()

+------+------------------+
|   _c1|             score|
+------+------------------+
|PG0144| 96.87515873015873|
|PG0325|          37.46625|
|PG0088| 39.74357142857142|
|PG0254|0.9311185682326621|
|PG0278|102.62052798310455|
|PG0503| 47.25162738496071|
|PG0211|46.581838671411624|
|PG0383| 16.78462962962963|
|PG0700|2.4316969696969695|
|PG0513|1.7418315018315016|
|PG0690|43.030441919191915|
|PG0412|  83.7311037527594|
|PG0539|  44.1347619047619|
|PG0108| 82.90624999999999|
|PG0230| 86.42169696969697|
|PG0399| 1.770736497545008|
|PG0521| 53.35061425061426|
|PG0059|2.1312755102040817|
|PG0310| 7.414575163398692|
|PG0380|5.1303350083752095|
+------+------------------+
only showing top 20 rows



In [44]:
# Ten highest-scoring flight numbers, best first.
final.sort(f.col('score').desc()).show(10)

+------+------------------+
|   _c1|             score|
+------+------------------+
|PG0208|163.75100694444444|
|PG0209| 162.4217835178352|
|PG0357| 134.6211850152905|
|PG0356|133.29099928622412|
|PG0198|133.10102921646748|
|PG0199| 132.6816891891892|
|PG0222|116.43096021947873|
|PG0223|115.10178117048346|
|PG0703|110.72508771929824|
|PG0704|108.62098039215687|
+------+------------------+
only showing top 10 rows

