In [88]:
import time

from datetime import datetime as dt
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import unix_timestamp



In [73]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

spark = SparkSession \
    .builder \
    .appName("Python Spark Assignment") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
    
sc = spark.sparkContext
rdd = sc.textFile("/data/lsml/4-5-spark/flights.csv")

In [114]:
print(rdd.count())
print(rdd.take(1))

33121
['1185,PG0134,2017-09-10 09:50:00+03,2017-09-10 14:55:00+03,DME,BTK,Scheduled,319,,']


In [115]:
def show_head(rdd, cnt=5):
    for line in rdd.take(cnt):
        if isinstance(line, list):
            print(line)
        else:
            print([val for val in line.split(',')])

In [116]:
show_head(rdd)

['1185', 'PG0134', '2017-09-10 09:50:00+03', '2017-09-10 14:55:00+03', 'DME', 'BTK', 'Scheduled', '319', '', '']
['3979', 'PG0052', '2017-08-25 14:50:00+03', '2017-08-25 17:35:00+03', 'VKO', 'HMA', 'Scheduled', 'CR2', '', '']
['4739', 'PG0561', '2017-09-05 12:30:00+03', '2017-09-05 14:15:00+03', 'VKO', 'AER', 'Scheduled', '763', '', '']
['5502', 'PG0529', '2017-09-12 09:50:00+03', '2017-09-12 11:20:00+03', 'SVO', 'UFA', 'Scheduled', '763', '', '']
['6938', 'PG0461', '2017-09-04 12:25:00+03', '2017-09-04 13:20:00+03', 'SVO', 'ULV', 'Scheduled', 'SU9', '', '']


In [128]:
rdd2 = rdd.map(
    lambda line:
        line.split(',')
).filter(
    lambda line:
        line[4] == 'SVO' and 
        line[6] == 'Arrived'
)

print(rdd2.count())
print(show_head(rdd2))

1503
['4947', 'PG0468', '2017-08-07 13:15:00+03', '2017-08-07 14:05:00+03', 'SVO', 'LED', 'Arrived', '321', '2017-08-07 13:16:00+03', '2017-08-07 14:06:00+03']
['4950', 'PG0469', '2017-08-07 12:35:00+03', '2017-08-07 13:25:00+03', 'SVO', 'LED', 'Arrived', '321', '2017-08-07 12:39:00+03', '2017-08-07 13:28:00+03']
['4951', 'PG0470', '2017-08-07 10:20:00+03', '2017-08-07 11:10:00+03', 'SVO', 'LED', 'Arrived', '321', '2017-08-07 10:23:00+03', '2017-08-07 11:12:00+03']
['4952', 'PG0471', '2017-08-07 18:40:00+03', '2017-08-07 19:30:00+03', 'SVO', 'LED', 'Arrived', '321', '2017-08-07 18:43:00+03', '2017-08-07 19:34:00+03']
['4953', 'PG0472', '2017-08-07 18:30:00+03', '2017-08-07 19:20:00+03', 'SVO', 'LED', 'Arrived', '321', '2017-08-07 18:33:00+03', '2017-08-07 19:23:00+03']
None


In [140]:
rdd3 = rdd2.map(
    lambda line: 
        [line[7],
         dt.strptime(line[2] + '00', '%Y-%m-%d %H:%M:%S%z').timestamp() - \
         dt.strptime(line[8] + '00', '%Y-%m-%d %H:%M:%S%z').timestamp() >= 0,
         line[2],
         line[8]]
).filter(
    lambda line:
        line[1] is True
)

print(rdd3.count())
print(rdd3.take(5))

75
[['321', True, '2017-07-29 10:20:00+03', '2017-07-29 10:20:00+03'], ['321', True, '2017-07-29 12:35:00+03', '2017-07-29 12:35:00+03'], ['321', True, '2017-08-08 13:15:00+03', '2017-08-08 13:15:00+03'], ['321', True, '2017-08-05 18:30:00+03', '2017-08-05 18:30:00+03'], ['321', True, '2017-08-09 13:15:00+03', '2017-08-09 13:15:00+03']]


In [129]:
rdd4 = rdd3.map(
    lambda line: 
        (line[0], 
         int(line[1]))
)

print(rdd4.count())
print(rdd4.take(5))

75
[('321', 1), ('321', 1), ('321', 1), ('321', 1), ('321', 1)]


In [130]:
rdd5 = rdd4.reduceByKey(lambda a, b: a + b).sortBy(lambda x: x[1], ascending = False)

print(rdd5.take(100))

[('SU9', 28), ('CR2', 12), ('321', 11), ('773', 8), ('CN1', 6), ('733', 4), ('319', 4), ('763', 2)]


In [131]:
rdd6 = rdd5.map(lambda x: x[0] + ' ' + str(x[1]))

In [132]:
for line in rdd5.collect():
    print(line)

('SU9', 28)
('CR2', 12)
('321', 11)
('773', 8)
('CN1', 6)
('733', 4)
('319', 4)
('763', 2)
