In [1]:
import h2o
import zipfile
import os
import sys
from pyspark.sql import SparkSession
from IPython.display import display
from pyspark.sql.functions import regexp_extract, col, split, udf, \
                                 trim, when, from_unixtime, unix_timestamp, minute, hour, datediff, lit, array
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType, BooleanType, ArrayType, StructType, StructField, LongType, TimestampType
import datetime
import argparse
import json
import glob, os, shutil
import pandas as pd
from pandas.io.json import json_normalize
from pyspark import SparkContext

# Show up to 99 columns when pandas frames are rendered in the notebook.
pd.options.display.max_columns = 99

# getOrCreate() hands back the live context when one already exists, so this
# cell can be re-run in the same kernel; a bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" on the second execution.
sc = SparkContext.getOrCreate()

# NOTE(review): "spark.some.config.option" looks like the placeholder from the
# Spark quick-start example -- confirm and drop it if nothing depends on it.
spark = SparkSession \
        .builder \
        .appName("Data ETL") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

display(spark.version)

'2.1.1'

# Load Data

In [2]:
# Version 1.0: read from a parquet file
flight = spark.read.parquet("/home/ubuntu/s3/comb/flight_v1_0.pq")
display(flight.count())
# DataFrame.show() prints the rows itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
flight.show(2)

txt_exception_folder = '/home/ubuntu/s3/comb/txt_exception/'
print(txt_exception_folder)

# Version 1.1: read as JSON from the text files matching the glob below
flightv1_1 = spark.read.json(os.path.join(txt_exception_folder, "flight_15_13_price_2017-05-11*.txt"))
display(flightv1_1.count())
flightv1_1.show(1)

2288103

+--------------+----------+---------+----------------+-------+------------+----+-------+------------+-------------+--------------------+-------------+--------------------+--------------------+--------+-----------+-------------+-----+--------------------+-----+------+----------+-----------+---------+----+-------------------+-----------+-----+----+
|from_city_name|start_date|stay_days|      table_name|task_id|to_city_name|trip|version|airline_code|airline_codes|            arr_time|check_bag_inc|             company|            dep_time|duration|flight_code|flight_number|index|               plane|power| price|price_code|search_date|span_days|stop|          stop_info|ticket_left|video|wifi|
+--------------+----------+---------+----------------+-------+------------+----+-------+------------+-------------+--------------------+-------------+--------------------+--------------------+--------+-----------+-------------+-----+--------------------+-----+------+----------+-----------+---------+--

None

/home/ubuntu/s3/comb/txt_exception/


74603

+------------+----------+--------------------+--------------------+--------+-----+----------+--------+------------------+-------+--------------------+--------------------+--------+----+--------------------+-------+
|currencyCode|   depDate|         flight_leg1|         flight_leg2|fromCity|price|searchDate|stayDays|         tableName|task_id|       timeline_leg1|       timeline_leg2|  toCity|trip|                 url|version|
+------------+----------+--------------------+--------------------+--------+-----+----------+--------+------------------+-------+--------------------+--------------------+--------+----+--------------------+-------+
|         AUD|2017-05-18|[[Hangzhou,HGH],2...|[[Bangkok,BKK],20...| Bangkok|401.3|2017-05-11|       7|flight_15_13_price|  16232|[[[Macau, Macau,M...|[[[Macau, Macau,M...|Hangzhou|   2|https://www.exped...|    1.1|
+------------+----------+--------------------+--------------------+--------+-----+----------+--------+------------------+-------+-----------

None

# Modify Version 1.0

In [3]:
# for one way trips, display None in stay_days
def correct_stay_days(trip, stay_days):
    """Return the stay length as an int, or None when there is no stay.

    Args:
        trip: trip-type flag; '1' marks a one-way trip. Compared through
            ``str()`` so an integer 1 is treated the same as the string '1'.
        stay_days: stay length as a number or numeric string; may be None.

    Returns:
        None for one-way trips or when ``stay_days`` is missing, otherwise
        ``int(stay_days)``.
    """
    if str(trip) == '1':
        return None
    # A missing value would make int() raise TypeError and abort the whole
    # Spark job from inside the UDF; surface it as a null instead.
    if stay_days is None:
        return None
    return int(stay_days)

# Spark UDF wrapper around correct_stay_days; declared return type is integer.
correct_stay_days_UDF = udf(correct_stay_days, IntegerType())

def correct_tickets_left(noOfTicketsLeft):
    """Replace a ticket count of 0 with 999; return any other value unchanged."""
    return 999 if noOfTicketsLeft == 0 else noOfTicketsLeft
    
# Spark UDF wrapper around correct_tickets_left; declared return type is integer.
correct_tickets_left_UDF = udf(correct_tickets_left, IntegerType())


# v1.0 column name -> name used in the reshaped frame.
v1_0_renames = [
    ('start_date', 'depDate'),
    ('from_city_name', 'fromCity'),
    ('to_city_name', 'toCity'),
    ('search_date', 'searchDate'),
    ('company', 'airlineName'),
    ('dep_time', 'departureTime'),
    ('arr_time', 'arrivalTime'),
    ('price_code', 'currencyCode'),
    ('stop', 'stops'),
    ('table_name', 'tableName'),
]

# Columns kept in flight2, in output order. Anything not listed here
# (stay_days, duration, flight_number, ticket_left, ...) is left out.
flight2_columns = [
    'price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode',
    'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
    'stayDays',
    'departureTime', 'arrivalTime',
    'airlineName', 'duration_m',
    'flight_code', 'plane', 'stops', 'noOfTicketsLeft',
    'airline_code', 'airline_codes',
    'stop_info', 'span_days', 'power', 'video', 'wifi',
]

flight_renamed = flight
for old_name, new_name in v1_0_renames:
    flight_renamed = flight_renamed.withColumnRenamed(old_name, new_name)

# duration is split on 'h': the piece before it is the hours, and the piece
# after it, up to the first 'm', is the minutes.
duration_parts = split(col('duration'), 'h')
duration_hours = duration_parts.getItem(0)
duration_minutes = F.substring_index(duration_parts.getItem(1), 'm', 1)

flight2 = (
    flight_renamed
    .withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stay_days')))
    # retDate is when the return trip starts; it might arrive a day later
    .withColumn('retDate', F.expr('date_add(depDate, stayDays)'))
    .withColumn('duration_m', duration_hours * 60 + duration_minutes)
    .withColumn('stop_info', split(col('stop_info'), ';'))
    .withColumn('noOfTicketsLeft', correct_tickets_left_UDF(col('ticket_left')))
    .select(*flight2_columns)
)
# variables added in v1.1: 'departureTime_leg2', 'arrivalTime_leg2', 'airlineName_leg2','duration_m_leg2','stops_leg2'
#  'noOfTicketsLeft_leg2','airline_codes_leg2', 
# 'stop_list', 'url'

# variables dropped in v1.1:
# 'span_days', 'power', 'video', 'wifi', 'stop_info'

# One sample row per trip type, then the resulting schema.
# show() and printSchema() print their own output and return None, so they are
# called directly; wrapping them in display() only adds a stray "None".
flight2.where(col('trip') == 1).show(1)
flight2.where(col('trip') == 2).show(1, truncate=False)
flight2.printSchema()

# flight2.select('flight_code', 'flight_number').distinct().show(1000)
# flight2.select('stop_info').distinct().show()
# flight2.select('stop_list').distinct().show(100, truncate=False)

+------+-------+----------+----------------+-------+------------+--------+-------+----+----------+-------+--------+--------------------+--------------------+--------------+----------+-----------+----------------+-----+---------------+------------+-------------+--------------------+---------+-----+-----+-----+
| price|version|searchDate|       tableName|task_id|currencyCode|fromCity| toCity|trip|   depDate|retDate|stayDays|       departureTime|         arrivalTime|   airlineName|duration_m|flight_code|           plane|stops|noOfTicketsLeft|airline_code|airline_codes|           stop_info|span_days|power|video| wifi|
+------+-------+----------+----------------+-------+------------+--------+-------+----+----------+-------+--------+--------------------+--------------------+--------------+----------+-----------+----------------+-----+---------------+------------+-------------+--------------------+---------+-----+-----+-----+
|605.72|    1.0|2017-05-01|flight_1_5_price|    676|         AUD|  

None

+-----+-------+----------+----------------+-------+------------+--------+-------+----+----------+----------+--------+-----------------------------+-----------------------------+--------------+----------+-----------+-----------------------------------+-----+---------------+------------+-------------+---------------------+---------+-----+-----+----+
|price|version|searchDate|tableName       |task_id|currencyCode|fromCity|toCity |trip|depDate   |retDate   |stayDays|departureTime                |arrivalTime                  |airlineName   |duration_m|flight_code|plane                              |stops|noOfTicketsLeft|airline_code|airline_codes|stop_info            |span_days|power|video|wifi|
+-----+-------+----------+----------------+-------+------------+--------+-------+----+----------+----------+--------+-----------------------------+-----------------------------+--------------+----------+-----------+-----------------------------------+-----+---------------+------------+-------------+

None

root
 |-- price: double (nullable = true)
 |-- version: string (nullable = true)
 |-- searchDate: string (nullable = true)
 |-- tableName: string (nullable = true)
 |-- task_id: string (nullable = true)
 |-- currencyCode: string (nullable = true)
 |-- fromCity: string (nullable = true)
 |-- toCity: string (nullable = true)
 |-- trip: string (nullable = true)
 |-- depDate: string (nullable = true)
 |-- retDate: date (nullable = true)
 |-- stayDays: integer (nullable = true)
 |-- departureTime: string (nullable = true)
 |-- arrivalTime: string (nullable = true)
 |-- airlineName: string (nullable = true)
 |-- duration_m: double (nullable = true)
 |-- flight_code: string (nullable = true)
 |-- plane: string (nullable = true)
 |-- stops: long (nullable = true)
 |-- noOfTicketsLeft: integer (nullable = true)
 |-- airline_code: string (nullable = true)
 |-- airline_codes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stop_info: array (nullable = true)
 |    |-- 

# Modify Version 1.1

In [4]:
# Preview two v1.1 records as loaded (nested flight_leg*/timeline_leg* columns) before reshaping.
flightv1_1.show(2)

+------------+----------+--------------------+--------------------+--------+-----+----------+--------+------------------+-------+--------------------+--------------------+--------+----+--------------------+-------+
|currencyCode|   depDate|         flight_leg1|         flight_leg2|fromCity|price|searchDate|stayDays|         tableName|task_id|       timeline_leg1|       timeline_leg2|  toCity|trip|                 url|version|
+------------+----------+--------------------+--------------------+--------+-----+----------+--------+------------------+-------+--------------------+--------------------+--------+----+--------------------+-------+
|         AUD|2017-05-18|[[Hangzhou,HGH],2...|[[Bangkok,BKK],20...| Bangkok|401.3|2017-05-11|       7|flight_15_13_price|  16232|[[[Macau, Macau,M...|[[[Macau, Macau,M...|Hangzhou|   2|https://www.exped...|    1.1|
|         AUD|2017-05-18|[[Hangzhou,HGH],2...|[[Bangkok,BKK],20...| Bangkok|401.3|2017-05-11|       7|flight_15_13_price|  16232|[[[Macau, M

In [6]:

# # take_all_level1_str = udf(lambda rows, a: [row[a] for row in rows], ArrayType(StringType()))
# take_all_level2_str = udf(lambda rows, a, b:  [None if row is None else row[a][b] for row in rows], ArrayType(StringType()))
# # take_all = udf(lambda rows, a: [row[a]['city'] for row in rows], ArrayType(StringType()))


# flightv1_1.withColumn("city", take_all_level2_str(flightv1_1.timeline_leg1, lit('arrivalAirport'), lit('city')))\
#                       .select('airports').show(10)
# # flightv1_1.withColumn("airports", take_all_level1_str(flightv1_1.timeline_leg1, 'type')).select('airports').show(10)
# # flightv1_1.withColumn("airports", take_all(flightv1_1.timeline_leg1, lit('arrivalAirport'))).select('airports').show(10)

# # display(flightv1_1.select('timeline_leg1').show(100, truncate=False))



In [None]:
# df.selectExpr("explode(check) as e").select("e.*").show()

# flightv1_1.selectExpr('explode(timeline_leg1) as e').select('e.*').show(truncate=False)

In [7]:
# print(
#     datetime.datetime.fromtimestamp(
#         int("1284101485")
#     ).strftime('%Y-%m-%d %H:%M:%S')
# )

# Pattern handed to F.unix_timestamp when parsing the leg departure/arrival strings.
# NOTE(review): the pattern has no zone field -- confirm how the source strings
# encode UTC offsets, since duration_m is the difference of two parsed values.
timeFmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"

def _take_all_level1(rows, a):
    """Collect field ``a`` from every entry of ``rows``.

    Returns None when ``rows`` is None; a None entry yields None in its slot.
    """
    if rows is None:
        return None
    return [None if row is None else row[a] for row in rows]


def _take_all_level2(rows, a, b):
    """Collect nested field ``row[a][b]`` from every entry of ``rows``.

    Returns None when ``rows`` is None. An entry that is None, or whose
    ``row[a]`` is None, yields None in its slot; indexing ``None[b]`` would
    otherwise raise TypeError inside the UDF.
    """
    if rows is None:
        return None
    return [None if row is None or row[a] is None else row[a][b] for row in rows]


# Spark UDFs over an array column; both are declared to return array<string>.
take_all_level2_str = udf(_take_all_level2, ArrayType(StringType()))
take_all_level1_str = udf(_take_all_level1, ArrayType(StringType()))
# take_all_level2_long = udf(lambda rows, a, b: None if rows is None else [None if row is None else datetime.datetime.fromtimestamp(row[a][b]) for row in rows], ArrayType(TimestampType()))

# airport = ArrayType(StructType([
#                                 StructField("airportCityState", StringType()),
#                                 StructField("city", StringType()),
#                                 StructField("code", StringType()),
#                                 StructField("localName", StringType()),
#                                 StructField("longName", StringType()),
#                                 StructField("name", StringType()),              
#           ]))

# take_all_airport = udf(lambda rows, a:  [None if row is None else row[a] for row in rows], ArrayType(airport))


# Flatten the nested v1.1 records into one wide frame:
#   1. per-leg summary columns taken from flight_leg1 / flight_leg2
#   2. per-timeline-entry arrays taken from timeline_leg1 / timeline_leg2
#   3. a final select that fixes the output columns and their order

# --- 1. per-leg summary columns ---------------------------------------------
leg_summary = (flightv1_1.withColumn('trip', col('trip').cast('string'))
                    .withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stayDays')))
                    # retDate is when the return trip starts; it might arrive a day later
                    .selectExpr('*', 'date_add(depDate, stayDays) as retDate')
                    .withColumn('airline_code', flightv1_1.flight_leg1.carrierSummary.airlineCodes.getItem(0))
                    .withColumn('airline_codes', flightv1_1.flight_leg1.carrierSummary.airlineCodes)
                    .withColumn('airline_codes_leg2', flightv1_1.flight_leg2.carrierSummary.airlineCodes)
                    .withColumn('departureTime', flightv1_1.flight_leg1.departureTime)
                    .withColumn('departureTime_leg2', flightv1_1.flight_leg2.departureTime)
                    .withColumn('arrivalTime', flightv1_1.flight_leg1.arrivalTime)
                    .withColumn('arrivalTime_leg2', flightv1_1.flight_leg2.arrivalTime)
                    .withColumn('airlineName', flightv1_1.flight_leg1.carrierSummary.airlineName)
                    .withColumn('airlineName_leg2', flightv1_1.flight_leg2.carrierSummary.airlineName)
                    # Leg duration in minutes: parsed arrival minus parsed departure (seconds) / 60.
                    # NOTE(review): timeFmt has no zone field -- confirm the UTC offsets in the
                    # source strings do not need to be accounted for in this difference.
                    .withColumn('duration_m', (F.unix_timestamp('arrivalTime', format=timeFmt) -
                                               F.unix_timestamp('departureTime', format=timeFmt))/60)
                    .withColumn('duration_m_leg2', (F.unix_timestamp('arrivalTime_leg2', format=timeFmt) -
                                                    F.unix_timestamp('departureTime_leg2', format=timeFmt))/60)
                    # flight_code = airline code + flight number of the first timeline entry of leg 1
                    .withColumn('airlineCode', flightv1_1.timeline_leg1.getItem(0).carrier.airlineCode)
                    .withColumn('flightNumber', flightv1_1.timeline_leg1.getItem(0).carrier.flightNumber.cast('string'))
                    .select('*', F.concat(col('airlineCode'), col('flightNumber')).alias('flight_code'))
                    .drop('airlineCode', 'flightNumber')
                    .withColumn('plane', flightv1_1.timeline_leg1.getItem(0).carrier.plane)
                    .withColumn('stops', flightv1_1.flight_leg1.stops)
                    .withColumn('stops_leg2', flightv1_1.flight_leg2.stops)
                    .withColumn('stop_list', flightv1_1.flight_leg1.stop_list)  # need to do more work
                    # Fixed: this column was previously filled from flight_leg1.stop_list (copy-paste).
                    .withColumn('stop_list_leg2', flightv1_1.flight_leg2.stop_list)
                    .withColumn('noOfTicketsLeft', correct_tickets_left_UDF(flightv1_1.flight_leg1.carrierSummary.noOfTicketsLeft))
                    .withColumn('noOfTicketsLeft_leg2', correct_tickets_left_UDF(flightv1_1.flight_leg2.carrierSummary.noOfTicketsLeft))

                    .withColumn('fromCityAirportCode', flightv1_1.flight_leg1.departureLocation.airportCode)
                    .withColumn('toCityAirportCode', flightv1_1.flight_leg1.arrivalLocation.airportCode)
                    .withColumn('fromCityAirportCode_leg2', flightv1_1.flight_leg2.departureLocation.airportCode)
                    .withColumn('toCityAirportCode_leg2', flightv1_1.flight_leg2.arrivalLocation.airportCode)

                    # carrier leg 1
                    .withColumn('carrierAirProviderId', flightv1_1.flight_leg1.carrierSummary.airProviderId)
                    .withColumn('carrierAirlineImageFileName', flightv1_1.flight_leg1.carrierSummary.airlineImageFileName)
                    .withColumn('carrierMixedCabinClass', flightv1_1.flight_leg1.carrierSummary.mixedCabinClass)
                    .withColumn('carrierMultiStop', flightv1_1.flight_leg1.carrierSummary.multiStop)
                    .withColumn('carrierNextDayArrival', flightv1_1.flight_leg1.carrierSummary.nextDayArrival)

                    # carrier leg 2
                    .withColumn('carrierAirProviderId_leg2', flightv1_1.flight_leg2.carrierSummary.airProviderId)
                    .withColumn('carrierAirlineImageFileName_leg2', flightv1_1.flight_leg2.carrierSummary.airlineImageFileName)
                    .withColumn('carrierMixedCabinClass_leg2', flightv1_1.flight_leg2.carrierSummary.mixedCabinClass)
                    .withColumn('carrierMultiStop_leg2', flightv1_1.flight_leg2.carrierSummary.multiStop)
                    .withColumn('carrierNextDayArrival_leg2', flightv1_1.flight_leg2.carrierSummary.nextDayArrival)
               )

summary_columns = [
    'price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode',
    'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
    'stayDays',
    'departureTime', 'arrivalTime', 'departureTime_leg2', 'arrivalTime_leg2',
    'airlineName', 'airlineName_leg2', 'duration_m', 'duration_m_leg2',
    'flight_code', 'plane', 'stops', 'stops_leg2', 'stop_list', 'stop_list_leg2',
    'noOfTicketsLeft', 'noOfTicketsLeft_leg2',
    'airline_code', 'airline_codes', 'airline_codes_leg2',
    'url', 'fromCityAirportCode', 'toCityAirportCode', 'fromCityAirportCode_leg2', 'toCityAirportCode_leg2',
    'carrierAirProviderId', 'carrierAirlineImageFileName', 'carrierMixedCabinClass', 'carrierMultiStop', 'carrierNextDayArrival',
    'carrierAirProviderId_leg2', 'carrierAirlineImageFileName_leg2', 'carrierMixedCabinClass_leg2', 'carrierMultiStop_leg2', 'carrierNextDayArrival_leg2',
]

# --- 2. per-timeline-entry arrays -------------------------------------------
# (output column stem, nested field) read from each departure/arrival airport struct.
airport_fields = [
    ('cityState', 'airportCityState'),
    ('city', 'city'),
    ('code', 'code'),
    ('localName', 'localName'),
    ('longName', 'longName'),
    ('name', 'name'),
]

# (output column, timeline field, nested field); nested field None means the
# timeline field itself is collected (take_all_level1_str).
timeline_fields = []
for endpoint in ('departure', 'arrival'):
    for stem, nested in airport_fields:
        timeline_fields.append(('timeline_%sAirport_%s' % (endpoint, stem), endpoint + 'Airport', nested))
    timeline_fields.append(('timeline_%sTime' % endpoint, endpoint + 'Time', 'isoStr'))
timeline_fields += [
    ('timeline_distance', 'distance', 'formattedTotal'),
    ('timeline_plane', 'carrier', 'plane'),
    ('timeline_brandedFareName', 'brandedFareName', None),
    ('timeline_type', 'type', None),
]

# Generating both legs from the same table keeps leg 1 and leg 2 in step; the
# leg-2 columns carry the '_leg2' suffix.
timeline_columns = []
with_timeline = leg_summary
for suffix, timeline in (('', flightv1_1.timeline_leg1), ('_leg2', flightv1_1.timeline_leg2)):
    for column_name, field, nested in timeline_fields:
        if nested is None:
            extracted = take_all_level1_str(timeline, lit(field))
        else:
            extracted = take_all_level2_str(timeline, lit(field), lit(nested))
        with_timeline = with_timeline.withColumn(column_name + suffix, extracted)
        timeline_columns.append(column_name + suffix)

# --- 3. final column selection ----------------------------------------------
flightv1_1_2 = with_timeline.select(*(summary_columns + timeline_columns))

# One sample row per trip type, then the resulting schema.
# show() and printSchema() print their own output and return None, so they are
# called directly; wrapping them in display() only adds a stray "None".
flightv1_1_2.where(col('trip')=='1').show(1)
flightv1_1_2.where(col('trip')=='2').show(1)
flightv1_1_2.printSchema()

# display(flightv1_1.select("flight_leg1.stop_list.airport").show(100, truncate=False))
        
#      |         flight_leg1|flight_leg2|| |||   airline_code      ||       timeline_leg1|timeline_leg2|  |                 |||||       ||
               

# flightv1_1_2.show(3)

# temp = flightv1_1.select("flight_leg1.stop_list").show(100, truncate=False)
# flightv1_1_2.printSchema()         
# flightv1_1_2.crosstab('noOfTicketsLeft', 'noOfTicketsLeft_leg2')
# flightv1_1_2.cube('trip', flightv1_1_2.noOfTicketsLeft, flightv1_1_2.noOfTicketsLeft_leg2).count().orderBy('trip', "noOfTicketsLeft", "noOfTicketsLeft_leg2").show(truncate=False)

+------+-------+----------+------------------+-------+------------+--------+--------+----+----------+-------+--------+--------------------+--------------------+------------------+----------------+------------+----------------+----------+---------------+-----------+-----+-----+----------+---------+--------------+---------------+--------------------+------------+-------------+------------------+--------------------+-------------------+-----------------+------------------------+----------------------+--------------------+---------------------------+----------------------+----------------+---------------------+-------------------------+--------------------------------+---------------------------+---------------------+--------------------------+-----------------------------------+------------------------------+------------------------------+-----------------------------------+----------------------------------+------------------------------+----------------------+---------------------------

None

+-----+-------+----------+------------------+-------+------------+--------+--------+----+----------+----------+--------+--------------------+--------------------+--------------------+--------------------+-----------------+-----------------+----------+---------------+-----------+-----------+-----+----------+--------------------+--------------------+---------------+--------------------+------------+-------------+------------------+--------------------+-------------------+-----------------+------------------------+----------------------+--------------------+---------------------------+----------------------+----------------+---------------------+-------------------------+--------------------------------+---------------------------+---------------------+--------------------------+-----------------------------------+------------------------------+------------------------------+-----------------------------------+----------------------------------+------------------------------+-------------

None

root
 |-- price: double (nullable = true)
 |-- version: string (nullable = true)
 |-- searchDate: string (nullable = true)
 |-- tableName: string (nullable = true)
 |-- task_id: long (nullable = true)
 |-- currencyCode: string (nullable = true)
 |-- fromCity: string (nullable = true)
 |-- toCity: string (nullable = true)
 |-- trip: string (nullable = true)
 |-- depDate: string (nullable = true)
 |-- retDate: date (nullable = true)
 |-- stayDays: integer (nullable = true)
 |-- departureTime: string (nullable = true)
 |-- arrivalTime: string (nullable = true)
 |-- departureTime_leg2: string (nullable = true)
 |-- arrivalTime_leg2: string (nullable = true)
 |-- airlineName: string (nullable = true)
 |-- airlineName_leg2: string (nullable = true)
 |-- duration_m: double (nullable = true)
 |-- duration_m_leg2: double (nullable = true)
 |-- flight_code: string (nullable = true)
 |-- plane: string (nullable = true)
 |-- stops: long (nullable = true)
 |-- stops_leg2: long (nullable = true)
 |-

None

In [None]:
# flightv1_1_2.select('timeline_departureTime', 'departureTime', 'departureTime_leg2').show(2, truncate=False)


In [None]:
# flightv1_1.select('timeline_leg1').show(truncate=False)

In [None]:
# datetime.datetime.fromtimestamp(1495084500000/3600)

In [None]:
# from datetime import datetime
# dt = datetime.now()
# dt.microsecond

In [None]:
# import datetime
# print(
#     datetime.datetime.fromtimestamp(
#         int("1495084500")
#     ).strftime('%d/%b/%Y:%H:%M:%S %z')
# )
