In [1]:
import h2o
import zipfile
import os
import sys
from pyspark.sql import SparkSession
from IPython.display import display
from pyspark.sql.functions import regexp_extract, col, split, udf, \
                                 trim, when, from_unixtime, unix_timestamp, minute, hour, datediff, lit, array,\
                                 to_date
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType, BooleanType, ArrayType, StructType, StructField, LongType, TimestampType
import datetime
import argparse
import json
import glob, os, shutil
import pandas as pd
from pandas.io.json import json_normalize
from pyspark import SparkContext
from time import sleep
from math import floor


# Let pandas render up to 99 columns so wide flight frames are not elided.
pd.options.display.max_columns = 99

# getOrCreate() reuses a SparkContext that is already running, so re-executing
# this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# The session attaches to the context above (or to an already active session).
spark = (
    SparkSession.builder
    .appName("Data ETL")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

display(spark.version)

'2.1.1'

# Load Data

In [2]:
# ! ls /home/ubuntu/s3/comb/txt_exception/ -l
# 826736 text files

In [3]:
# txt_exception_folder = '/home/ubuntu/s3/comb/txt_exception/'
# print(txt_exception_folder)

# # Version 1.1
# # flightv1_1 = spark.read.json(os.path.join(txt_exception_folder, "flight_15_13_price_2017-05-11*.txt"))
# flightv1_1 = spark.read.json(os.path.join(txt_exception_folder, "*.txt"))
# display(flightv1_1.count())
# display(flightv1_1.show(1))

# Define UDFs

In [4]:
# for one way trips, display None in stay_days
def correct_stay_days(trip, stay_days):
    """Return the stay length as an int, or None when there is nothing to report.

    Args:
        trip: Trip-type code; the string '1' is treated as a one-way trip.
        stay_days: Stay length as an int or a numeric string. May be None.

    Returns:
        None when ``trip`` is '1' or ``stay_days`` is None, otherwise
        ``int(stay_days)``.

    Raises:
        ValueError: If ``stay_days`` is a string that is not an integer literal.
    """
    # A missing stay_days on a non-one-way row used to hit int(None) and raise
    # TypeError, which fails the whole Spark job when this runs as a UDF.
    if trip == '1' or stay_days is None:
        return None
    return int(stay_days)

# Spark UDF wrapper around correct_stay_days, declared to return an integer column.
correct_stay_days_UDF = udf(f=correct_stay_days, returnType=IntegerType())

def correct_tickets_left(noOfTicketsLeft):
    """Replace a ticket count of 0 with 99; pass every other value through unchanged.

    NOTE(review): presumably 0 means "no scarcity figure was reported" and 99 is
    a sentinel for "plenty left" -- confirm against the data source.
    """
    return 99 if noOfTicketsLeft == 0 else noOfTicketsLeft
    
# Spark UDF wrapper around correct_tickets_left, declared to return an integer column.
correct_tickets_left_UDF = udf(f=correct_tickets_left, returnType=IntegerType())

# Timestamp pattern passed to unix_timestamp when flight durations are derived.
# NOTE(review): the pattern has no zone designator, while the sample timestamps
# shown in this notebook end in an offset such as "+10:00" -- confirm whether the
# offset is honoured; if it is ignored, durations across time zones are skewed.
timeFmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"


def _take_level2(rows, a, b):
    """Collect row[a][b] from every row; None for a null row or a null row[a]."""
    if rows is None:
        return None
    picked = []
    for row in rows:
        parent = None if row is None else row[a]
        # A null nested value used to raise TypeError on parent[b] and fail the job.
        picked.append(None if parent is None else parent[b])
    return picked


def _take_level1(rows, a):
    """Collect row[a] from every row; None for a null row."""
    if rows is None:
        return None
    return [None if row is None else row[a] for row in rows]


# UDFs that pull one (nested) field out of every element of an array column.
# Both are declared to return array<string>; a null input array stays null.
take_all_level2_str = udf(_take_level2, ArrayType(StringType()))
take_all_level1_str = udf(_take_level1, ArrayType(StringType()))

# Modify version 1.0

In [3]:
# Version 1.0: load the v1.0 parquet data set and take a first look at it.
flight = spark.read.parquet("/home/ubuntu/s3/comb/flight_v1_0.pq")
display(flight.count())
# show() prints the rows itself and returns None, so wrapping it in display()
# only appended a stray "None" to the cell output.
flight.show(2)


def _stop_durations(rows):
    """Extract the duration part of each stop entry.

    An entry such as "Auckland(AKL):2h35m" becomes "2h:35m". A null array stays
    null; a null entry, or an entry without a ":" separator, becomes None.
    """
    if rows is None:
        return None
    durations = []
    for row in rows:
        if row is None or ":" not in row:
            # Without this guard split(":", 1)[1] raises IndexError on entries
            # that carry no duration (for example an empty string).
            durations.append(None)
        else:
            durations.append(row.split(":", 1)[1].replace("h", "h:"))
    return durations


take_all_duration_UDF = udf(_stop_durations, ArrayType(StringType()))
# NOTE(review): the original author noted "couldn't get it to work" and the
# stop_duration column that would use this UDF is still commented out below.
# Entries without a ":" are a possible cause -- verify before enabling it.

# Reshape the v1.0 frame towards the v1.1 column names and types:
#   * snake_case source columns are renamed to the camelCase names used by v1.1,
#   * date strings are converted with to_date,
#   * the textual duration is converted to total minutes (duration_m),
#   * stop_info is split on ';' into an array,
#   * the final select fixes the output column set and order.
flight2 = (flight.withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stay_days')))
                 .drop('stay_days')
                 .withColumnRenamed('start_date', 'depDate')
                 .withColumn('depDate', to_date('depDate'))
                 .selectExpr('*', 'date_add(depDate, stayDays) as retDate')# this is when the return trip starts, might arrive a day later
                 .withColumnRenamed('from_city_name', 'fromCity')
                 .withColumnRenamed('to_city_name', 'toCity')
                 .withColumnRenamed('search_date', 'searchDate')
                 .withColumn('searchDate', to_date('searchDate'))
                 .withColumnRenamed('company', 'airlineName')
                 .withColumnRenamed('dep_time', 'departureTime')
                 .withColumnRenamed('arr_time', 'arrivalTime')
                 # duration_h: text before the first 'h'; duration_m: text between 'h' and 'm'
                 .withColumn('duration_h', split(flight.duration,'h').getItem(0))
                 .withColumn('duration_m', F.substring_index(split(flight.duration,'h').getItem(1), 'm', 1))
#                  .withColumn('duration', F.struct(col('duration_h'), col('duration_m')))
                 # total minutes; the printed schema below reports this column as double
                 .withColumn('duration_m', (col('duration_h')*60 + col('duration_m')))
                 .drop('duration', 'duration_h', 'flight_number')
                 .withColumnRenamed('price_code', 'currencyCode')
                 .withColumnRenamed('stop', 'stops')
                 .withColumn('stops', col('stops').cast('byte'))
                 # one array element per ';'-separated stop entry
                 .withColumn('stop_info', split(col('stop_info'), ';'))
#                  .withColumn('stop_duration', take_all_duration_UDF(col('stop_info')))
                 # ticket_left == 0 is mapped to 99 by the UDF, then stored as a byte
                 .withColumn('noOfTicketsLeft', correct_tickets_left_UDF('ticket_left'))
                 .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte'))
                .drop('ticket_left')
               .withColumnRenamed('table_name', 'tableName')
               .withColumn('task_id', col('task_id').cast('long'))
               .withColumn('span_days', col('span_days').cast('integer'))
                # keep only the columns listed here, in this order
                .select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode',
                        'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
                        'stayDays',
                       'departureTime', 'arrivalTime',
                        'airlineName',  'duration_m',
                        'flight_code', 'plane', 'stops', 'noOfTicketsLeft',
                       'airline_code', 'airline_codes',
                       'stop_info', 'span_days', 'power', 'video', 'wifi')                #'stop_duration',
          )
# variables added in v1.1: 'departureTime_leg2', 'arrivalTime_leg2', 'airlineName_leg2','duration_m_leg2','stops_leg2'
#  'noOfTicketsLeft_leg2','airline_codes_leg2', 
# 'stop_list', 'url'

# variables dropped in v1.1:
# '', 'power', 'video', 'wifi', 'stop_info'

# Inspect one row per trip code and the resulting schema.
# show() prints directly and returns None; calling it bare avoids the extra
# "None" that display(...) used to add to the output.
flight2.where(col('trip') == 1).show(1)
flight2.where(col('trip') == 2).show(1, truncate=False)
flight2.printSchema()

# flight2.select('flight_code', 'flight_number').distinct().show(1000)
# flight2.select('stop_info').distinct().show()
# flight2.select('stop_list').distinct().show(100, truncate=False)

2288103

+--------------+----------+---------+----------------+-------+------------+----+-------+------------+-------------+--------------------+-------------+--------------------+--------------------+--------+-----------+-------------+-----+--------------------+-----+------+----------+-----------+---------+----+-------------------+-----------+-----+----+
|from_city_name|start_date|stay_days|      table_name|task_id|to_city_name|trip|version|airline_code|airline_codes|            arr_time|check_bag_inc|             company|            dep_time|duration|flight_code|flight_number|index|               plane|power| price|price_code|search_date|span_days|stop|          stop_info|ticket_left|video|wifi|
+--------------+----------+---------+----------------+-------+------------+----+-------+------------+-------------+--------------------+-------------+--------------------+--------------------+--------+-----------+-------------+-----+--------------------+-----+------+----------+-----------+---------+--

None

+------+-------+----------+----------------+-------+------------+--------+-------+----+----------+-------+--------+--------------------+--------------------+--------------+----------+-----------+----------------+-----+---------------+------------+-------------+--------------------+---------+-----+-----+-----+
| price|version|searchDate|       tableName|task_id|currencyCode|fromCity| toCity|trip|   depDate|retDate|stayDays|       departureTime|         arrivalTime|   airlineName|duration_m|flight_code|           plane|stops|noOfTicketsLeft|airline_code|airline_codes|           stop_info|span_days|power|video| wifi|
+------+-------+----------+----------------+-------+------------+--------+-------+----+----------+-------+--------+--------------------+--------------------+--------------+----------+-----------+----------------+-----+---------------+------------+-------------+--------------------+---------+-----+-----+-----+
|605.72|    1.0|2017-05-01|flight_1_5_price|    676|         AUD|  

None

+-----+-------+----------+----------------+-------+------------+--------+-------+----+----------+----------+--------+-----------------------------+-----------------------------+--------------+----------+-----------+-----------------------------------+-----+---------------+------------+-------------+---------------------+---------+-----+-----+----+
|price|version|searchDate|tableName       |task_id|currencyCode|fromCity|toCity |trip|depDate   |retDate   |stayDays|departureTime                |arrivalTime                  |airlineName   |duration_m|flight_code|plane                              |stops|noOfTicketsLeft|airline_code|airline_codes|stop_info            |span_days|power|video|wifi|
+-----+-------+----------+----------------+-------+------------+--------+-------+----+----------+----------+--------+-----------------------------+-----------------------------+--------------+----------+-----------+-----------------------------------+-----+---------------+------------+-------------+

None

root
 |-- price: double (nullable = true)
 |-- version: string (nullable = true)
 |-- searchDate: date (nullable = true)
 |-- tableName: string (nullable = true)
 |-- task_id: long (nullable = true)
 |-- currencyCode: string (nullable = true)
 |-- fromCity: string (nullable = true)
 |-- toCity: string (nullable = true)
 |-- trip: string (nullable = true)
 |-- depDate: date (nullable = true)
 |-- retDate: date (nullable = true)
 |-- stayDays: integer (nullable = true)
 |-- departureTime: string (nullable = true)
 |-- arrivalTime: string (nullable = true)
 |-- airlineName: string (nullable = true)
 |-- duration_m: double (nullable = true)
 |-- flight_code: string (nullable = true)
 |-- plane: string (nullable = true)
 |-- stops: byte (nullable = true)
 |-- noOfTicketsLeft: byte (nullable = true)
 |-- airline_code: string (nullable = true)
 |-- airline_codes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stop_info: array (nullable = true)
 |    |-- element: 

In [6]:
# flight2.repartition(1).write.mode('append').parquet(os.path.join("/home/ubuntu/s3/pq_v1_1/", "flight_v1_1a"))
# Persist the reshaped v1.0 frame from a single partition, replacing any
# previous output at the target path.
(flight2
    .repartition(1)
    .write
    .mode('overwrite')
    .parquet(os.path.join("/home/ubuntu/s3/pq_v1_1/", "flight.pq")))

In [7]:
# Read the parquet output back and preview the first three rows as a pandas frame.
(spark.read
      .parquet("/home/ubuntu/s3/pq_v1_1/" + "flight.pq")
      .limit(3)
      .toPandas())

Unnamed: 0,price,version,searchDate,tableName,task_id,currencyCode,fromCity,toCity,trip,depDate,retDate,stayDays,departureTime,arrivalTime,airlineName,duration_m,flight_code,plane,stops,noOfTicketsLeft,airline_code,airline_codes,stop_info,span_days,power,video,wifi
0,0.0,1.0,2017-05-08,flight_1_5_price,620,,sydney,beijing,2,2017-09-09,2017-10-07,28,2017-09-09T11:15:00.000+10:00,2017-09-10T04:10:00.000+08:00,Qantas Airways,1135.0,QF145,BOEING 737-800 (WINGLETS) PASSENGER,1,99,QF,"[QF, CA]",[Auckland(AKL):2h35m],0,,,
1,472.14,1.0,2017-05-08,flight_1_5_price,620,AUD,sydney,beijing,2,2017-09-09,2017-10-07,28,2017-09-09T20:50:00.000+10:00,2017-09-10T17:25:00.000+08:00,China Eastern Airlines,1355.0,MU778,AIRBUS INDUSTRIE A330-200,1,4,MU,"[MU, MU]",[Kunming(KMG):8h30m],0,,,
2,1095.74,1.0,2017-05-08,flight_1_5_price,620,AUD,sydney,beijing,2,2017-09-09,2017-10-07,28,2017-09-09T10:00:00.000+10:00,2017-09-10T06:55:00.000+08:00,Garuda Indonesia,1375.0,GA715,AIRBUS INDUSTRIE A330-300,1,9,GA,"[GA, GA]",[Denpasar(DPS):9h0m],0,,,


In [8]:
# flight2.select('stop_info').distinct().show(100, truncate=False)
# # flight2.sample(False, 0.001, 42).toPandas()
# flight2.limit(10).toPandas()

# Modify Version 1.1

In [9]:
# # take_all_level1_str = udf(lambda rows, a: [row[a] for row in rows], ArrayType(StringType()))
# take_all_level2_str = udf(lambda rows, a, b:  [None if row is None else row[a][b] for row in rows], ArrayType(StringType()))
# # take_all = udf(lambda rows, a: [row[a]['city'] for row in rows], ArrayType(StringType()))


# flightv1_1.withColumn("city", take_all_level2_str(flightv1_1.timeline_leg1, lit('arrivalAirport'), lit('city')))\
#                       .select('airports').show(10)
# # flightv1_1.withColumn("airports", take_all_level1_str(flightv1_1.timeline_leg1, 'type')).select('airports').show(10)
# # flightv1_1.withColumn("airports", take_all(flightv1_1.timeline_leg1, lit('arrivalAirport'))).select('airports').show(10)

# # display(flightv1_1.select('timeline_leg1').show(100, truncate=False))
# df.selectExpr("explode(check) as e").select("e.*").show()

# flightv1_1.selectExpr('explode(timeline_leg1) as e').select('e.*').show(truncate=False)

### Need to split stop list into duration and make it compatible with v1.0 - Can't figure out how to do this for v1.0

# Main function to convert text files to parquet

In [5]:
# (output column name, carrierSummary field) pairs, in output order.
_CARRIER_FIELDS = (
    ('carrierAirProviderId', 'airProviderId'),
    ('carrierAirlineImageFileName', 'airlineImageFileName'),
    ('carrierMixedCabinClass', 'mixedCabinClass'),
    ('carrierMultiStop', 'multiStop'),
    ('carrierNextDayArrival', 'nextDayArrival'),
)

# (output-name stem, field read from a timeline airport struct) pairs, in output order.
_AIRPORT_FIELDS = (
    ('cityState', 'airportCityState'),
    ('city', 'city'),
    ('code', 'code'),
    ('localName', 'localName'),
    ('longName', 'longName'),
    ('name', 'name'),
)


def _carrier_columns(leg, suffix):
    """Return ordered (name, Column) pairs read from one leg's carrierSummary struct."""
    return [(name + suffix, leg.carrierSummary.getField(field))
            for name, field in _CARRIER_FIELDS]


def _timeline_columns(timeline, suffix):
    """Return ordered (name, Column) pairs extracted from one leg's timeline array."""
    nested = []
    for side in ('departure', 'arrival'):
        airport = side + 'Airport'
        for stem, field in _AIRPORT_FIELDS:
            nested.append(('timeline_' + airport + '_' + stem, airport, field))
        nested.append(('timeline_' + side + 'Time', side + 'Time', 'isoStr'))
    nested.append(('timeline_distance', 'distance', 'formattedTotal'))
    nested.append(('timeline_plane', 'carrier', 'plane'))
    direct = (('timeline_brandedFareName', 'brandedFareName'),
              ('timeline_type', 'type'))

    columns = [(name + suffix, take_all_level2_str(timeline, lit(outer), lit(inner)))
               for name, outer, inner in nested]
    columns += [(name + suffix, take_all_level1_str(timeline, lit(field)))
                for name, field in direct]
    return columns


def txtToPq(inputFolder, pqFolder, pqFileName, searchString = "*.txt", append = True):
    """
    Read all JSON-lines txt files matching a pattern, flatten them into the
    v1.1 column layout, and write the result as parquet.
    @params:
        inputFolder   - Required  : input folder that contains json line txt files (Str)
        pqFolder      - Required  : folder to save the parquet files into (Str)
        pqFileName    - Required  : parquet file name (Str)
        searchString  - Optional  : search string that identifies all the json line text files (Str)
        append        - Optional  : True appends to an existing parquet; False writes with the
                                    writer's default save mode, i.e. creates a new parquet (Bool)
    """

    flightv1_1 = spark.read.json(os.path.join(inputFolder, searchString))

    leg1 = flightv1_1.flight_leg1
    leg2 = flightv1_1.flight_leg2
    first_segment = flightv1_1.timeline_leg1.getItem(0)

    # Scalar, per-leg columns.
    flat = (flightv1_1.withColumn('trip', col('trip').cast('string'))
            .withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stayDays')))
            .withColumn('depDate', to_date('depDate'))
            .withColumn('searchDate', to_date('searchDate'))
            # this is when the return trip starts, might arrive a day later
            .selectExpr('*', 'date_add(depDate, stayDays) as retDate')
            .withColumn('airline_code', leg1.carrierSummary.airlineCodes.getItem(0))
            .withColumn('airline_codes', leg1.carrierSummary.airlineCodes)
            .withColumn('airline_codes_leg2', leg2.carrierSummary.airlineCodes)
            .withColumn('departureTime', leg1.departureTime)
            .withColumn('departureTime_leg2', leg2.departureTime)
            .withColumn('arrivalTime', leg1.arrivalTime)
            .withColumn('arrivalTime_leg2', leg2.arrivalTime)
            .withColumn('airlineName', leg1.carrierSummary.airlineName)
            .withColumn('airlineName_leg2', leg2.carrierSummary.airlineName)
            # Duration in minutes = (arrival - departure) seconds / 60.
            # NOTE(review): timeFmt carries no zone designator; if these strings
            # end in an offset (e.g. "+10:00") confirm it is honoured, otherwise
            # durations across time zones are off by the offset difference.
            .withColumn('duration_m', (F.unix_timestamp('arrivalTime', format=timeFmt) -
                                       F.unix_timestamp('departureTime', format=timeFmt))/60)
            .withColumn('duration_m_leg2', (F.unix_timestamp('arrivalTime_leg2', format=timeFmt) -
                                            F.unix_timestamp('departureTime_leg2', format=timeFmt))/60)
            # flight_code = airline code + flight number of the first timeline entry of leg 1
            .withColumn('airlineCode', first_segment.carrier.airlineCode)
            .withColumn('flightNumber', first_segment.carrier.flightNumber.cast('string'))
            .select('*', F.concat(col('airlineCode'), col('flightNumber')).alias('flight_code'))
            .drop('airlineCode', 'flightNumber')
            .withColumn('plane', first_segment.carrier.plane)
            .withColumn('stops', leg1.stops.cast('byte'))
            .withColumn('stops_leg2', leg2.stops.cast('byte'))
            .withColumn('stop_airport', take_all_level1_str(leg1.stop_list, lit('airport')))
            .withColumn('stop_duration', take_all_level1_str(leg1.stop_list, lit('duration')))
            .withColumn('stop_airport_leg2', take_all_level1_str(leg2.stop_list, lit('airport')))
            .withColumn('stop_duration_leg2', take_all_level1_str(leg2.stop_list, lit('duration')))
            .withColumn('noOfTicketsLeft', correct_tickets_left_UDF(leg1.carrierSummary.noOfTicketsLeft))
            .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte'))
            .withColumn('noOfTicketsLeft_leg2', correct_tickets_left_UDF(leg2.carrierSummary.noOfTicketsLeft))
            .withColumn('noOfTicketsLeft_leg2', col('noOfTicketsLeft_leg2').cast('byte'))
            .withColumn('fromCityAirportCode', leg1.departureLocation.airportCode)
            .withColumn('toCityAirportCode', leg1.arrivalLocation.airportCode)
            .withColumn('fromCityAirportCode_leg2', leg2.departureLocation.airportCode)
            .withColumn('toCityAirportCode_leg2', leg2.arrivalLocation.airportCode))

    # Carrier-summary and timeline columns follow the same pattern for both
    # legs, so they are generated instead of being spelled out one by one.
    carrier_cols = _carrier_columns(leg1, '') + _carrier_columns(leg2, '_leg2')
    timeline_cols = (_timeline_columns(flightv1_1.timeline_leg1, '') +
                     _timeline_columns(flightv1_1.timeline_leg2, '_leg2'))
    for name, column in carrier_cols + timeline_cols:
        flat = flat.withColumn(name, column)

    # create variables dropped from v1.0
    flat = (flat.withColumn('span_days', lit(99))
                .withColumn('power', lit(False))
                .withColumn('video', lit(False))
                .withColumn('wifi', lit(False))
                # placeholder. can't figure out how to create struct literal
                .withColumn('stop_info', col('stop_airport')))

    base_cols = ['price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode',
                 'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
                 'stayDays',
                 'departureTime', 'arrivalTime', 'departureTime_leg2', 'arrivalTime_leg2',
                 'airlineName', 'airlineName_leg2', 'duration_m', 'duration_m_leg2',
                 'flight_code', 'plane', 'stops', 'stops_leg2',
                 'stop_airport', 'stop_duration', 'stop_airport_leg2', 'stop_duration_leg2',
                 'noOfTicketsLeft', 'noOfTicketsLeft_leg2',
                 'airline_code', 'airline_codes', 'airline_codes_leg2',
                 'url', 'fromCityAirportCode', 'toCityAirportCode',
                 'fromCityAirportCode_leg2', 'toCityAirportCode_leg2']
    # variables dropped from v1.0
    v1_0_cols = ['span_days', 'power', 'video', 'wifi', 'stop_info']

    # Output order: base columns, carrier leg 1 / leg 2, timeline leg 1 / leg 2, v1.0 columns.
    selected = (base_cols
                + [name for name, _ in carrier_cols]
                + [name for name, _ in timeline_cols]
                + v1_0_cols)
    flightv1_1_2 = flat.select(*selected)

    writer = flightv1_1_2.repartition(1).write
    if append:
        writer = writer.mode('append')
    writer.parquet(os.path.join(pqFolder, pqFileName))

In [6]:
# flight_leg1 is renamed to leg1 (and flight_leg2 to leg2)
def txtToPq_v2(inputFolder, pqFolder, pqFileName, searchString = "*.txt", append = True):
    """
    Convert JSON-lines text files into a flattened parquet dataset.

    Reads every file matching `searchString` under `inputFolder` with
    `spark.read.json`, derives flat columns from the nested `leg1`, `leg2`,
    `timeline1` and `timeline2` fields, keeps a fixed list of columns and
    writes the result to `pqFolder/pqFileName` as a single partition.

    Relies on names defined at notebook level rather than passed in: `spark`,
    `timeFmt`, `correct_stay_days_UDF`, `correct_tickets_left_UDF`,
    `take_all_level1_str` and `take_all_level2_str`.

    @params:
        inputFolder   - Required  : input folder that contains json line txt files (Str)
        pqFolder      - Required  : folder to save the parquet files into (Str)
        pqFileName    - Required  : parquet file name (Str)
        searchString  - Optional  : search string that identifies all the json line text files (Str)
        append        - Optional  : True appends to an existing parquet (write mode 'append');
                                    False writes without setting a write mode (Bool)
    """
    
    flightv1_1 = spark.read.json(os.path.join(inputFolder, searchString))
    
    # One long chain: add the derived columns first, then select the final column list.
    # Nested fields are always read from the raw frame (flightv1_1), not from the chain.
    flightv1_1_2 = (flightv1_1.withColumn('trip', col('trip').cast('string'))
                            # stayDays becomes null for one-way trips (trip == '1'), see correct_stay_days
                            .withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stayDays')))                    
                            .withColumn('depDate', to_date('depDate'))
                            .withColumn('searchDate', to_date('searchDate'))
                            .selectExpr('*', 'date_add(depDate, stayDays) as retDate')# this is when the return trip starts, might arrive a day later
                            .withColumn('airline_code', flightv1_1.leg1.carrierSummary.airlineCodes.getItem(0))                   
                            .withColumn('airline_codes', flightv1_1.leg1.carrierSummary.airlineCodes)                    
                            .withColumn('airline_codes_leg2', flightv1_1.leg2.carrierSummary.airlineCodes)                    
                            .withColumn('departureTime', flightv1_1.leg1.departureTime.isoStr)
                            .withColumn('departureTime_leg2', flightv1_1.leg2.departureTime.isoStr)
                            .withColumn('arrivalTime', flightv1_1.leg1.arrivalTime.isoStr)
                            .withColumn('arrivalTime_leg2', flightv1_1.leg2.arrivalTime.isoStr)
        #                 .withColumn('check_bag_inc', flightv1_1.leg1.arrivalTime)
                            .withColumn('airlineName', flightv1_1.leg1.carrierSummary.airlineName)
                            .withColumn('airlineName_leg2', flightv1_1.leg2.carrierSummary.airlineName)
                            # durations: (arrival - departure) in seconds, divided by 60 to give minutes
                            .withColumn('duration_m', (F.unix_timestamp('arrivalTime', format=timeFmt) - 
                                                       F.unix_timestamp('departureTime', format=timeFmt))/60)                    
                        .withColumn('duration_m_leg2', (F.unix_timestamp('arrivalTime_leg2', format=timeFmt) - 
                                                       F.unix_timestamp('departureTime_leg2', format=timeFmt))/60)                    
        #                     .withColumn('duration', flightv1_1.timeline1.getItem(1).duration)
                        # flight_code = airline code + flight number of the first timeline1 entry;
                        # the two helper columns are dropped again once concatenated
                        .withColumn('airlineCode', flightv1_1.timeline1.getItem(0).carrier.airlineCode)
                        .withColumn('flightNumber', flightv1_1.timeline1.getItem(0).carrier.flightNumber.cast('string'))                
                        .select('*', F.concat(col('airlineCode'), col('flightNumber')).alias('flight_code'))
                        .drop('airlineCode', 'flightNumber')
                        .withColumn('plane', flightv1_1.timeline1.getItem(0).carrier.plane)                
                        .withColumn('stops', flightv1_1.leg1.stops.cast('byte'))                                
                        .withColumn('stops_leg2', flightv1_1.leg2.stops.cast('byte'))                

        #                 .withColumn('stop_list', flightv1_1.leg1.stop_list)# need to do more work                
                        .withColumn('stop_airport', take_all_level1_str(flightv1_1.leg1.stop_list, lit('airport')))                                               
                        .withColumn('stop_duration', take_all_level1_str(flightv1_1.leg1.stop_list, lit('duration')))                                               

        #                 .withColumn('stop_list_leg2', flightv1_1.leg2.stop_list)               
                        .withColumn('stop_airport_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('airport')))                                               
                        .withColumn('stop_duration_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('duration')))                                               


                        # correct_tickets_left maps a count of 0 to 99 before the cast to byte
                        .withColumn('noOfTicketsLeft', correct_tickets_left_UDF(flightv1_1.leg1.carrierSummary.noOfTicketsLeft))
                        .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte'))                
                        .withColumn('noOfTicketsLeft_leg2', correct_tickets_left_UDF(flightv1_1.leg2.carrierSummary.noOfTicketsLeft))
                        .withColumn('noOfTicketsLeft_leg2', col('noOfTicketsLeft_leg2').cast('byte'))
                        .withColumn('fromCityAirportCode', flightv1_1.leg1.departureLocation.airportCode)                
                        .withColumn('toCityAirportCode', flightv1_1.leg1.arrivalLocation.airportCode)
                        .withColumn('fromCityAirportCode_leg2', flightv1_1.leg2.departureLocation.airportCode)
                        .withColumn('toCityAirportCode_leg2', flightv1_1.leg2.arrivalLocation.airportCode)

                        # carrier leg 1
                        .withColumn('carrierAirProviderId', flightv1_1.leg1.carrierSummary.airProviderId)
                        .withColumn('carrierAirlineImageFileName', flightv1_1.leg1.carrierSummary.airlineImageFileName)
                        .withColumn('carrierMixedCabinClass', flightv1_1.leg1.carrierSummary.mixedCabinClass)
                        .withColumn('carrierMultiStop', flightv1_1.leg1.carrierSummary.multiStop)
                        .withColumn('carrierNextDayArrival', flightv1_1.leg1.carrierSummary.nextDayArrival)

                        # carrier leg 2
                        .withColumn('carrierAirProviderId_leg2', flightv1_1.leg2.carrierSummary.airProviderId)
                        .withColumn('carrierAirlineImageFileName_leg2', flightv1_1.leg2.carrierSummary.airlineImageFileName)
                        .withColumn('carrierMixedCabinClass_leg2', flightv1_1.leg2.carrierSummary.mixedCabinClass)
                        .withColumn('carrierMultiStop_leg2', flightv1_1.leg2.carrierSummary.multiStop)
                        .withColumn('carrierNextDayArrival_leg2', flightv1_1.leg2.carrierSummary.nextDayArrival)

                        ### Leg 1
                        # timeline_* columns are arrays of strings: one value per entry of timeline1
                        ## Leg 1 departure
        #                 .withColumn('timeline_departureAirport', take_all_airport(flightv1_1.timeline1, lit('departureAirport')))                               
                        .withColumn('timeline_departureAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('airportCityState')))
                        .withColumn('timeline_departureAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('city')))
                        .withColumn('timeline_departureAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('code')))
                        .withColumn('timeline_departureAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('localName')))
                        .withColumn('timeline_departureAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('longName')))
                        .withColumn('timeline_departureAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('name')))

                        .withColumn('timeline_departureTime', take_all_level2_str(flightv1_1.timeline1, lit('departureTime'), lit('isoStr')))



                        ## Leg 1 arrival
                        .withColumn('timeline_arrivalAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('airportCityState')))
                        .withColumn('timeline_arrivalAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('city')))
                        .withColumn('timeline_arrivalAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('code')))
                        .withColumn('timeline_arrivalAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('localName')))
                        .withColumn('timeline_arrivalAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('longName')))
                        .withColumn('timeline_arrivalAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('name')))                

                        .withColumn('timeline_arrivalTime', take_all_level2_str(flightv1_1.timeline1, lit('arrivalTime'), lit('isoStr')))

                        # distance
                        .withColumn('timeline_distance', take_all_level2_str(flightv1_1.timeline1, lit('distance'), lit('formattedTotal')))

                        # carrier
                        .withColumn('timeline_plane', take_all_level2_str(flightv1_1.timeline1, lit('carrier'), lit('plane')))

                        # brandedFareName
                        .withColumn('timeline_brandedFareName', take_all_level1_str(flightv1_1.timeline1, lit('brandedFareName')))                               

                        # type
                        .withColumn('timeline_type', take_all_level1_str(flightv1_1.timeline1, lit('type')))                               

                        ### Leg 2
                        # same extraction as leg 1, read from timeline2 and suffixed with _leg2
                        ## Leg 2 departure
                        .withColumn('timeline_departureAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('airportCityState')))
                        .withColumn('timeline_departureAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('city')))
                        .withColumn('timeline_departureAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('code')))
                        .withColumn('timeline_departureAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('localName')))
                        .withColumn('timeline_departureAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('longName')))
                        .withColumn('timeline_departureAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('name')))

                        .withColumn('timeline_departureTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureTime'), lit('isoStr')))                


                        ## Leg 2 arrival
                        .withColumn('timeline_arrivalAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('airportCityState')))
                        .withColumn('timeline_arrivalAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('city')))
                        .withColumn('timeline_arrivalAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('code')))
                        .withColumn('timeline_arrivalAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('localName')))
                        .withColumn('timeline_arrivalAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('longName')))
                        .withColumn('timeline_arrivalAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('name')))                

                        .withColumn('timeline_arrivalTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalTime'), lit('isoStr')))

                        # distance
                        .withColumn('timeline_distance_leg2', take_all_level2_str(flightv1_1.timeline2, lit('distance'), lit('formattedTotal')))

                        # carrier
                        .withColumn('timeline_plane_leg2', take_all_level2_str(flightv1_1.timeline2, lit('carrier'), lit('plane')))

                        # brandedFareName
                        .withColumn('timeline_brandedFareName_leg2', take_all_level1_str(flightv1_1.timeline2, lit('brandedFareName')))                           

                        # type
                        .withColumn('timeline_type_leg2', take_all_level1_str(flightv1_1.timeline2, lit('type')))                               

                        # create variables dropped from v1.0; these are constant fillers, not values read from the input
                        .withColumn('span_days', lit(99))
                        .withColumn('power', lit(False))
                        .withColumn('video', lit(False))
                        .withColumn('wifi', lit(False))
                        .withColumn('stop_info', col('stop_airport')) # placeholder: copies stop_airport because no struct literal could be built here


                        # final column list and order of the parquet output
                        .select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode', 
                                'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
                                'stayDays', 
                               'departureTime', 'arrivalTime', 'departureTime_leg2', 'arrivalTime_leg2',
                                'airlineName', 'airlineName_leg2', 'duration_m', 'duration_m_leg2',                
                                'flight_code', 'plane', 'stops', 'stops_leg2', 'stop_airport', 'stop_duration', 'stop_airport_leg2', 'stop_duration_leg2',
                                'noOfTicketsLeft', 'noOfTicketsLeft_leg2',
                               'airline_code', 'airline_codes', 'airline_codes_leg2', 
                                'fromCityAirportCode', 'toCityAirportCode', 'fromCityAirportCode_leg2', 'toCityAirportCode_leg2',
                               'carrierAirProviderId', 'carrierAirlineImageFileName', 'carrierMixedCabinClass', 'carrierMultiStop', 'carrierNextDayArrival',
                                'carrierAirProviderId_leg2', 'carrierAirlineImageFileName_leg2', 'carrierMixedCabinClass_leg2', 'carrierMultiStop_leg2', 'carrierNextDayArrival_leg2',
                                #'url',

                                ## leg 1
                                # departure
                                'timeline_departureAirport_cityState', 'timeline_departureAirport_city', 'timeline_departureAirport_code', 'timeline_departureAirport_localName', 
                                'timeline_departureAirport_longName', 'timeline_departureAirport_name',

                                'timeline_departureTime',

                                # arrival
                                'timeline_arrivalAirport_cityState', 'timeline_arrivalAirport_city', 'timeline_arrivalAirport_code', 'timeline_arrivalAirport_localName', 
                                'timeline_arrivalAirport_longName', 'timeline_arrivalAirport_name',

                                'timeline_arrivalTime',

                                'timeline_distance',
                                'timeline_plane',
                                'timeline_brandedFareName',
                                'timeline_type',

                                ## leg 2                        
                                # departure
                                'timeline_departureAirport_cityState_leg2', 'timeline_departureAirport_city_leg2', 'timeline_departureAirport_code_leg2', 'timeline_departureAirport_localName_leg2', 
                                'timeline_departureAirport_longName_leg2', 'timeline_departureAirport_name_leg2',

                                'timeline_departureTime_leg2',

                                # arrival
                                'timeline_arrivalAirport_cityState_leg2', 'timeline_arrivalAirport_city_leg2', 'timeline_arrivalAirport_code_leg2', 'timeline_arrivalAirport_localName_leg2', 
                                'timeline_arrivalAirport_longName_leg2', 'timeline_arrivalAirport_name_leg2',

                                'timeline_arrivalTime_leg2',

                                'timeline_distance_leg2',
                                'timeline_plane_leg2',
                                'timeline_brandedFareName_leg2',
                                'timeline_type_leg2',

                                # variables dropped from v1.0
                                'span_days', 'power', 'video', 'wifi', 'stop_info'
                               )                
                       )


    # repartition(1) collapses the data to a single partition before each write
    if append:
        flightv1_1_2.repartition(1).write.mode('append').parquet(os.path.join(pqFolder, pqFileName))        
    else:
        flightv1_1_2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))   

# Set up folders

In [7]:
# Folder locations used by the cells below (all absolute, all ending in '/')
zip_folder = '/home/ubuntu/s3/zip/'
txt_folder = '/home/ubuntu/s3/txt/'
txt_exception_folder = '/home/ubuntu/s3/comb/txt_exception/'
pq_folder = '/home/ubuntu/s3/pq_v1_1/'
txt_new_exception_folder = '/home/ubuntu/s3/txt_exception/'

# Create the folders only if they don't exist yet
# (mkdir -p also creates missing parent folders and does not complain about existing ones)
! mkdir -p $zip_folder
! mkdir -p $txt_folder
! mkdir -p $txt_exception_folder
! mkdir -p $pq_folder
! mkdir -p $txt_new_exception_folder

# Tidy up the working folders: remove everything inside them, keep the folders themselves
! rm -rf /home/ubuntu/s3/comb/zip/*
! rm -rf /home/ubuntu/s3/comb/txt/*
! rm -rf /home/ubuntu/s3/zip/*
! rm -rf /home/ubuntu/s3/txt/*
# -f - stands for "force": never prompt for confirmation and ignore paths that do not exist.
# -r - stands for "recursive": descend into every sub-folder and remove everything in it.


In [6]:
# download another v1.1 file, unzip and test
! cd $zip_folder

# zip_file_path = "flight_10_1"
# zip_file_name = "flight_10_1_price_2017-05-15.zip"
# zip_file_path = "flight_9_40"
# zip_file_name = "flight_9_40_price_2017-06-19.zip"
zip_file_path = "flight_8_38"
zip_file_name = "flight_8_38_price_2017-06-12.zip"

! aws s3 cp s3://flight.price.11/$zip_file_path/$zip_file_name $zip_folder/$zip_file_name
! sudo apt-get install unzip
! unzip $zip_folder/$zip_file_name -d $txt_folder

download: s3://flight.price.11/flight_8_38/flight_8_38_price_2017-06-12.zip to ../../s3/zip/flight_8_38_price_2017-06-12.zip
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following packages were automatically installed and are no longer required:
  linux-aws-headers-4.4.0-1013 linux-aws-headers-4.4.0-1016
  linux-aws-headers-4.4.0-1017 linux-headers-4.4.0-1013-aws
  linux-headers-4.4.0-1016-aws linux-headers-4.4.0-1017-aws
  linux-image-4.4.0-1013-aws linux-image-4.4.0-1016-aws
  linux-image-4.4.0-1017-aws
Use 'sudo apt autoremove' to remove them.
Suggested packages:
  zip
The following NEW packages will be installed:
  unzip
0 upgraded, 1 newly installed, 0 to remove and 48 not upgraded.
Need to get 158 kB of archives.
After this operation, 530 kB of additional disk space will be used.
Get:1 http://ap-southeast-2.ec2.archive.ubuntu.com/ubuntu xenial/main amd64 unzip amd64 6.0-20ubuntu1 [158 kB]
Fetched 158 kB in 0s (12.3 MB/s)
Sele

# Append v1.1a

In [25]:
# Convert the extracted text files and append them to the shared parquet dataset
txtToPq_v2(
    inputFolder='/home/ubuntu/s3/txt/final_results/',
    pqFolder=pq_folder,
    pqFileName="flight.pq",
    searchString="*.txt",
    append=True,
)

# Peek at the first two rows of the dataset after the append
spark.read.parquet(os.path.join(pq_folder, "flight.pq")).limit(2).toPandas()

Unnamed: 0,price,version,searchDate,tableName,task_id,currencyCode,fromCity,toCity,trip,depDate,retDate,stayDays,departureTime,arrivalTime,departureTime_leg2,arrivalTime_leg2,airlineName,airlineName_leg2,duration_m,duration_m_leg2,flight_code,plane,stops,stops_leg2,stop_airport,stop_duration,stop_airport_leg2,stop_duration_leg2,noOfTicketsLeft,noOfTicketsLeft_leg2,airline_code,airline_codes,airline_codes_leg2,fromCityAirportCode,toCityAirportCode,fromCityAirportCode_leg2,toCityAirportCode_leg2,carrierAirProviderId,carrierAirlineImageFileName,carrierMixedCabinClass,carrierMultiStop,carrierNextDayArrival,carrierAirProviderId_leg2,carrierAirlineImageFileName_leg2,carrierMixedCabinClass_leg2,carrierMultiStop_leg2,carrierNextDayArrival_leg2,timeline_departureAirport_cityState,timeline_departureAirport_city,timeline_departureAirport_code,timeline_departureAirport_localName,timeline_departureAirport_longName,timeline_departureAirport_name,timeline_departureTime,timeline_arrivalAirport_cityState,timeline_arrivalAirport_city,timeline_arrivalAirport_code,timeline_arrivalAirport_localName,timeline_arrivalAirport_longName,timeline_arrivalAirport_name,timeline_arrivalTime,timeline_distance,timeline_plane,timeline_brandedFareName,timeline_type,timeline_departureAirport_cityState_leg2,timeline_departureAirport_city_leg2,timeline_departureAirport_code_leg2,timeline_departureAirport_localName_leg2,timeline_departureAirport_longName_leg2,timeline_departureAirport_name_leg2,timeline_departureTime_leg2,timeline_arrivalAirport_cityState_leg2,timeline_arrivalAirport_city_leg2,timeline_arrivalAirport_code_leg2,timeline_arrivalAirport_localName_leg2,timeline_arrivalAirport_longName_leg2,timeline_arrivalAirport_name_leg2,timeline_arrivalTime_leg2,timeline_distance_leg2,timeline_plane_leg2,timeline_brandedFareName_leg2,timeline_type_leg2,span_days,power,video,wifi,stop_info
0,0.0,1.0,2017-05-08,flight_1_5_price,620,,sydney,beijing,2,2017-09-09,2017-10-07,28,2017-09-09T11:15:00.000+10:00,2017-09-10T04:10:00.000+08:00,,,Qantas Airways,,1135.0,,QF145,BOEING 737-800 (WINGLETS) PASSENGER,1,,,,,,99,,QF,"[QF, CA]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,[Auckland(AKL):2h35m]
1,472.14,1.0,2017-05-08,flight_1_5_price,620,AUD,sydney,beijing,2,2017-09-09,2017-10-07,28,2017-09-09T20:50:00.000+10:00,2017-09-10T17:25:00.000+08:00,,,China Eastern Airlines,,1355.0,,MU778,AIRBUS INDUSTRIE A330-200,1,,,,,,4,,MU,"[MU, MU]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,[Kunming(KMG):8h30m]


In [26]:
# spark.read.parquet(pq_folder + "flight_v1_1a").where(col('fromCity')=="Chengdu").limit(2).toPandas()
# spark.read.parquet(pq_folder + "flight_v1_1a").where(col('toCity')=="beijing").limit(2).toPandas()

In [27]:
# Row count of the parquet dataset after the append
spark.read.parquet(os.path.join(pq_folder, "flight.pq")).count()

2649297

# Append v1.1b

In [21]:
# Folder holding the exception text files that are appended in this step
txt_exception_folder = '/home/ubuntu/s3/comb/txt_exception/'

txtToPq(
    inputFolder=txt_exception_folder,
    pqFolder=pq_folder,
    pqFileName="flight.pq",
    searchString="*.txt",
    append=True,
)

# Peek at the first two rows of the dataset after the append
spark.read.parquet(os.path.join(pq_folder, "flight.pq")).limit(2).toPandas()

Unnamed: 0,price,version,searchDate,tableName,task_id,currencyCode,fromCity,toCity,trip,depDate,retDate,stayDays,departureTime,arrivalTime,departureTime_leg2,arrivalTime_leg2,airlineName,airlineName_leg2,duration_m,duration_m_leg2,flight_code,plane,stops,stops_leg2,stop_airport,stop_duration,stop_airport_leg2,stop_duration_leg2,noOfTicketsLeft,noOfTicketsLeft_leg2,airline_code,airline_codes,airline_codes_leg2,fromCityAirportCode,toCityAirportCode,fromCityAirportCode_leg2,toCityAirportCode_leg2,carrierAirProviderId,carrierAirlineImageFileName,carrierMixedCabinClass,carrierMultiStop,carrierNextDayArrival,carrierAirProviderId_leg2,carrierAirlineImageFileName_leg2,carrierMixedCabinClass_leg2,carrierMultiStop_leg2,carrierNextDayArrival_leg2,timeline_departureAirport_cityState,timeline_departureAirport_city,timeline_departureAirport_code,timeline_departureAirport_localName,timeline_departureAirport_longName,timeline_departureAirport_name,timeline_departureTime,timeline_arrivalAirport_cityState,timeline_arrivalAirport_city,timeline_arrivalAirport_code,timeline_arrivalAirport_localName,timeline_arrivalAirport_longName,timeline_arrivalAirport_name,timeline_arrivalTime,timeline_distance,timeline_plane,timeline_brandedFareName,timeline_type,timeline_departureAirport_cityState_leg2,timeline_departureAirport_city_leg2,timeline_departureAirport_code_leg2,timeline_departureAirport_localName_leg2,timeline_departureAirport_longName_leg2,timeline_departureAirport_name_leg2,timeline_departureTime_leg2,timeline_arrivalAirport_cityState_leg2,timeline_arrivalAirport_city_leg2,timeline_arrivalAirport_code_leg2,timeline_arrivalAirport_localName_leg2,timeline_arrivalAirport_longName_leg2,timeline_arrivalAirport_name_leg2,timeline_arrivalTime_leg2,timeline_distance_leg2,timeline_plane_leg2,timeline_brandedFareName_leg2,timeline_type_leg2,span_days,power,video,wifi,stop_info
0,0.0,1.0,2017-05-08,flight_1_5_price,620,,sydney,beijing,2,2017-09-09,2017-10-07,28,2017-09-09T11:15:00.000+10:00,2017-09-10T04:10:00.000+08:00,,,Qantas Airways,,1135.0,,QF145,BOEING 737-800 (WINGLETS) PASSENGER,1,,,,,,99,,QF,"[QF, CA]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,[Auckland(AKL):2h35m]
1,472.14,1.0,2017-05-08,flight_1_5_price,620,AUD,sydney,beijing,2,2017-09-09,2017-10-07,28,2017-09-09T20:50:00.000+10:00,2017-09-10T17:25:00.000+08:00,,,China Eastern Airlines,,1355.0,,MU778,AIRBUS INDUSTRIE A330-200,1,,,,,,4,,MU,"[MU, MU]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,,,,[Kunming(KMG):8h30m]


In [28]:
# Total row count of the parquet dataset
display(spark.read.parquet(pq_folder + "flight.pq").count())

# Row counts per route, trip type and data version.
# .show() prints the table itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
(spark.read.parquet(pq_folder + "flight.pq")
        .groupBy("fromCity", "toCity", "trip", "version")
        .count()
        .orderBy("fromCity", "toCity", "trip", "version")
        .show())

2649297

+--------+--------+----+-------+-------+
|fromCity|  toCity|trip|version|  count|
+--------+--------+----+-------+-------+
| Bangkok|Hangzhou|   1|    1.1|  29175|
| Bangkok|Hangzhou|   2|    1.1| 119236|
|hongkong|Auckland|   1|    1.1|  28824|
|hongkong|Auckland|   2|    1.1| 122266|
|shenzhen|Florence|   1|    1.1|  11104|
|shenzhen|Florence|   2|    1.1|  50589|
|  sydney| beijing|   1|    1.0| 356608|
|  sydney| beijing|   2|    1.0|1931495|
+--------+--------+----+-------+-------+



None

In [29]:
# Column names and types as stored in the parquet dataset
spark.read.parquet(os.path.join(pq_folder, "flight.pq")).printSchema()

root
 |-- price: double (nullable = true)
 |-- version: string (nullable = true)
 |-- searchDate: date (nullable = true)
 |-- tableName: string (nullable = true)
 |-- task_id: long (nullable = true)
 |-- currencyCode: string (nullable = true)
 |-- fromCity: string (nullable = true)
 |-- toCity: string (nullable = true)
 |-- trip: string (nullable = true)
 |-- depDate: date (nullable = true)
 |-- retDate: date (nullable = true)
 |-- stayDays: integer (nullable = true)
 |-- departureTime: string (nullable = true)
 |-- arrivalTime: string (nullable = true)
 |-- departureTime_leg2: string (nullable = true)
 |-- arrivalTime_leg2: string (nullable = true)
 |-- airlineName: string (nullable = true)
 |-- airlineName_leg2: string (nullable = true)
 |-- duration_m: double (nullable = true)
 |-- duration_m_leg2: double (nullable = true)
 |-- flight_code: string (nullable = true)
 |-- plane: string (nullable = true)
 |-- stops: byte (nullable = true)
 |-- stops_leg2: byte (nullable = true)
 |-- st

#### Stopped here on 2017-06-23. Next steps:
- reformat v1.0
- repartition v1.0 into 100 MB blocks
- append v1.1a and v1.1b onto v1.0, checking the parquet file after each append

In [8]:
from os.path import join
from os import listdir, rmdir
from shutil import move


def unzip_files(dir_in, dir_out, extension):
    """
    Extract every archive found under a folder into one output folder.

    Walks `dir_in` recursively and, for each file whose name ends with
    `extension`, extracts all members of that archive into `dir_out`.
    The current working directory is left unchanged.

    @params:
        dir_in      - Required  : folder searched recursively for archives (Str)
        dir_out     - Required  : folder the archive contents are extracted into (Str)
        extension   - Required  : file name suffix that marks an archive, e.g. '.zip' (Str)
    """
    # Deliberately no os.chdir(dir_in): os.walk already yields paths rooted at
    # dir_in, while changing the process-wide working directory would break
    # relative dir_in / dir_out values and affect every later cell.
    for subdir, _dirs, files in os.walk(dir_in):
        for item in files:
            if item.endswith(extension): # check for ".zip" extension
                file_name = os.path.join(subdir, item)
                # the context manager closes the archive even if extraction fails
                with zipfile.ZipFile(file_name) as zip_ref:
                    zip_ref.extractall(dir_out)

                
def clear_folder(folder):
    """
    Delete every regular file directly inside `folder`.

    Sub-folders and their contents are left untouched. If `folder` does not
    exist it is created, so the caller always ends up with an existing folder.
    Deletion is best effort: a failure on one file is printed and the
    remaining files are still processed.

    @params:
        folder      - Required  : folder to empty (Str)
    """
    # Handle the missing folder first: os.listdir would raise on it, and a
    # freshly created folder has nothing to delete.
    if not os.path.exists(folder):
        os.makedirs(folder)
        return

    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
            #elif os.path.isdir(file_path): shutil.rmtree(file_path)
        except Exception as e:
            print(e)

            
# Print iterations progress
# https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    """
    Print one line of a text progress bar; meant to be called from inside a loop.
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : text shown before the bar (Str)
        suffix      - Optional  : text shown after the percentage (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : width of the bar in characters (Int)
        fill        - Optional  : character used for the completed part of the bar (Str)
    """
    # Percentage text, formatted with the requested number of decimals
    pct_template = "{0:." + str(decimals) + "f}"
    pct_text = pct_template.format(100 * (iteration / float(total)))

    # Completed part is drawn with `fill`, the remainder with dashes
    n_filled = int(length * iteration // total)
    n_empty = length - n_filled
    bar_text = ''.join([fill * n_filled, '-' * n_empty])

    print('\r%s |%s| %s%% %s' % (prefix, bar_text, pct_text, suffix))

    # Only the final iteration is followed by an extra blank line
    if iteration != total:
        return
    print()

In [9]:
import boto3 
# Resource-style handle on the bucket that holds the zipped price files
s3 = boto3.resource('s3')
bucket = s3.Bucket('flight.price.11')

# Low-level client, used further down for download_file
s3_client = boto3.client("s3")
# all_objects = s3_client.list_objects(Bucket = 'flight.price.11')

In [10]:
# Get the number of items in the S3 bucket: list every key recursively and count the output lines
! aws s3 ls s3://flight.price.11/ --recursive | wc -l    
    

12976


### Get list of all zip files

In [9]:
from time import sleep
from math import floor

# make a list of every object key in the bucket
i = 0
l = 12661 # from the above command; only used to scale the progress bar

s3_files = list()

# Redraw the bar about 200 times over the whole listing.
# max(1, ...) keeps the modulus below non-zero when l is smaller than 200.
progress_step = max(1, floor(l / 200))

# Initial call to print 0% progress
printProgressBar(i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 1, length = 50)

for item in bucket.objects.all():    
    # define s3 file name
    s3_file = item.key    
    s3_files.append(s3_file)
    
    # Update Progress Bar
    # (no sleep here: the listing itself is the work being tracked, a pause per
    #  object would only add waiting time)
    i += 1
    if i % progress_step == 0:
        printProgressBar(i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 1, length = 50)    

NameError: name 'printProgressBar' is not defined

In [59]:
import csv

# newline='' leaves line-ending handling to the csv module, as its documentation asks for
with open('/home/ubuntu/work/flight/zip_files.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    # all S3 keys are written as the fields of one single row
    wr.writerow(s3_files)
    

In [12]:
import csv
# newline='' leaves line-ending handling to the csv module, as its documentation asks for
with open('/home/ubuntu/work/flight/zip_files.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    # list of rows; each row is a list of the fields read from one line
    zip_list = list(reader)
# spot check: one entry of the first row
zip_list[0][35]

'flight_10_14/flight_10_14_price_2017-05-15.zip'

In [13]:
# Empty the working folders before processing starts
for working_folder in (txt_new_exception_folder, zip_folder, txt_folder):
    clear_folder(working_folder)

In [32]:
# number of entries in the first row read back from the CSV
len(zip_list[0])

11178

In [27]:
# zip_list[0][23:]

In [24]:
# chunk_size = 20
# remaining_list = zip_list[0][23:]
# print([remaining_list[i:i+chunk_size] for i in range(0, len(remaining_list), chunk_size)])
# print [mylist[i:i+4] for i in range(0, len(mylist), 4)]

[['flight_10_1/flight_10_1_price_2017-06-07.zip', 'flight_10_1/flight_10_1_price_2017-06-08.zip', 'flight_10_1/flight_10_1_price_2017-06-09.zip', 'flight_10_1/flight_10_1_price_2017-06-10.zip', 'flight_10_1/flight_10_1_price_2017-06-11.zip', 'flight_10_1/flight_10_1_price_2017-06-12.zip', 'flight_10_1/flight_10_1_price_2017-06-13.zip', 'flight_10_1/flight_10_1_price_2017-06-14.zip', 'flight_10_1/flight_10_1_price_2017-06-15.zip', 'flight_10_1/flight_10_1_price_2017-06-16.zip', 'flight_10_1/flight_10_1_price_2017-06-17.zip', 'flight_10_1/flight_10_1_price_2017-06-18.zip', 'flight_10_14/flight_10_14_price_2017-05-15.zip', 'flight_10_14/flight_10_14_price_2017-05-16.zip', 'flight_10_14/flight_10_14_price_2017-05-17.zip', 'flight_10_14/flight_10_14_price_2017-05-18.zip', 'flight_10_14/flight_10_14_price_2017-05-19.zip', 'flight_10_14/flight_10_14_price_2017-05-20.zip', 'flight_10_14/flight_10_14_price_2017-05-21.zip', 'flight_10_14/flight_10_14_price_2017-05-22.zip'], ['flight_10_14/flight

In [14]:
import os
def get_size(start_path = '.'):
    """Return the combined size of all files below ``start_path`` in GiB.

    The directory tree is traversed with ``os.walk`` and the size of every
    listed file is summed with ``os.path.getsize``.

    Args:
        start_path: Root directory to measure. Defaults to the current
            working directory.

    Returns:
        float: Total size in gibibytes (bytes / 1024**3). A directory with no
        readable files yields 0.0.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            try:
                total_size += os.path.getsize(fp)
            except OSError:
                # Dangling symlink, or the file disappeared between the
                # directory listing and the stat call: skip it rather than
                # abort the whole measurement.
                continue
    return total_size/(1024*1024*1024)

print(get_size())

0.005279757082462311


In [15]:
get_size(pq_folder)

0.10793561208993196

In [None]:
# Download each remaining zip from S3, extract it to txt files, and convert
# those to parquet. Working folders are cleared before every file.
start = 35  # index in zip_list[0] of the first file still to be processed
chunk_size = 10 #tried 20. 30 GB hard drive is too small for 20 zip files
remaining_list = zip_list[0][start:]
# print([remaining_list[i:i+chunk_size] for i in range(0, len(remaining_list), chunk_size)])

l = len(zip_list[0])
# Count from `start` so the bar reflects overall progress through zip_list[0],
# consistent with the "started X out of l" messages printed below.
i = start
# Redraw the bar about every 0.1% of the files. The step must never be 0,
# otherwise the modulo below raises ZeroDivisionError for lists < 1000 entries.
progress_step = max(1, floor(l / 1000))

# Initial call to print the starting progress
printProgressBar(i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 1, length = 50)


for j in range(0, len(remaining_list), chunk_size):
    print("started " + str(start+j) + " out of " + str(l))

    for item in remaining_list[j:j+chunk_size]:
#     for item in remaining_list[j:j+1]:

        # clear working folder
        clear_folder(zip_folder)
        clear_folder(txt_folder)

        # define s3 file name ('/' is replaced so the key is usable as a flat file name)
        s3_file = item
        s3_file_new_name = s3_file.replace('/', '__')

        # download zip
        s3_client.download_file('flight.price.11', s3_file, zip_folder + s3_file_new_name)

        # extract to txt
        unzip_files(zip_folder, txt_folder, '.zip')

        # if necessary move subfolder contents to parent folder
        # (best effort: any error, e.g. no 'final_results' subfolder, is only printed)
        try:
            for filename in listdir(join(txt_folder, 'final_results')):
                move(join(txt_folder, 'final_results', filename), join(txt_folder, filename))
            rmdir(join(txt_folder, 'final_results'))
        except Exception as e:
            print(e)

        print("finished extracting " + s3_file_new_name)
        print(get_size(txt_folder))

        # convert to parquet; each zip is written under its own .pq name (append = False)
        print("started transforming " + s3_file_new_name)
        try:
            txtToPq_v2(inputFolder = txt_folder, pqFolder = pq_folder,
                            pqFileName = s3_file_new_name.replace(".zip", ".pq"), searchString = "*.txt", append = False)
        except Exception as e:
            # Catch Exception rather than using a bare except so that a kernel
            # interrupt (KeyboardInterrupt) still stops the run. Report the
            # failure, then park the offending txt files for later inspection.
            print("failed transforming " + s3_file_new_name + ": " + repr(e))
            for filename in listdir(txt_folder):
                move(join(txt_folder, filename), join(txt_new_exception_folder, filename))
        else:
            print("finished transforming " + s3_file_new_name)

        # Update Progress Bar
        i += 1
        if i % progress_step == 0:
            printProgressBar(i, l, prefix = 'Progress:', suffix = 'Complete', decimals = 1, length = 50)


Progress: |--------------------------------------------------| 0.0% Complete
started 35 out of 11178
finished extracting flight_10_14__flight_10_14_price_2017-05-15.zip
0.38285352755337954
started transforming flight_10_14__flight_10_14_price_2017-05-15.zip


In [13]:
# Load every *.pq output under pq_folder and sanity-check it: total row count,
# then row counts per (fromCity, toCity, trip, version, stayDays), then a sample.
temp = spark.read.parquet(os.path.join(pq_folder, '*.pq'))
display(temp.count())
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
(temp
    .groupBy("fromCity", "toCity", "trip", "version", "stayDays")
    .count()
    .orderBy("fromCity", "toCity", "trip", "version", "stayDays")
    .show())
temp.limit(100).toPandas()

In [20]:
# Create the folder for the consolidated parquet output. The isdir guard keeps
# this cell re-runnable (a plain `mkdir` errors when the folder already exists).
# os.mkdir is used on purpose: like `mkdir` without -p it does not create
# missing parent directories.
consolidated_folder = "/home/ubuntu/s3/pq_v1_1_consolidated/"
if not os.path.isdir(consolidated_folder):
    os.mkdir(consolidated_folder)

In [21]:
# Collapse `temp` to a single partition (repartition(1)) and write it out as batch_1.pq.
# NOTE(review): no save mode is set on the writer; with Spark's documented default
# the write raises if batch_1.pq already exists — confirm before re-running this cell.
temp.repartition(1).write.parquet(os.path.join("/home/ubuntu/s3/pq_v1_1_consolidated/", "batch_1.pq"))

In [23]:
# Rebind `temp` to the consolidated output: every *.pq entry in the
# pq_v1_1_consolidated folder is read into one DataFrame.
temp = spark.read.parquet("/home/ubuntu/s3/pq_v1_1_consolidated/*.pq")
# limit(3).toPandas()

In [24]:
# Sanity-check `temp`: total row count, then row counts per
# (fromCity, toCity, trip, version, stayDays), then a 100-row pandas sample.
display(temp.count())
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
(temp
    .groupBy("fromCity", "toCity", "trip", "version", "stayDays")
    .count()
    .orderBy("fromCity", "toCity", "trip", "version", "stayDays")
    .show())
temp.limit(100).toPandas()

2535820

+--------+------+----+-------+--------+------+
|fromCity|toCity|trip|version|stayDays| count|
+--------+------+----+-------+--------+------+
| Chengdu|sydney|   1|    1.1|    null|459073|
| Chengdu|sydney|   2|    1.1|       7|519472|
| Chengdu|sydney|   2|    1.1|      14|519535|
| Chengdu|sydney|   2|    1.1|      21|519152|
| Chengdu|sydney|   2|    1.1|      28|518588|
+--------+------+----+-------+--------+------+



None

Unnamed: 0,price,version,searchDate,tableName,task_id,currencyCode,fromCity,toCity,trip,depDate,retDate,stayDays,departureTime,arrivalTime,departureTime_leg2,arrivalTime_leg2,airlineName,airlineName_leg2,duration_m,duration_m_leg2,flight_code,plane,stops,stops_leg2,stop_airport,stop_duration,stop_airport_leg2,stop_duration_leg2,noOfTicketsLeft,noOfTicketsLeft_leg2,airline_code,airline_codes,airline_codes_leg2,fromCityAirportCode,toCityAirportCode,fromCityAirportCode_leg2,toCityAirportCode_leg2,carrierAirProviderId,carrierAirlineImageFileName,carrierMixedCabinClass,carrierMultiStop,carrierNextDayArrival,carrierAirProviderId_leg2,carrierAirlineImageFileName_leg2,carrierMixedCabinClass_leg2,carrierMultiStop_leg2,carrierNextDayArrival_leg2,timeline_departureAirport_cityState,timeline_departureAirport_city,timeline_departureAirport_code,timeline_departureAirport_localName,timeline_departureAirport_longName,timeline_departureAirport_name,timeline_departureTime,timeline_arrivalAirport_cityState,timeline_arrivalAirport_city,timeline_arrivalAirport_code,timeline_arrivalAirport_localName,timeline_arrivalAirport_longName,timeline_arrivalAirport_name,timeline_arrivalTime,timeline_distance,timeline_plane,timeline_brandedFareName,timeline_type,timeline_departureAirport_cityState_leg2,timeline_departureAirport_city_leg2,timeline_departureAirport_code_leg2,timeline_departureAirport_localName_leg2,timeline_departureAirport_longName_leg2,timeline_departureAirport_name_leg2,timeline_departureTime_leg2,timeline_arrivalAirport_cityState_leg2,timeline_arrivalAirport_city_leg2,timeline_arrivalAirport_code_leg2,timeline_arrivalAirport_localName_leg2,timeline_arrivalAirport_longName_leg2,timeline_arrivalAirport_name_leg2,timeline_arrivalTime_leg2,timeline_distance_leg2,timeline_plane_leg2,timeline_brandedFareName_leg2,timeline_type_leg2,span_days,power,video,wifi,stop_info
0,604.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T11:40:00.000+08:00,2017-06-21T09:15:00.000+10:00,2017-06-27T11:40:00.000+10:00,2017-06-28T10:25:00.000+08:00,Xiamen Airlines,Xiamen Airlines,1295.0,1365.0,MF8436,Boeing 737-800,1,1,"[Fuzhou, China (FOC-Changle Intl.)]",[7h:20m],"[Xiamen, China (XMN-Xiamen Intl.)]",[12h:15m],7,7,MF,"[MF, MF]","[MF, MF]",CTU,SYD,SYD,CTU,7,MF.gif,False,True,True,7,MF.gif,False,True,True,"[Chengdu, China, Fuzhou, China]","[Chengdu, Fuzhou]","[CTU, FOC]","[Shuangliu Intl., Changle Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Fuzhou,...","[Chengdu (CTU), Fuzhou (FOC)]","[2017-06-20T11:40:00.000+08:00, 2017-06-20T21:...","[Fuzhou, China, Sydney, NSW]","[Fuzhou, Sydney]","[FOC, SYD]","[Changle Intl., Kingsford Smith Intl.]","[Fuzhou, China (FOC-Changle Intl.), Sydney, NS...","[Fuzhou (FOC), Sydney (Kingsford Smith Intl.)]","[2017-06-20T14:25:00.000+08:00, 2017-06-21T09:...","[0, 0]","[Boeing 737-800, Boeing 787]","[, ]","[Segment, Segment]","[Sydney, NSW, Xiamen, China]","[Sydney, Xiamen]","[SYD, XMN]","[Kingsford Smith Intl., Xiamen Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Xiam...","[Sydney (Kingsford Smith Intl.), Xiamen (XMN)]","[2017-06-27T11:40:00.000+10:00, 2017-06-28T07:...","[Xiamen, China, Chengdu, China]","[Xiamen, Chengdu]","[XMN, CTU]","[Xiamen Intl., Shuangliu Intl.]","[Xiamen, China (XMN-Xiamen Intl.), Chengdu, Ch...","[Xiamen (XMN), Chengdu (CTU)]","[2017-06-27T19:20:00.000+08:00, 2017-06-28T10:...","[0, 0]","[Boeing 787, Boeing 737-800]","[, ]","[Segment, Segment]",99,False,False,False,"[Fuzhou, China (FOC-Changle Intl.)]"
1,754.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T22:20:00.000+08:00,2017-06-21T19:40:00.000+10:00,2017-06-27T10:45:00.000+10:00,2017-06-28T09:30:00.000+08:00,China Southern Airlines,China Southern Airlines,1280.0,1365.0,CZ3428,Boeing 737-800,1,1,"[Guangzhou, China (CAN-Baiyun Intl.)]",[7h:35m],"[Guangzhou, China (CAN-Baiyun Intl.)]",[12h:35m],8,8,CZ,"[CZ, CZ]","[CZ, CZ]",CTU,SYD,SYD,CTU,7,CZ.gif,False,True,True,7,CZ.gif,False,True,True,"[Chengdu, China, Guangzhou, China]","[Chengdu, Guangzhou]","[CTU, CAN]","[Shuangliu Intl., Baiyun Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Guangzh...","[Chengdu (CTU), Guangzhou (CAN)]","[2017-06-20T22:20:00.000+08:00, 2017-06-21T08:...","[Guangzhou, China, Sydney, NSW]","[Guangzhou, Sydney]","[CAN, SYD]","[Baiyun Intl., Kingsford Smith Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Sydney, ...","[Guangzhou (CAN), Sydney (Kingsford Smith Intl.)]","[2017-06-21T00:50:00.000+08:00, 2017-06-21T19:...","[0, 0]","[Boeing 737-800, Airbus A330]","[, ]","[Segment, Segment]","[Sydney, NSW, Guangzhou, China]","[Sydney, Guangzhou]","[SYD, CAN]","[Kingsford Smith Intl., Baiyun Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Guan...","[Sydney (Kingsford Smith Intl.), Guangzhou (CAN)]","[2017-06-27T10:45:00.000+10:00, 2017-06-28T07:...","[Guangzhou, China, Chengdu, China]","[Guangzhou, Chengdu]","[CAN, CTU]","[Baiyun Intl., Shuangliu Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Chengdu,...","[Guangzhou (CAN), Chengdu (CTU)]","[2017-06-27T18:30:00.000+08:00, 2017-06-28T09:...","[0, 0]","[Airbus A330, Boeing 737-800]","[, ]","[Segment, Segment]",99,False,False,False,"[Guangzhou, China (CAN-Baiyun Intl.)]"
2,754.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T21:25:00.000+08:00,2017-06-21T19:40:00.000+10:00,2017-06-27T10:45:00.000+10:00,2017-06-28T09:30:00.000+08:00,China Southern Airlines,China Southern Airlines,1335.0,1365.0,CZ3484,BOEING 777-300ER,1,1,"[Guangzhou, China (CAN-Baiyun Intl.)]",[8h:45m],"[Guangzhou, China (CAN-Baiyun Intl.)]",[12h:35m],8,8,CZ,"[CZ, CZ]","[CZ, CZ]",CTU,SYD,SYD,CTU,7,CZ.gif,False,True,True,7,CZ.gif,False,True,True,"[Chengdu, China, Guangzhou, China]","[Chengdu, Guangzhou]","[CTU, CAN]","[Shuangliu Intl., Baiyun Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Guangzh...","[Chengdu (CTU), Guangzhou (CAN)]","[2017-06-20T21:25:00.000+08:00, 2017-06-21T08:...","[Guangzhou, China, Sydney, NSW]","[Guangzhou, Sydney]","[CAN, SYD]","[Baiyun Intl., Kingsford Smith Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Sydney, ...","[Guangzhou (CAN), Sydney (Kingsford Smith Intl.)]","[2017-06-20T23:40:00.000+08:00, 2017-06-21T19:...","[0, 0]","[BOEING 777-300ER, Airbus A330]","[, ]","[Segment, Segment]","[Sydney, NSW, Guangzhou, China]","[Sydney, Guangzhou]","[SYD, CAN]","[Kingsford Smith Intl., Baiyun Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Guan...","[Sydney (Kingsford Smith Intl.), Guangzhou (CAN)]","[2017-06-27T10:45:00.000+10:00, 2017-06-28T07:...","[Guangzhou, China, Chengdu, China]","[Guangzhou, Chengdu]","[CAN, CTU]","[Baiyun Intl., Shuangliu Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Chengdu,...","[Guangzhou (CAN), Chengdu (CTU)]","[2017-06-27T18:30:00.000+08:00, 2017-06-28T09:...","[0, 0]","[Airbus A330, Boeing 737-800]","[, ]","[Segment, Segment]",99,False,False,False,"[Guangzhou, China (CAN-Baiyun Intl.)]"
3,754.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T19:45:00.000+08:00,2017-06-21T19:40:00.000+10:00,2017-06-27T10:45:00.000+10:00,2017-06-27T23:40:00.000+08:00,China Southern Airlines,China Southern Airlines,1435.0,775.0,CZ3418,AIRBUS INDUSTRIE A330-300,1,1,"[Guangzhou, China (CAN-Baiyun Intl.)]",[10h:20m],"[Guangzhou, China (CAN-Baiyun Intl.)]",[2h:45m],8,8,CZ,"[CZ, CZ]","[CZ, CZ]",CTU,SYD,SYD,CTU,7,CZ.gif,False,True,True,7,CZ.gif,False,True,False,"[Chengdu, China, Guangzhou, China]","[Chengdu, Guangzhou]","[CTU, CAN]","[Shuangliu Intl., Baiyun Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Guangzh...","[Chengdu (CTU), Guangzhou (CAN)]","[2017-06-20T19:45:00.000+08:00, 2017-06-21T08:...","[Guangzhou, China, Sydney, NSW]","[Guangzhou, Sydney]","[CAN, SYD]","[Baiyun Intl., Kingsford Smith Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Sydney, ...","[Guangzhou (CAN), Sydney (Kingsford Smith Intl.)]","[2017-06-20T22:05:00.000+08:00, 2017-06-21T19:...","[0, 0]","[AIRBUS INDUSTRIE A330-300, Airbus A330]","[, ]","[Segment, Segment]","[Sydney, NSW, Guangzhou, China]","[Sydney, Guangzhou]","[SYD, CAN]","[Kingsford Smith Intl., Baiyun Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Guan...","[Sydney (Kingsford Smith Intl.), Guangzhou (CAN)]","[2017-06-27T10:45:00.000+10:00, 2017-06-27T21:...","[Guangzhou, China, Chengdu, China]","[Guangzhou, Chengdu]","[CAN, CTU]","[Baiyun Intl., Shuangliu Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Chengdu,...","[Guangzhou (CAN), Chengdu (CTU)]","[2017-06-27T18:30:00.000+08:00, 2017-06-27T23:...","[0, 0]","[Airbus A330, Airbus A321]","[, ]","[Segment, Segment]",99,False,False,False,"[Guangzhou, China (CAN-Baiyun Intl.)]"
4,754.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T16:00:00.000+08:00,2017-06-21T19:40:00.000+10:00,2017-06-27T10:45:00.000+10:00,2017-06-27T23:40:00.000+08:00,China Southern Airlines,China Southern Airlines,1660.0,775.0,CZ3444,Boeing 787,1,1,"[Guangzhou, China (CAN-Baiyun Intl.)]",[14h:0m],"[Guangzhou, China (CAN-Baiyun Intl.)]",[2h:45m],8,8,CZ,"[CZ, CZ]","[CZ, CZ]",CTU,SYD,SYD,CTU,7,CZ.gif,False,True,True,7,CZ.gif,False,True,False,"[Chengdu, China, Guangzhou, China]","[Chengdu, Guangzhou]","[CTU, CAN]","[Shuangliu Intl., Baiyun Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Guangzh...","[Chengdu (CTU), Guangzhou (CAN)]","[2017-06-20T16:00:00.000+08:00, 2017-06-21T08:...","[Guangzhou, China, Sydney, NSW]","[Guangzhou, Sydney]","[CAN, SYD]","[Baiyun Intl., Kingsford Smith Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Sydney, ...","[Guangzhou (CAN), Sydney (Kingsford Smith Intl.)]","[2017-06-20T18:25:00.000+08:00, 2017-06-21T19:...","[0, 0]","[Boeing 787, Airbus A330]","[, ]","[Segment, Segment]","[Sydney, NSW, Guangzhou, China]","[Sydney, Guangzhou]","[SYD, CAN]","[Kingsford Smith Intl., Baiyun Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Guan...","[Sydney (Kingsford Smith Intl.), Guangzhou (CAN)]","[2017-06-27T10:45:00.000+10:00, 2017-06-27T21:...","[Guangzhou, China, Chengdu, China]","[Guangzhou, Chengdu]","[CAN, CTU]","[Baiyun Intl., Shuangliu Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Chengdu,...","[Guangzhou (CAN), Chengdu (CTU)]","[2017-06-27T18:30:00.000+08:00, 2017-06-27T23:...","[0, 0]","[Airbus A330, Airbus A321]","[, ]","[Segment, Segment]",99,False,False,False,"[Guangzhou, China (CAN-Baiyun Intl.)]"
5,1007.92,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T23:55:00.000+08:00,2017-06-21T20:10:00.000+10:00,2017-06-27T11:40:00.000+10:00,2017-06-28T10:25:00.000+08:00,AirAsiaX,Xiamen Airlines,1215.0,1365.0,D7327,,1,1,"[Kuala Lumpur, Malaysia (KUL-Kuala Lumpur Intl.)]",[5h:30m],"[Xiamen, China (XMN-Xiamen Intl.)]",[12h:15m],99,7,D7,"[D7, D7]","[MF, MF]",CTU,SYD,SYD,CTU,75,D7.GIF,False,True,True,7,MF.gif,False,True,True,"[Chengdu, China, Kuala Lumpur, Malaysia]","[Chengdu, Kuala Lumpur]","[CTU, KUL]","[Shuangliu Intl., Kuala Lumpur Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Kuala L...","[Chengdu (CTU), Kuala Lumpur (Kuala Lumpur Int...","[2017-06-20T23:55:00.000+08:00, 2017-06-21T10:...","[Kuala Lumpur, Malaysia, Sydney, NSW]","[Kuala Lumpur, Sydney]","[KUL, SYD]","[Kuala Lumpur Intl., Kingsford Smith Intl.]","[Kuala Lumpur, Malaysia (KUL-Kuala Lumpur Intl...","[Kuala Lumpur (Kuala Lumpur Intl.), Sydney (Ki...","[2017-06-21T04:30:00.000+08:00, 2017-06-21T20:...","[0, 0]","[, ]","[, ]","[Segment, Segment]","[Sydney, NSW, Xiamen, China]","[Sydney, Xiamen]","[SYD, XMN]","[Kingsford Smith Intl., Xiamen Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Xiam...","[Sydney (Kingsford Smith Intl.), Xiamen (XMN)]","[2017-06-27T11:40:00.000+10:00, 2017-06-28T07:...","[Xiamen, China, Chengdu, China]","[Xiamen, Chengdu]","[XMN, CTU]","[Xiamen Intl., Shuangliu Intl.]","[Xiamen, China (XMN-Xiamen Intl.), Chengdu, Ch...","[Xiamen (XMN), Chengdu (CTU)]","[2017-06-27T19:20:00.000+08:00, 2017-06-28T10:...","[0, 0]","[Boeing 787, Boeing 737-800]","[, ]","[Segment, Segment]",99,False,False,False,"[Kuala Lumpur, Malaysia (KUL-Kuala Lumpur Intl.)]"
6,1058.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T16:00:00.000+08:00,2017-06-21T08:25:00.000+10:00,2017-06-27T10:45:00.000+10:00,2017-06-27T23:40:00.000+08:00,China Southern Airlines,China Southern Airlines,985.0,775.0,CZ3444,Boeing 787,1,1,"[Guangzhou, China (CAN-Baiyun Intl.)]",[2h:45m],"[Guangzhou, China (CAN-Baiyun Intl.)]",[2h:45m],8,8,CZ,"[CZ, CZ]","[CZ, CZ]",CTU,SYD,SYD,CTU,7,CZ.gif,False,True,True,7,CZ.gif,False,True,False,"[Chengdu, China, Guangzhou, China]","[Chengdu, Guangzhou]","[CTU, CAN]","[Shuangliu Intl., Baiyun Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Guangzh...","[Chengdu (CTU), Guangzhou (CAN)]","[2017-06-20T16:00:00.000+08:00, 2017-06-20T21:...","[Guangzhou, China, Sydney, NSW]","[Guangzhou, Sydney]","[CAN, SYD]","[Baiyun Intl., Kingsford Smith Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Sydney, ...","[Guangzhou (CAN), Sydney (Kingsford Smith Intl.)]","[2017-06-20T18:25:00.000+08:00, 2017-06-21T08:...","[0, 0]","[Boeing 787, Airbus A330]","[, ]","[Segment, Segment]","[Sydney, NSW, Guangzhou, China]","[Sydney, Guangzhou]","[SYD, CAN]","[Kingsford Smith Intl., Baiyun Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Guan...","[Sydney (Kingsford Smith Intl.), Guangzhou (CAN)]","[2017-06-27T10:45:00.000+10:00, 2017-06-27T21:...","[Guangzhou, China, Chengdu, China]","[Guangzhou, Chengdu]","[CAN, CTU]","[Baiyun Intl., Shuangliu Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Chengdu,...","[Guangzhou (CAN), Chengdu (CTU)]","[2017-06-27T18:30:00.000+08:00, 2017-06-27T23:...","[0, 0]","[Airbus A330, Airbus A321]","[, ]","[Segment, Segment]",99,False,False,False,"[Guangzhou, China (CAN-Baiyun Intl.)]"
7,1058.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T13:55:00.000+08:00,2017-06-21T08:25:00.000+10:00,2017-06-27T10:45:00.000+10:00,2017-06-27T23:40:00.000+08:00,China Southern Airlines,China Southern Airlines,1110.0,775.0,CZ3438,BOEING 777-300ER,1,1,"[Guangzhou, China (CAN-Baiyun Intl.)]",[4h:50m],"[Guangzhou, China (CAN-Baiyun Intl.)]",[2h:45m],8,8,CZ,"[CZ, CZ]","[CZ, CZ]",CTU,SYD,SYD,CTU,7,CZ.gif,False,True,True,7,CZ.gif,False,True,False,"[Chengdu, China, Guangzhou, China]","[Chengdu, Guangzhou]","[CTU, CAN]","[Shuangliu Intl., Baiyun Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Guangzh...","[Chengdu (CTU), Guangzhou (CAN)]","[2017-06-20T13:55:00.000+08:00, 2017-06-20T21:...","[Guangzhou, China, Sydney, NSW]","[Guangzhou, Sydney]","[CAN, SYD]","[Baiyun Intl., Kingsford Smith Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Sydney, ...","[Guangzhou (CAN), Sydney (Kingsford Smith Intl.)]","[2017-06-20T16:20:00.000+08:00, 2017-06-21T08:...","[0, 0]","[BOEING 777-300ER, Airbus A330]","[, ]","[Segment, Segment]","[Sydney, NSW, Guangzhou, China]","[Sydney, Guangzhou]","[SYD, CAN]","[Kingsford Smith Intl., Baiyun Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Guan...","[Sydney (Kingsford Smith Intl.), Guangzhou (CAN)]","[2017-06-27T10:45:00.000+10:00, 2017-06-27T21:...","[Guangzhou, China, Chengdu, China]","[Guangzhou, Chengdu]","[CAN, CTU]","[Baiyun Intl., Shuangliu Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Chengdu,...","[Guangzhou (CAN), Chengdu (CTU)]","[2017-06-27T18:30:00.000+08:00, 2017-06-27T23:...","[0, 0]","[Airbus A330, Airbus A321]","[, ]","[Segment, Segment]",99,False,False,False,"[Guangzhou, China (CAN-Baiyun Intl.)]"
8,1058.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T11:50:00.000+08:00,2017-06-21T08:25:00.000+10:00,2017-06-27T10:45:00.000+10:00,2017-06-27T23:40:00.000+08:00,China Southern Airlines,China Southern Airlines,1235.0,775.0,CZ3402,Airbus A330,1,1,"[Guangzhou, China (CAN-Baiyun Intl.)]",[6h:55m],"[Guangzhou, China (CAN-Baiyun Intl.)]",[2h:45m],8,8,CZ,"[CZ, CZ]","[CZ, CZ]",CTU,SYD,SYD,CTU,7,CZ.gif,False,True,True,7,CZ.gif,False,True,False,"[Chengdu, China, Guangzhou, China]","[Chengdu, Guangzhou]","[CTU, CAN]","[Shuangliu Intl., Baiyun Intl.]","[Chengdu, China (CTU-Shuangliu Intl.), Guangzh...","[Chengdu (CTU), Guangzhou (CAN)]","[2017-06-20T11:50:00.000+08:00, 2017-06-20T21:...","[Guangzhou, China, Sydney, NSW]","[Guangzhou, Sydney]","[CAN, SYD]","[Baiyun Intl., Kingsford Smith Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Sydney, ...","[Guangzhou (CAN), Sydney (Kingsford Smith Intl.)]","[2017-06-20T14:15:00.000+08:00, 2017-06-21T08:...","[0, 0]","[Airbus A330, Airbus A330]","[, ]","[Segment, Segment]","[Sydney, NSW, Guangzhou, China]","[Sydney, Guangzhou]","[SYD, CAN]","[Kingsford Smith Intl., Baiyun Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Guan...","[Sydney (Kingsford Smith Intl.), Guangzhou (CAN)]","[2017-06-27T10:45:00.000+10:00, 2017-06-27T21:...","[Guangzhou, China, Chengdu, China]","[Guangzhou, Chengdu]","[CAN, CTU]","[Baiyun Intl., Shuangliu Intl.]","[Guangzhou, China (CAN-Baiyun Intl.), Chengdu,...","[Guangzhou (CAN), Chengdu (CTU)]","[2017-06-27T18:30:00.000+08:00, 2017-06-27T23:...","[0, 0]","[Airbus A330, Airbus A321]","[, ]","[Segment, Segment]",99,False,False,False,"[Guangzhou, China (CAN-Baiyun Intl.)]"
9,1185.14,1.1,2017-06-08,flight_10_1_price,57,AUD,Chengdu,sydney,2,2017-06-20,2017-06-27,7,2017-06-20T23:10:00.000+08:00,2017-06-21T16:50:00.000+10:00,2017-06-27T11:40:00.000+10:00,2017-06-28T10:25:00.000+08:00,,Xiamen Airlines,1060.0,1365.0,MI937,Boeing 737-800,1,1,"[Singapore, Singapore (SIN-Changi)]",[3h:0m],"[Xiamen, China (XMN-Xiamen Intl.)]",[12h:15m],8,7,MI,"[MI, SQ]","[MF, MF]",CTU,SYD,SYD,CTU,7,,False,True,True,7,MF.gif,False,True,True,"[Chengdu, China, Singapore, Singapore]","[Chengdu, Singapore]","[CTU, SIN]","[Shuangliu Intl., Changi]","[Chengdu, China (CTU-Shuangliu Intl.), Singapo...","[Chengdu (CTU), Singapore (Changi)]","[2017-06-20T23:10:00.000+08:00, 2017-06-21T07:...","[Singapore, Singapore, Sydney, NSW]","[Singapore, Sydney]","[SIN, SYD]","[Changi, Kingsford Smith Intl.]","[Singapore, Singapore (SIN-Changi), Sydney, NS...","[Singapore (Changi), Sydney (Kingsford Smith I...","[2017-06-21T04:10:00.000+08:00, 2017-06-21T16:...","[0, 0]","[Boeing 737-800, BOEING 777-300ER]","[, ]","[Segment, Segment]","[Sydney, NSW, Xiamen, China]","[Sydney, Xiamen]","[SYD, XMN]","[Kingsford Smith Intl., Xiamen Intl.]","[Sydney, NSW (SYD-Kingsford Smith Intl.), Xiam...","[Sydney (Kingsford Smith Intl.), Xiamen (XMN)]","[2017-06-27T11:40:00.000+10:00, 2017-06-28T07:...","[Xiamen, China, Chengdu, China]","[Xiamen, Chengdu]","[XMN, CTU]","[Xiamen Intl., Shuangliu Intl.]","[Xiamen, China (XMN-Xiamen Intl.), Chengdu, Ch...","[Xiamen (XMN), Chengdu (CTU)]","[2017-06-27T19:20:00.000+08:00, 2017-06-28T10:...","[0, 0]","[Boeing 787, Boeing 737-800]","[, ]","[Segment, Segment]",99,False,False,False,"[Singapore, Singapore (SIN-Changi)]"
