<p style="font-weight:bold; font-size:40px; color:green; line-height:1; margin:0px">
    Smart City Applications in Land Use and Transport (SCALUT)
</p>

## TfNSW GTFS-R Bus Trip Update 

<p style="font-weight:bold; font-size:24px; color:Gold; line-height:1; margin:4px 0px">
    1.2 Transform .CSV Files
</p>

<p style="font-weight:bold; font-size:18px; color:tomato; line-height:1; margin:4px 0px">
    Housekeeping: Import Libraries/Packages
</p>

In [None]:
import sys
import os
from datetime import datetime
import pandas as pd
import glob
import time
from zipfile import ZipFile
from GTFS_DPL_Funcs import *

<p style="font-weight:bold; font-size:18px; color:tomato; line-height:1; margin:4px 0px">
    Specify Project Directory and Folders and Define Variables
</p>

In [None]:
## Specify the main directory that holds the input and output folders
DataDir = r'C:\OneMetis Dropbox\@One.IMS\Datasets\SCALUT_DW\TfNSW_GTFS_Buses'

## Specify the sub-folder (time period) to be processed. The same name is appended
## to each stage folder below and is used as the key into StaticIdLkUp.
FileTP = 'Test_201014_0800-0805'
## NOTE(review): DayInMonth is not referenced by any visible cell - confirm it is still needed
DayInMonth = 14

## Specify the GTFS-R file prefix (raw input files are matched as '<prefix>*.csv')
GTFS_TU_Prefix = 'GTFS_TU'

## Specify the stage folders that store input and output data
# FldRawPB = '10_Raw_PB'
FldRawCSVtu = '11_CSV_Raw_TU'
FldTransTU = '12_CSV_Transformed_TU'
FldClnTU = '13_CSV_Cleaned_Unique_TU'

## Filter by Agency
## NOTE(review): Flt_Agency is not referenced by any visible cell - presumably read elsewhere; confirm
Flt_Agency = 'Premier Illawarra'

In [None]:
## Specify the folder that stores the GTFS Static data
FldRawStatic = '10_Raw_Static'

## Map each processing period (FileTP) to the identifier of the GTFS Static
## bundle that should be used for it.
StaticIdLkUp = {
    # 'FileTP': 'StaticId',
    'Test_201014_0800-0805': '20201001191000',
    '2020m06': '20200601190600',
    '2020m07': '20200701190700',
    '2020m08': '20200803190800',
    '2020m09': '20200901190900',
    '2020m10': '20201001191000',
    '2020m11': '20201102191100',
    '2020m12': '20201201191200',
}

## Fail fast on an unknown period. Only printing the error would leave
## FileIdStatic undefined (fresh kernel) or, worse, silently stale from a
## previous run, so later cells would read the wrong static bundle.
if FileTP not in StaticIdLkUp:
    raise KeyError(f"ERROR: '{FileTP}' is not a key within the StaticIdLkUp.")

FileIdStatic = StaticIdLkUp[FileTP]
print(FileIdStatic)

In [None]:
## Directory paths: one sub-folder per processing period (FileTP) inside each stage folder
DirRawCSVtu = os.path.join(DataDir, FldRawCSVtu, FileTP)
DirTransTU = os.path.join(DataDir, FldTransTU, FileTP)
DirClnTU = os.path.join(DataDir, FldClnTU, FileTP)

## Create the folders when missing. exist_ok=True replaces the separate
## "exists?" check, so there is no window between checking and creating, and
## a path that exists as a regular file is reported instead of being ignored.
os.makedirs(DirRawCSVtu, exist_ok=True)
os.makedirs(DirTransTU, exist_ok=True)
os.makedirs(DirClnTU, exist_ok=True)

<p style="font-weight:bold; font-size:18px; color:tomato; line-height:1; margin:4px 0px">
    Get Information from GTFS Static
</p>

In [None]:
## Path of the GTFS Static bundle that matches the selected period
FileStaticZip = 'complete_gtfs_scheduled_data_' + FileIdStatic + '.zip'
DirStaticZip = os.path.join(DataDir, FldRawStatic, FileStaticZip)

## Read stop_times.txt straight from the archive. Both the archive and the
## member handle are opened in a `with` block so they are closed as soon as
## the DataFrame has been built (ZipStatic is closed after this cell).
## NOTE(review): the plain 'int' dtypes make read_csv raise if those columns
## contain blank values; switch them to 'Int64' if the feed can have blanks.
with ZipFile(DirStaticZip) as ZipStatic:
    with ZipStatic.open('stop_times.txt') as StTimesFile:
        df_StTimes = pd.read_csv(
            StTimesFile,
            dtype={
                'trip_id': 'str',
                'arrival_time': 'str',
                'departure_time': 'str',
                'stop_id': 'str',
                'stop_sequence': 'Int64',
                'stop_headsign': 'str',
                'pickup_type': 'int',
                'drop_off_type': 'int',
                'shape_dist_traveled': 'float',
                'timepoint': 'int',
                'stop_note': 'str',
            },
        )

<p style="font-weight:bold; font-size:18px; color:tomato; line-height:1; margin:4px 0px">
    FOR ARTEMIS: Combine Complete Raw CSV Files
</p>

In [None]:
## Record Start Time
tStart = datetime.now()
print('PROCESSING DATA FOR', FileTP, "...")
print('Time Start:', tStart.isoformat(' ', 'seconds'))

## Define output file paths
PathTransTUrtNoROT = os.path.join(DirTransTU, GTFS_TU_Prefix + '_' + FileTP + '_NoROT.csv')
PathTransTUrtCln = os.path.join(DirClnTU, GTFS_TU_Prefix + '_' + FileTP + '_Cln.csv')

## Collect the raw TU CSV files for this period. glob returns files in an
## OS-dependent order, so sort them to make the concatenation order (and
## therefore the output row order) reproducible.
all_files = sorted(glob.glob(os.path.join(DirRawCSVtu, GTFS_TU_Prefix + '*.csv')))
iFile = len(all_files)

## Stop before touching any previous output when there is nothing to process;
## otherwise the steps below would fail on an undefined (or stale) DataFrame.
if iFile == 0:
    raise FileNotFoundError(f"No '{GTFS_TU_Prefix}*.csv' files found in: {DirRawCSVtu}")

## Remove outputs of a previous run so a failure part-way cannot leave stale files behind
if os.path.exists(PathTransTUrtNoROT):
    os.remove(PathTransTUrtNoROT)
if os.path.exists(PathTransTUrtCln):
    os.remove(PathTransTUrtCln)

## Read every raw TU CSV file and remove its ROT records (helpers come from GTFS_DPL_Funcs)
frames_NoROT = [Df_Remove_ROT(Read_CSV_Raw_TU(f)) for f in all_files]

## Combine all files with a single concat instead of re-copying the growing
## frame on every iteration. A single input file is used as-is (no concat).
if iFile == 1:
    df_Con_NoROT1 = frames_NoROT[0]
else:
    df_Con_NoROT1 = pd.concat(frames_NoROT, ignore_index=True)

## Calculate Scheduled ArrivalTime
df_Con_NoROT1 = Df_SchArrTime(df_Con_NoROT1)

## Get Stop Sequence from GTFS Static
df_Con_NoROT2 = Df_GetStaticStopSeq(df_Con_NoROT1, df_StTimes)

## Get shape_dist_traveled from GTFS Static
df_Con_NoROT3 = Df_GetStaticDist(df_Con_NoROT2, df_StTimes)

## Flag Bad Observations
df_Con_NoROT4 = Df_FlagBad(df_Con_NoROT3)

## Export the transformed (ROT-removed) records to CSV
df_Con_NoROT = df_Con_NoROT4
df_Con_NoROT.to_csv(PathTransTUrtNoROT, index=False)

## Clean Duplicate Data
df_ConTU_Cln = Df_Remove_Duplicate(df_Con_NoROT)
## Export Cleaned Data to CSV
df_ConTU_Cln.to_csv(PathTransTUrtCln, index=False)

## Record End Time
tEnd = datetime.now()
print(iFile, 'Files Processed:', tEnd.isoformat(' ', 'seconds') + '; Time Spent:', tEnd-tStart)
print('After ROT Removed:', df_Con_NoROT.shape)
print('Cleaned:', df_ConTU_Cln.shape)
print('COMPLETED ON', datetime.now())
print('Transformed file saved in:', PathTransTUrtNoROT)
print('Cleaned file saved in:', PathTransTUrtCln)