## Data integration script

The aim of this task is to integrate in the same file the following data:
- GPS;
- Weather;
- Waze;
- GTFS

Example of final file attributes (to use in ML algorithms):


The task has the following steps:
1. Clean the data (removing missing/wrong data);
2. Label shape file with route type (low/high frequency)
3. Update and run BULMA (matching of GPS and GTFS);
4. Update and run BUSTE (interpolate stops timestamp)
5. Label each GPS with headway value and BB (headway, BB, id_bus_bb)
6. Label with precipitation (precipitation)

In [10]:
import utm
import os
import pandas as pd
from datetime import datetime
from numpy import median
import math
import pandas

### Data example

#### GPS
- bus_code
- timestamp
- route
- latitude
- longitude

#### Weather

#### Waze

#### GTFS

Pre-processing the GPS data of Recife: convert the UTM coordinates to latitude/longitude and split the data into separate files per month and then per day.

In [None]:
def convertCoordinates(x, y):
    """Convert a UTM coordinate pair (zone 25, band 'M') to latitude/longitude.

    Args:
        x: first UTM coordinate as a string of digits, or '-' / '0' when the
            value is missing.
        y: second UTM coordinate as a string of digits, or '-' / '0' when the
            value is missing.

    Returns:
        ['-', '-'] when either coordinate is missing; otherwise the result of
        utm.to_latlon(x, y, 25, 'M') (presumably a (latitude, longitude)
        pair - see the utm package documentation).

    Raises:
        ValueError: if a non-missing coordinate is not an integer string.
    """
    missing_markers = ('-', '0')
    if x in missing_markers or y in missing_markers:
        return ['-', '-']

    # int() instead of the Python 2-only long(): on Python 2 int() promotes to
    # long automatically, and on Python 3 long() does not exist.
    return utm.to_latlon(int(x), int(y), 25, 'M')
    
def separateGPSFilePerMonth(file_path):
    """Split GPS_data.csv into per-month CSV files, converting the coordinates.

    Reads ``file_path + "GPS_data.csv"`` and writes every row whose timestamp
    column (index 1) contains '2018-10', '2018-11' or '2018-12' to
    GPS_data_october.csv, GPS_data_november.csv or GPS_data_december.csv in
    the same directory. Columns 4 and 5 of each row are replaced by the two
    values returned by convertCoordinates; rows of any other month are dropped.

    Args:
        file_path: directory prefix including the trailing separator (file
            names are appended by plain string concatenation).
    """
    DELIMITER = ','
    columns_name = DELIMITER.join([
        "Unidad", "Instante", "Estado", "Comunica", "CoordX", "CoordY",
        "Linea", "Ruta", "Posicion", "Viaje", "Velocidad",
    ])
    expected_fields = 11

    # "with" guarantees that all four files are flushed and closed, even when
    # a malformed row raises half-way through the input.
    with open(file_path + 'GPS_data_october.csv', 'w') as october_file, \
            open(file_path + 'GPS_data_november.csv', 'w') as november_file, \
            open(file_path + 'GPS_data_december.csv', 'w') as december_file, \
            open(file_path + "GPS_data.csv", 'r') as gps_data:
        # Checked in this order; the first month found in the date wins.
        month_outputs = (
            ('2018-10', october_file),
            ('2018-11', november_file),
            ('2018-12', december_file),
        )
        for _, month_file in month_outputs:
            month_file.write(columns_name + '\n')

        next(gps_data, None)  # skip the header (tolerates an empty file)
        for line in gps_data:
            line_splitted = line.split(',')
            if len(line_splitted) < expected_fields:
                continue  # blank or truncated row: nothing usable to write

            date = line_splitted[1]
            new_coordinates = convertCoordinates(line_splitted[4], line_splitted[5])

            new_line = DELIMITER.join(
                line_splitted[:4]
                + [str(new_coordinates[0]), str(new_coordinates[1])]
                + line_splitted[6:expected_fields]
            )
            # The line terminator lives in the last input field; when a row has
            # extra fields (or is the unterminated last row) it must be re-added
            # so consecutive rows are not glued together.
            if not new_line.endswith('\n'):
                new_line += '\n'

            for month, month_file in month_outputs:
                if month in date:
                    month_file.write(new_line)
                    break

def separateGPSFilePerDay(file_path, month_file_name="GPS_data_october.csv"):
    """Split a monthly GPS CSV into one CSV file per day.

    Rows are grouped by the date part (text before the first space) of the
    timestamp column (index 1) and written, unchanged and in input order, to
    ``file_path + 'GPS_data_<date>.csv'``, each file starting with the
    standard header line.

    Args:
        file_path: directory prefix including the trailing separator (file
            names are appended by plain string concatenation).
        month_file_name: name of the monthly file to split; defaults to the
            October file, which was previously the only one supported.
    """
    DELIMITER = ','
    columns_name = DELIMITER.join([
        "Unidad", "Instante", "Estado", "Comunica", "CoordX", "CoordY",
        "Linea", "Ruta", "Posicion", "Viaje", "Velocidad",
    ])

    date_lines_dict = {}
    with open(file_path + month_file_name, 'r') as gps_data:
        next(gps_data, None)  # skip the header (tolerates an empty file)
        for line in gps_data:
            line_splitted = line.split(',')
            if len(line_splitted) < 2:
                continue  # blank row: there is no timestamp to group by
            date = line_splitted[1].split(' ')[0]
            date_lines_dict.setdefault(date, []).append(line)

    for date, day_lines in date_lines_dict.items():
        # "with" closes each daily file even if a write fails.
        with open(file_path + 'GPS_data_' + date + '.csv', 'w') as new_file:
            new_file.write(columns_name + '\n')
            new_file.writelines(day_lines)
            
# Directory with the Recife GPS dump, resolved relative to the current working
# directory; the trailing slash is required because file names are appended
# to this string directly.
file_path = os.getcwd() + "/../data/input/Recife/GPS/"

# Step 1: split GPS_data.csv into the monthly files.
separateGPSFilePerMonth(file_path)

# Step 2: split the monthly file produced above into one file per day.
separateGPSFilePerDay(file_path)

### 1. Clean the data
Removing missing/wrong data

#### 1.1 Waze

In [None]:
# Waze
# Removing lines from different city

def removeDifferentCity(dir_name, city_label):
    """Copy every CSV of dir_name into dir_name/clean/, keeping one city only.

    For each ``*.csv`` file directly inside ``dir_name`` a file with the same
    name is written to the ``clean/`` sub-folder. The header line is copied
    as-is; a data row is kept only when its second comma-separated field
    contains ``city_label``.

    Args:
        dir_name: directory prefix including the trailing separator (file
            names are appended by plain string concatenation).
        city_label: text that must occur in the second field of a row.
    """
    clean_dir = dir_name + 'clean/'
    # Create the output folder on demand instead of requiring a manual step.
    if not os.path.isdir(clean_dir):
        os.makedirs(clean_dir)

    for file_name in os.listdir(dir_name):
        if not file_name.endswith(".csv"):  # skips folders and other files
            continue

        # "with" closes both files even when a row raises.
        with open(dir_name + file_name, 'r') as waze_data, \
                open(clean_dir + file_name, 'w') as new_file:
            new_file.write(next(waze_data, ''))  # header; '' for an empty file

            for line in waze_data:
                line_splitted = line.split(',')
                # Blank rows have no second field and are dropped.
                if len(line_splitted) > 1 and city_label in line_splitted[1]:
                    new_file.write(line)
            
# Alert: the filtered files are written to the 'clean' sub-folder of dir_path;
# set the path and the city name before running.
dir_path = os.getcwd() + "/../data/input/Recife/Waze/"
city = 'Recife'
removeDifferentCity(dir_path, city)

#### 1.2 GPS

Removing Recife data without *linea* (route) or *CoordX* or *CoordY*

In [None]:
# GPS - Recife
# Removing lines without linea or CoordX or CoordY

def cleanRecifeGPS(dir_path):
    """Copy every CSV of dir_path into dir_path/clean/, dropping incomplete rows.

    The header line is copied as-is. A data row is kept only when its route
    (field 6) is non-empty and both coordinates (fields 4 and 5) are neither
    empty nor the '-' placeholder.

    Args:
        dir_path: directory prefix including the trailing separator (file
            names are appended by plain string concatenation).
    """
    clean_dir = dir_path + 'clean/'
    # Create the output folder on demand instead of requiring a manual step.
    if not os.path.isdir(clean_dir):
        os.makedirs(clean_dir)

    missing_coordinate = ('-', '')

    for file_name in os.listdir(dir_path):
        if not file_name.endswith(".csv"):  # skips folders and other files
            continue

        # "with" closes both files even when a row raises.
        with open(dir_path + file_name, 'r') as gps_data, \
                open(clean_dir + file_name, 'w') as new_file:
            new_file.write(next(gps_data, ''))  # header; '' for an empty file

            for line in gps_data:
                line_splitted = line.split(',')
                if len(line_splitted) < 7:
                    continue  # blank or truncated row: required fields missing

                lat = line_splitted[4]
                lon = line_splitted[5]
                route = line_splitted[6]

                if (route != '' and lat not in missing_coordinate
                        and lon not in missing_coordinate):
                    new_file.write(line)
                
# Alert: the cleaned files are written to the 'clean' sub-folder of dir_path
# (not 'csv'); every *.csv file directly inside dir_path is processed.
dir_path = os.getcwd() + "/../data/input/Recife/GPS/"
cleanRecifeGPS(dir_path)

In [None]:
# GPS - Curitiba
# Converting json file to csv file

# Alert: first, replace }{ by },{ 
# sed -i ':a;N;$!ba;s/}/},/g' *.json

def convertJSON2CSV(dir_path):
    """Convert the line-per-record JSON files of dir_path into CSV files.

    For each ``*.json`` file directly inside ``dir_path`` a CSV named after the
    part of the file name before its first dot is written to the ``csv/``
    sub-folder. The first line of each input file is skipped. Every other line
    is split on ',' and, for the first five pieces, the text after the first
    ``":`` is taken as the value (quotes are kept); the last character of the
    fifth value is stripped (presumably the closing brace of the record).

    Args:
        dir_path: directory prefix including the trailing separator (file
            names are appended by plain string concatenation).
    """
    DELIMITER = ','
    columns_name = DELIMITER.join(["bus_code", "lat", "lon", "timestamp", "route"])
    value_count = 5

    csv_dir = dir_path + 'csv/'
    # Create the output folder on demand instead of requiring a manual step.
    if not os.path.isdir(csv_dir):
        os.makedirs(csv_dir)

    for file_name in os.listdir(dir_path):
        if not file_name.endswith(".json"):  # skips folders and other files
            continue

        csv_name = file_name.split('.')[0] + '.csv'

        # "with" makes sure the CSV is flushed and closed; the output handle
        # used to be left open.
        with open(dir_path + file_name, 'r') as gps_data, \
                open(csv_dir + csv_name, 'w') as new_file:
            new_file.write(columns_name + "\n")

            next(gps_data, None)  # skip the first (empty) line
            for line in gps_data:
                line_splitted = line.split(',')
                if len(line_splitted) < value_count:
                    continue  # blank or truncated line: not a full record

                values = [piece.split('":')[1] for piece in line_splitted[:value_count]]
                values[-1] = values[-1][:-1]  # drop the trailing character

                new_file.write(DELIMITER.join(values) + "\n")
            
# Directory with the Curitiba GPS JSON files; the converted CSV files go to
# its 'csv' sub-folder.
dir_path = os.getcwd() + "/../data/input/Curitiba/GPS/"
convertJSON2CSV(dir_path)

### 2. Label shape file with route type

Label shape file with route type: high frequency or low frequency based on *headway median of the city (h_median)*.

- **High frequency:** headway median of the route <= h_median
- **Low frequency:** headway median of the route > h_median (also used when no headway could be computed for the route)

<img src="trips_file_example.png">

The *stop_times* file has the information:
- Buses: all the schedules of a bus are grouped together, e.g. the schedules of the first bus are the first lines (from 5:00 to 23:00)
- Headway: the difference between the *arrival_time* of a bus and that of the following bus.

**Challenge: how to separate different buses to compare headways?**

In [3]:
# Alert: set the path with the city name.
# These three module-level names are read directly by getRouteTripsMap and
# getTripsStopsTimesMap below (they are not passed as arguments).
dir_path = os.getcwd() + "/../data/input/Curitiba/GTFS/"
trips_file = 'trips.txt'
stop_times_file = 'stop_times.txt'

# Read trips.txt to get the route of the trips
def getRouteTripsMap():
    """Group the trip ids of the trips file by route.

    Reads ``dir_path + trips_file`` (module-level names), skips the header and
    takes column 0 as the route and column 2 as the trip id of every row.

    Returns:
        dict mapping each route to the list of its trip ids, in file order.
    """
    trips_by_route = {}

    with open(dir_path + trips_file, 'r') as trips_data:
        next(trips_data)  # the first line is the header

        for row in trips_data:
            columns = row.split(',')
            route, trip_id = columns[0], columns[2]
            trips_by_route.setdefault(route, []).append(trip_id)

    return trips_by_route
    

# Read stop_times.txt to get the stop_times of each trip
def getTripsStopsTimesMap(route_trips_map):
    """Collect, per route and stop, the arrival times of the stop_times file.

    Reads ``dir_path + stop_times_file`` (module-level names), skips the header
    and takes column 0 as the trip id, column 1 as the arrival time and
    column 3 as the stop id of every row.

    Args:
        route_trips_map: dict mapping a route to the list of its trip ids, as
            returned by getRouteTripsMap.

    Returns:
        dict of dicts: ``result[route][stop_id]`` is the list of arrival times
        (strings, in file order) of all trips of that route at that stop.
        Rows whose trip id belongs to no route are ignored.
    """
    # Invert route -> trips once, so each stop_times row needs a single dict
    # lookup instead of scanning every route's trip list. When a trip id is
    # listed under several routes the last route iterated wins, exactly as the
    # former per-row scan (which never stopped at the first match) did.
    trip_route = {}
    for route, trips in route_trips_map.items():
        for trip_id in trips:
            trip_route[trip_id] = route

    route_stops_times = {}

    with open(dir_path + stop_times_file, 'r') as stops_times_data:
        next(stops_times_data, None)  # skip header (tolerates an empty file)

        for line in stops_times_data:
            splitted_line = line.split(',')
            if len(splitted_line) < 4:
                continue  # blank or truncated row

            trip_id = splitted_line[0]
            arrival_time = splitted_line[1]
            stop_id = splitted_line[3]

            route = trip_route.get(trip_id)
            if route is None:
                # Unknown trip: previously the route of the preceding row was
                # silently reused (or a NameError was raised on the first row).
                continue

            stops = route_stops_times.setdefault(route, {})
            stops.setdefault(stop_id, []).append(arrival_time)

    return route_stops_times


# Calculate the headway(difference between two timestamps) in seconds
def getHeadway(time1, time2):
    """Return the headway, in seconds, between two "H:M:S" clock strings.

    The result is ``time2 - time1`` reduced modulo one day (86400 s), so it is
    always in ``0..86399``; when time2 is earlier than time1 the difference
    wraps around midnight (e.g. 10:00:00 -> 09:00:00 gives 82800). Hours of 24
    or more, which GTFS uses for trips running past midnight, are accepted.

    Args:
        time1: earlier time, formatted "H:M:S".
        time2: later time, formatted "H:M:S".

    Returns:
        int number of seconds.

    Raises:
        ValueError: if a string is not three integer parts or the minutes or
            seconds are outside 0..59.
    """
    def to_seconds(clock):
        # Parsed by hand because strptime("%H") rejects hours >= 24.
        hours, minutes, seconds = [int(part) for part in clock.split(':')]
        if hours < 0 or not 0 <= minutes < 60 or not 0 <= seconds < 60:
            raise ValueError("invalid time: %r" % (clock,))
        return hours * 3600 + minutes * 60 + seconds

    # The modulo reproduces timedelta.seconds, which is what was returned before.
    return (to_seconds(time2) - to_seconds(time1)) % 86400


In [4]:
# Load the GTFS data once: route -> trip ids, then route -> stop -> arrival times.
route_trips_map = getRouteTripsMap()
route_stops_times = getTripsStopsTimesMap(route_trips_map)

In [5]:
# Calculate headway median of each route;
# headway median of the city.
def calculateRoutesHeadways(route_stops_times):
    """Compute the median headway of every route and of the whole city.

    For each stop of a route the arrival times are sorted and the headway
    between each pair of consecutive times is computed with getHeadway. The
    route headway is the median over all of its stops; the city headway is the
    median of the route medians.

    Args:
        route_stops_times: ``{route: {stop_id: [arrival time strings]}}`` as
            returned by getTripsStopsTimesMap.

    Returns:
        tuple ``(median_headway_city, route_headway_median_map)``; routes with
        no computable headway are absent from the map, and the city median is
        NaN when no route has one.
    """
    route_headway_median_map = {}
    route_medians = []

    def clock_key(clock):
        # A tuple of ints sorts numerically on both Python 2 and 3; the former
        # key was a map object, which cannot be ordered on Python 3.
        return tuple(int(part) for part in clock.split(':'))

    for route, stopstimes in route_stops_times.items():
        route_headways = []

        for times in stopstimes.values():
            previous = None

            for current in sorted(times, key=clock_key):
                # Skip the first timestamp of the stop, and any pair touching a
                # '24:' time (treated as wrong data).
                if (previous is not None
                        and not current.startswith('24:')
                        and not previous.startswith('24:')):
                    # Median rather than mean is used below because some routes
                    # only run at specific times.
                    route_headways.append(getHeadway(previous, current))

                previous = current

        if route_headways:
            headway_route = median(route_headways)
            route_headway_median_map[route] = headway_route
            route_medians.append(headway_route)

    # numpy's median of an empty list is NaN plus a warning; return NaN quietly.
    median_headway_city = median(route_medians) if route_medians else float('nan')
    return (median_headway_city, route_headway_median_map)
    
# City-wide median headway and per-route median headways, used for labelling below.
median_headway_city, route_headway_median_map = calculateRoutesHeadways(route_stops_times)

In [17]:
#TODO run to CG and Recife

# Update routes.txt file with frequency (high or low) label
# based on its median headway and the median headway city
def updateRoutesFile(file_path, median_headway_city, route_headway_median_map):
    """Write routes_label.txt: routes.txt plus a 'frequency' column.

    The new column is inserted just before the last column of every line (so
    the line terminator, carried by the last column, stays at the end). A
    route is labelled 'high_frequency' when its median headway is <= the city
    median, and 'low_frequency' otherwise or when it has no headway. One
    summary line per route is printed.

    Args:
        file_path: directory prefix including the trailing separator (file
            names are appended by plain string concatenation).
        median_headway_city: median headway of the city, in seconds.
        route_headway_median_map: dict mapping a route id (first column of
            routes.txt, as a string) to its median headway in seconds.
    """
    DELIMITER = ','

    def insert_before_last(fields, value):
        # Works for any number of columns; previously exactly nine were
        # required (more were dropped, fewer raised IndexError).
        return DELIMITER.join(fields[:-1] + [value] + fields[-1:])

    # "with" closes both files even when a row raises.
    with open(file_path + 'routes.txt', 'r') as routes_data, \
            open(file_path + 'routes_label.txt', 'w') as new_routes_file:
        header = next(routes_data, None)
        if header is None:
            return  # empty routes.txt: nothing to label

        new_routes_file.write(insert_before_last(header.split(DELIMITER), 'frequency'))

        for line in routes_data:
            if not line.strip():
                continue  # ignore blank lines

            line_splitted = line.split(DELIMITER)
            route = str(line_splitted[0])

            # Default when there is no data to calculate the headway or when
            # the route headway is above the city headway.
            label = 'low_frequency'
            headway = '-'

            if (route in route_headway_median_map):
                headway = route_headway_median_map[route]
                if (headway <= median_headway_city):
                    label = 'high_frequency'

            print('Route: ' + route, 'headway_city: ' + str(median_headway_city), 
                  'headway_route: ' + str(headway), 'label: ' + label)

            new_routes_file.write(insert_before_last(line_splitted, label))

# Alert: change the city name in the path below.
# Reads routes.txt from this directory and writes routes_label.txt next to it.
file_path = os.getcwd() + "/../data/input/Curitiba/GTFS/"
updateRoutesFile(file_path, median_headway_city, route_headway_median_map)

('Route: 1', 'headway_city: 1413.0', 'headway_route: 900.0', 'label: high_frequency')
('Route: 6', 'headway_city: 1413.0', 'headway_route: 1823.0', 'label: low_frequency')
('Route: 7', 'headway_city: 1413.0', 'headway_route: 1200.0', 'label: high_frequency')
('Route: 12', 'headway_city: 1413.0', 'headway_route: 2040.0', 'label: low_frequency')
('Route: 16', 'headway_city: 1413.0', 'headway_route: 1612.0', 'label: low_frequency')
('Route: 17', 'headway_city: 1413.0', 'headway_route: 639.0', 'label: high_frequency')
('Route: 20', 'headway_city: 1413.0', 'headway_route: 1680.0', 'label: low_frequency')
('Route: 21', 'headway_city: 1413.0', 'headway_route: 1080.0', 'label: high_frequency')
('Route: 22', 'headway_city: 1413.0', 'headway_route: 2100.0', 'label: low_frequency')
('Route: 25', 'headway_city: 1413.0', 'headway_route: 1345.0', 'label: high_frequency')
('Route: 27', 'headway_city: 1413.0', 'headway_route: 1200.0', 'label: high_frequency')
('Route: 29', 'headway_city: 1413.0', 'hea

Each stop time belongs to one trip, i.e. the total number of trips is the total number of timestamps there.

**Observation:**

- In general, the scheduled headway of Curitiba is 1413 seconds (23 minutes).
- In general, the scheduled headway of Campina Grande is 1413 seconds (23 minutes).
- In general, the scheduled headway of Recife is 1413 seconds (23 minutes).