## Data integration script

The aim of this task is to integrate the following data sources into a single file:
- GPS;
- Weather;
- Waze;
- GTFS

Example of final file attributes (to use in ML algorithms):


The task has the following steps:
1. Clean the data (remove missing/wrong data);
2. Label the shape file with the route type (low/high frequency);
3. Update and run BULMA (matching of GPS and GTFS data);
4. Update and run BUSTE (interpolation of stop timestamps);
5. Label each GPS record with its headway value and BB (headway, BB, id_bus_bb);
6. Label each record with precipitation (precipitation).

In [1]:
import utm
import os
import pandas as pd

### Data example

#### GPS

#### Weather

#### Waze

#### GTFS

Pre-process the Recife GPS data: convert the coordinates and split the data into separate files per month and per day.

In [None]:
def convertCoordinates(x, y):
    """Convert a UTM coordinate pair (zone 25M) with ``utm.to_latlon``.

    Args:
        x: Easting as a string, as read from the CoordX column.
        y: Northing as a string, as read from the CoordY column.

    Returns:
        ``['-', '-']`` when either value is the placeholder ``'-'`` or
        ``'0'``; otherwise whatever ``utm.to_latlon`` returns for the pair.

    Raises:
        ValueError: If a non-placeholder value is not an integer literal.
    """
    placeholders = ('-', '0')
    if x in placeholders or y in placeholders:
        return ['-', '-']

    # `long` only exists on Python 2 (NameError on Python 3). `int` behaves
    # the same there because it promotes to long automatically.
    return utm.to_latlon(int(x), int(y), 25, 'M')
    
def separateGPSFilePerMonth(file_path):
    """Split ``GPS_data.csv`` into one file per month, converting coordinates.

    Reads ``file_path + 'GPS_data.csv'``, skips its header line, replaces the
    CoordX/CoordY fields (indexes 4 and 5) with the values returned by
    ``convertCoordinates`` and writes each row to the October, November or
    December 2018 file according to the ``Instante`` field (index 1).
    Rows whose timestamp matches none of those months are dropped.

    Args:
        file_path: Directory prefix, including the trailing path separator,
            that holds ``GPS_data.csv``. The monthly files are written to
            the same location.

    Raises:
        IndexError: If a data row has fewer than 11 comma-separated fields.
    """
    DELIMITER = ','
    columns_name = DELIMITER.join([
        "Unidad", "Instante", "Estado", "Comunica", "CoordX", "CoordY",
        "Linea", "Ruta", "Posicion", "Viaje", "Velocidad",
    ])

    # Context managers guarantee the three outputs are flushed and closed,
    # even if a malformed row aborts the loop half-way through.
    with open(file_path + 'GPS_data_october.csv', 'w') as october_file, \
            open(file_path + 'GPS_data_november.csv', 'w') as november_file, \
            open(file_path + 'GPS_data_december.csv', 'w') as december_file:
        # (timestamp substring, destination) pairs, checked in this order.
        month_files = (
            ('2018-10', october_file),
            ('2018-11', november_file),
            ('2018-12', december_file),
        )
        for _, month_file in month_files:
            month_file.write(columns_name + '\n')

        with open(file_path + "GPS_data.csv", 'r') as gps_data:
            # Skip the header; the default keeps an empty file from raising.
            next(gps_data, None)
            for line in gps_data:
                fields = line.split(DELIMITER)
                date = fields[1]
                new_coordinates = convertCoordinates(fields[4], fields[5])

                new_line = DELIMITER.join(
                    fields[0:4]
                    + [str(new_coordinates[0]), str(new_coordinates[1])]
                    + [fields[6], fields[7], fields[8], fields[9], fields[10]]
                )
                # The newline normally rides along on the last field; add it
                # back when it is missing so rows never run together.
                if not new_line.endswith('\n'):
                    new_line += '\n'

                for month, month_file in month_files:
                    if month in date:
                        month_file.write(new_line)
                        break

def separateGPSFilePerDay(file_path, input_name="GPS_data_october.csv"):
    """Split a GPS file into one ``GPS_data_<date>.csv`` file per day.

    The header line of the input is skipped. Every remaining line is grouped
    by the date part (text before the first space) of its second
    comma-separated field and written unchanged to the file for that date,
    below a fresh header line.

    Args:
        file_path: Directory prefix, including the trailing path separator,
            that holds the input file. The daily files are written to the
            same location.
        input_name: Name of the file to split. Defaults to the October file,
            which was previously the only file this function could handle.

    Raises:
        IndexError: If a data line has no second comma-separated field.
    """
    DELIMITER = ','
    columns_name = DELIMITER.join([
        "Unidad", "Instante", "Estado", "Comunica", "CoordX", "CoordY",
        "Linea", "Ruta", "Posicion", "Viaje", "Velocidad",
    ])

    date_lines_dict = {}
    with open(file_path + input_name, 'r') as gps_data:
        # Skip the header; the default keeps an empty file from raising.
        next(gps_data, None)
        for line in gps_data:
            date_time = line.split(DELIMITER)[1]
            date = date_time.split(' ')[0]
            date_lines_dict.setdefault(date, []).append(line)

    for date, day_lines in date_lines_dict.items():
        # `with` closes the daily file even when a write fails.
        with open(file_path + 'GPS_data_' + date + '.csv', 'w') as day_file:
            day_file.write(columns_name + '\n')
            day_file.writelines(day_lines)
            
# Recife GPS input folder, resolved relative to the current working directory.
file_path = os.getcwd() + "/../data/input/Recife/GPS/"

# Split by month first: the per-day step reads the October file produced here.
for split_gps_file in (separateGPSFilePerMonth, separateGPSFilePerDay):
    split_gps_file(file_path)

### 1. Clean the data
Removing missing/wrong data

In [23]:
# Waze
# Remove the lines that belong to a different city

def removeDifferentCity(dir_name, city_label):
    """Copy every ``.csv`` file in ``dir_name`` keeping only one city's rows.

    For each ``.csv`` file a filtered copy with the same name is written to
    ``dir_name + 'clean/'``. The first line (header) is always copied; a data
    line is kept only when ``city_label`` occurs in its second
    comma-separated field.

    Args:
        dir_name: Directory prefix, including the trailing path separator,
            that holds the input files.
        city_label: Text that the second field must contain for a line to
            be kept.
    """
    clean_dir = dir_name + 'clean/'
    # Create the output folder on demand instead of failing on the first open.
    if not os.path.isdir(clean_dir):
        os.makedirs(clean_dir)

    for file_name in os.listdir(dir_name):
        if not file_name.endswith(".csv"):  # to get just files
            continue

        # Both handles are closed by the context manager, even on errors.
        with open(dir_name + file_name, 'r') as waze_data, \
                open(clean_dir + file_name, 'w') as new_file:
            header = next(waze_data, None)
            if header is None:
                # Empty input: leave an empty output rather than raising.
                continue
            new_file.write(header)

            for line in waze_data:
                line_splitted = line.split(',')
                # Blank or short lines have no city field, so they are
                # dropped like any other non-matching line.
                if len(line_splitted) > 1 and city_label in line_splitted[1]:
                    new_file.write(line)
            
# Alert: make sure the 'clean' sub-folder exists inside the Waze folder,
# and adjust the path and the city name below before running.
city = 'Recife'
dir_path = os.getcwd() + "/../data/input/Recife/Waze/"
removeDifferentCity(dir_name=dir_path, city_label=city)

In [None]:
# TODO: check the GPS and weather data for cleaning