In [1]:
import os
import pandas as pd
import sys
import datetime
import math
import time
import seaborn as sns
import folium

In [2]:
def readFiles(dataPath):
    dataFrames = []
    
#     # Create one file by day of analysis
#     for dirName in os.listdir(dataPath):
#         allBusteData = []
#         for fileName in os.listdir(dataPath + dirName):
#             filePath = dataPath + dirName +  "/" + fileName
#             if ("part-" in fileName):
#                 allBusteData.append(pd.read_csv(filePath))

#         # Concatenate all data of one day into one DataFrame
#         dataByDay = pd.concat(allBusteData, ignore_index = True)
#         print("Day " + dataByDay['date'][0], dataByDay.head(5))
#         print("Size: ", dataByDay.shape)
#         dataFrames.append(dataByDay)

    dataFrames.append(pd.read_csv(dataPath + "test")) # TODO just for test

    return dataFrames

In [3]:
def processingData(dataFrames):

    dataFramesFiltered = []
    
    for dataByDay in dataFrames:
        
        # Filtering usable columns
        dataByDay = dataByDay.loc[:, ["route", "busCode", "timestamp", "stopPointId", "cardTimestamp", 
                                      "shapeLat", "shapeLon", "date"]]
        print("After filter columns: ", dataByDay.shape)

        # Removing data without gps point
        before = dataByDay.shape[0]
        dataByDay = dataByDay[dataByDay.timestamp != "-"]
        print("After remove rows without GPS: ", dataByDay.shape)
        after = dataByDay.shape[0]
        deleted_rows = before - after
        print("Deleted rows", deleted_rows)

        # Convert timestamp to date
        dataByDay['date'] = dataByDay['date'].replace("_", "-", regex = True)
        dataByDay['timestamp'] = pd.to_datetime(dataByDay.date.astype(str) + ' ' + dataByDay.timestamp.astype(str))
        
        # Fixing index by number of rows
        dataByDay = dataByDay.reset_index(drop = True)
        
        # Generate ID by index
        dataByDay["id"] = dataByDay.index
        
        print("Processed data: ", dataByDay.head())
        
        dataFramesFiltered.append(dataByDay)
        
    return dataFramesFiltered

In [4]:
# Output dataframe to add Bus Bunching occurrence
def createOutputDataframe(dataByDay):
    outputBusBunching = dataByDay
    outputBusBunching['busBunching'] = "" # create new column

    return outputBusBunching

In [5]:
def distanceBetween(origin, destination):
    """
    Calculate the Haversine distance.

    Parameters
    ----------
    origin : tuple of float
        (lat, long)
    destination : tuple of float
        (lat, long)

    Returns
    -------
    distance_in_m : float

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(distance(origin, destination), 1)
    504.2
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371  # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = radius * c

    return d * 1000 #meters

In [6]:
def isBusBunching(distance, threshold):
    """
    Detecting Bus Bunching.

    Parameters
    ----------
    distance : distance between two gps points
        float

    Returns
    -------
    is_bus_bunching : boolean

    Examples
    --------
    >>> distance = 98.08
    true
    """
    return distance < threshold

In [7]:
def generateOutputBusBunchingDetection(dataByDay, dataByDayOutput, route_buses_dic, intervalTimeThreshold, 
                                       distanceThreshold, outputPath):

    outputBusBunching = dataByDayOutput
    
    for row in dataByDay.itertuples():
        """
        Index=0, route=463, busCode='DC090', timestamp=Timestamp('2017-04-30 05:48:27'), stopPointId=31707, 
        cardTimestamp='05:49:19', shapeLat=-25.478024708433903, shapeLon=-49.20303259585818, date='2017-04-30', 
        id=1, busBunching='')
        """
        indexFirstBus = row[9]
        routeFirstBus = row[1]
        codeFirstBus = row[2]
        timestampFirstBus = row[3]
        latitudeFirstBus = row[6]
        longitudeFirstBus = row[7]

        closestTime = ""
        previousBus = ""

        # Compare buses from the same route
        buses_same_route = route_buses_dic[routeFirstBus]

        for rowOtherBus in buses_same_route:
            """
            [id, route, busCode, timestamp, "stopPointId", "cardTimestamp", "shapeLat", "shapeLon"]
            [1, 463, 'DC090', Timestamp('2017-04-30 05:48:27'), 31707, '05:49:19', 
            -25.478024708433903, -49.20303259585818]
            """
            indexOtherBus = rowOtherBus[0]
            routeOtherBus = rowOtherBus[1]
            codeOtherBus = rowOtherBus[2]
            timestampOtherBus = rowOtherBus[3]
            latitudeOtherBus = rowOtherBus[6]
            longitudeOtherBus = rowOtherBus[7]

            if (codeOtherBus != codeFirstBus): # avoiding to compare with the same bus
                difTime = abs(timestampOtherBus - timestampFirstBus)

                if (previousBus == codeOtherBus):

                    #find the nearest timestamps
                    if (difTime < closestTime):
                        closestTime = difTime
                        closestBusCode = codeOtherBus
                        closestLat = latitudeOtherBus
                        closestLon = longitudeOtherBus

                else:
                    #first, analyse the last bus

                    # we don't have the same buses hour to compare distance, so they should be at least near
                    if (closestTime != "" and (closestTime < datetime.timedelta(seconds = intervalTimeThreshold))):
                        distance = distanceBetween((latitudeFirstBus, longitudeFirstBus), (closestLat, closestLon))

                        if (isBusBunching(distance, distanceThreshold)):
                            outputBusBunching.loc[indexFirstBus,'busBunching'] = outputBusBunching.iloc[indexFirstBus]["busBunching"] + str(indexOtherBus) + "-"
                            outputBusBunching.loc[indexOtherBus,'busBunching'] = outputBusBunching.iloc[indexOtherBus]["busBunching"] + str(indexFirstBus) + "-"
    
                    # -----------------------------------------
                    # New bus to compare: reseting variables
                    previousBus = codeOtherBus
                    closestTime = difTime
                    closestBusCode = codeOtherBus
                    closestLat = latitudeOtherBus
                    closestLon = longitudeOtherBus
    
    print("Output Bus Bunching " + outputBusBunching['date'][0], outputBusBunching.loc[outputBusBunching['busBunching'] != ""].head())                
    saveToCSV(outputBusBunching, outputBusBunching["date"][0], outputPath)


In [8]:
# Saving csv file
def saveToCSV(outputByDay, date, outputPath):
    # to print bus bunching
    # outputBusBunching.loc[outputBusBunching['busBunching'] != ""]
    
    outputByDay.to_csv(outputPath + "outputBusBunching-" + date + ".csv", index_label = "index", encoding = 'utf-8')

In [9]:
def executeDetection(dataFramesFiltered, intervalTimeThreshold, distanceThreshold, outputPath):
    
    for dataByDay in dataFramesFiltered:
        outputByDay = createOutputDataframe(dataByDay)
        
        # Mapping route - buses
        route_buses_dic = dataByDay.groupby(['route']).apply(lambda f: f[["id", "route", "busCode", "timestamp", 
                                                                          "stopPointId", "cardTimestamp", "shapeLat", 
                                                                          "shapeLon"]].values.tolist()).to_dict()
        generateOutputBusBunchingDetection(dataByDay, outputByDay, route_buses_dic, intervalTimeThreshold, 
                                           distanceThreshold, outputPath)    

In [10]:
def main():
    start_time = time.time()
    
    MAX_INTERVAL_TIME_THRESHOLD = 300 # max interval time to compare buses
    DISTANCE_THRESHOLD = 100 # meters

    inputPath = os.getcwd() + "/data/buste/"
    outputPath = os.getcwd() + "/data/outputBusBunching/"
    
    dataFrames = readFiles(inputPath)
    dataFramesFiltered = processingData(dataFrames)
    
    executeDetection(dataFramesFiltered, MAX_INTERVAL_TIME_THRESHOLD, DISTANCE_THRESHOLD, outputPath)
    
    executionTime = int(time.time() - start_time) / 60
    print("--- %s minutes ---" % executionTime)

In [11]:
if __name__ == '__main__':
    main()

  exec(code_obj, self.user_global_ns, self.user_ns)


('After filter columns: ', (112729, 8))
('After remove rows without GPS: ', (104683, 8))
('Deleted rows', 8046)
('Processed data: ',    route busCode           timestamp  stopPointId cardTimestamp   shapeLat  \
0    463   DC090 2017-04-30 05:48:27        31707      05:49:19 -25.478025   
1    463   DC090 2017-04-30 05:49:55        31524             - -25.474027   
2    463   DC090 2017-04-30 05:51:18        31521             - -25.468650   
3    463   DC090 2017-04-30 05:52:09        31517             - -25.464090   
4    463   DC090 2017-04-30 05:53:01        31515             - -25.459453   

    shapeLon        date  id  
0 -49.203033  2017-04-30   0  
1 -49.209193  2017-04-30   1  
2 -49.213528  2017-04-30   2  
3 -49.217170  2017-04-30   3  
4 -49.220872  2017-04-30   4  )
('Output Bus Bunching 2017-04-30',       route busCode           timestamp  stopPointId cardTimestamp   shapeLat  \
3026    216   HA601 2017-04-30 09:16:47        29123             - -25.431592   
3099    216   

In [None]:
def plotMarkersOnMap(outputBusBunching):
    
    # create map route - color
    color = sns.choose_colorbrewer_palette("diverging", as_cmap = False)
    route_color_dict = dict(zip(outputBusBunching["route"], color))
    route_color_dict

    # filter rows with bus bunching
    busBunching = outputBusBunching.loc[outputBusBunching["busBunching"] != ""]
    #busBunching = outputBusBunching
    
    # Make an empty map
    m = folium.Map(location = [20, 0], tiles = "Mapbox Bright", zoom_start = 3)

    # I can add marker one by one on the map
    for i in range(0, len(busBunching)):
        route = busBunching.iloc[i]['route']
        popup_data = str(busBunching.iloc[i]['id']) + "\n" + str(busBunching.iloc[i]['busCode'])

        folium.CircleMarker([busBunching.iloc[i]['shapeLon'], busBunching.iloc[i]['shapeLat']], popup = popup_data, 
                            radius = 1000).add_to(m)
        
#         folium.CircleMarker([busBunching.iloc[i]['shapeLon'], busBunching.iloc[i]['shapeLat']], popup = popup_data, 
#                             fill_color = route_color_dict["route"], color = route_color_dict["route"], radius = 1000).add_to(m)

    # Save it as html
    # m.save('bus_bunching_visualization.html')

    print(m)

def validationOnMap():
    dayOfAnalysis = "2017-04-30.csv"
    filePath = os.getcwd() + "/data/outputBusBunching/outputBusBunching-" + dayOfAnalysis
    outputBusBunching = pd.read_csv(filePath)
    
    print(outputBusBunching.head())
    
    plotMarkersOnMap(outputBusBunching)

In [None]:
if __name__ == '__main__':
    validationOnMap()

In [15]:
outputBusBunching.loc[outputBusBunching["busBunching" != ""]].route.unique()

NameError: name 'outputBusBunching' is not defined