In [2]:
import pandas as pd
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.formula.api as sm
from sklearn import linear_model, preprocessing
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, mean_absolute_error
import time

In [35]:
import time
def timeit(method):
    '''A time decorator to time other functions.'''
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print ('%r  %2.2f ms' %(method.__name__, (te - ts) * 1000))
        return result
    return timed

### Weather Function
Basic function that reads in the weather csv, formats it and merges it with supplied data, returning the merged datframe

### I cleaned this function a little bit. -Wang

In [2]:
def weatherInfo(data):
    '''Add weather information (temp, precipitation) to the dataframe.'''
    
    weather = pd.read_csv('CleanedWeather2016_17C.csv',index_col=0)
    
    data.dayofservice = data.dayofservice.astype('datetime64[ns]')
    data['datetime'] = data.dayofservice + pd.to_timedelta(data.actual_arr,unit='s')
    data['weekday'] = data.dayofservice.dt.dayofweek
    data = data.sort_values('datetime')
    #data = data.reset_index()
    data[['tripid','progrnumber']] = data[['tripid','progrnumber']].astype('int')
    df = pd.merge_asof(data,weather,on='datetime',tolerance=pd.Timedelta('1h'))
    
    return df

### Train&Predict Class

Class that reads in a cleaned df a set of xCols and a target feature. Has methods for MSE, MAE and Variance

In [1]:
class LinearTrainPredict:
    
    def __init__ (self, df, xCol, y, name=""):
        self.df = df
        self.xCol = xCol
        self.y = y
        self.__split = int(self.df.shape[0]*0.7)
        self.__X = pd.get_dummies(self.df[self.Xcol],drop_first=True)
        self.__xTrain, self.__xTest = self.__X[:self.__split], self.__X[self.__split:]
        self.__yTrain, self.__yTest = self.df[self.y][:self.__split], self.df[self.y][self.__split:]
        self.__regr = linear_model.LinearRegression()
        if name == ""
            pass
        else:
            with open('obj/'+ name + '.pkl', 'wb') as f:
                pickle.dump(self.__regr, f, pickle.HIGHEST_PROTOCOL)
        self.__regr.fit(self.__xTrain, self.__yTrain)
        self.yPred = self.__regr.predict(self.__xTest)
        
    def mSqErr(self):
        return mean_squared_error(self.__yTest,self.yPred)
        
    def mAbsErr(self):
        return mean_absolute_error(self.__yTest,self.yPred)
    
    def varScore(self):
        return r2_score(self.__yTest,self.yPred)

## Cleaning Dfs

Some fucntions that clean dfs and adds features such as weekday and month integers

In [4]:
#def dateManipulations(df, dateCol, arrCol, m=False, d=False, weekendSplit=False):
def dateManipulations(df, dateCol='dayofservice', d=False, weekendSplit=False):
    #Suggest: add this default value here ↑
    
    
    # ? Duplicate with the weatherInfo function here...
    #df.dateCol = df.dateCol.astype('datetime64[ns]')
    #df['datetime'] = df.dateCol + pd.to_timedelta(df.arrCol, unit = 's')
    # ?
    #Suggest to call weatherInfo instead:
    df = weatherInfo(df)
    
    
    '''
    #We don't use month as a feature anymore, this block can be deleted.
    
    #if m == False:
    #Suggest change to:
    if not m:
        pass
    
    #Suggest change to:
    elif m:
    #elif m == True:
        df['month'] = df.dateCol.map(lambda x: x.month)
    else:
        return 'Error in month input, must be True or False!'
    '''
    
    #same as above.
    #if d == False:
    if not d:
        pass
    
    #elif d == True:
    elif d:
        df['weekday'] = df.dateCol.dt.dayofweek
    else:
        return 'Error in day input, must be True or False!'
    
    #suggest change to:
    if weekendSplit and d:
    #if weekendSplit == True && d == True:
        
        #df['m2f'] = np.where((df.weekday ==0) & (df.weekday ==1) & (df.weekday ==2) & (df.weekday ==3) & (df.weekday ==4),1,0)
        #Suggest change to:
        df['m2f'] = np.where(df.weekday <= 4, 1,0)
        df['sat'] = np.where((df.weekday == 5),1,0)
        
    #same above
    elif not weekendSplit:
    #elif weekendSplit == False:
        pass
    #elif weekendSplit == True && d == False:
    elif weekendSplit and not d:
        return 'Error, must have weekday column for weekend split'
    else:
        return 'Error in weekend split input, must be True or False!'
    return df

In [5]:
def addPeakHours(df, timeOfDayCol):
    df['em'] = np.where((df[timeOfDayCol] > 14400) & (df[timeOfDayCol] <=  25200),1,0)
    df['mp'] = np.where((df[timeOfDayCol] > 25200) & (df[timeOfDayCol] <=  36000),1,0)
    df['lm'] = np.where((df[timeOfDayCol] > 36000) & (df[timeOfDayCol] <=  46800),1,0)
    df['md'] = np.where((df[timeOfDayCol] > 46800) & (df[timeOfDayCol] <=  47600),1,0)
    df['ap'] = np.where((df[timeOfDayCol] > 47600) & (df[timeOfDayCol] <=  68400),1,0)
    df['ev'] = np.where((df[timeOfDayCol] > 68400) & (df[timeOfDayCol] <=  79200),1,0)
    df['ln'] = np.where((df[timeOfDayCol] > 79200) & (df[timeOfDayCol] <=  90000),1,0)
    return df

## Route Manipulation functions

Functions that uses stop_times.txt to generate features and a route Dictionary that will be used with model prediction

In [8]:
#Some of the work is once off, I suggest do it once not do it everytime we load in the txt.
import pandas as pd
stoptime = pd.read_csv('stop_times.txt')
#Only look at the rows for certain route.
stoptime['line'] = stoptime.trip_id.str.split('-').str[1]
stoptime = stoptime[stoptime.stop_id.str.contains(':') == False]
stoptime['stopid'] = stoptime.stop_id.str.slice(8,)
stoptime.stopid = stoptime['stopid'].astype('int64')
stoptime.to_csv('stop_timesC',index=False)

In [10]:
stoptime = pd.read_csv('stop_timesC')
#stoptime.dtypes
stoptime.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,line,stopid
0,3643.y102m.60-1-d12-1.1.O,15:40:00,15:40:00,8240DB000226,1,Sandymount,0,0,0.0,1,226
1,3643.y102m.60-1-d12-1.1.O,15:40:47,15:40:47,8240DB000228,2,Sandymount,0,0,261.136188,1,228
2,3643.y102m.60-1-d12-1.1.O,15:41:26,15:41:26,8240DB000229,3,Sandymount,0,0,484.925289,1,229
3,3643.y102m.60-1-d12-1.1.O,15:42:30,15:42:30,8240DB000227,4,Sandymount,0,0,836.995679,1,227
4,3643.y102m.60-1-d12-1.1.O,15:43:09,15:43:09,8240DB000230,5,Sandymount,0,0,1066.461783,1,230


In [11]:
stoptime['tripmark'] = stoptime.trip_id.str.split('.').str[3]
stoptime.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,line,stopid,tripmark
0,3643.y102m.60-1-d12-1.1.O,15:40:00,15:40:00,8240DB000226,1,Sandymount,0,0,0.0,1,226,1
1,3643.y102m.60-1-d12-1.1.O,15:40:47,15:40:47,8240DB000228,2,Sandymount,0,0,261.136188,1,228,1
2,3643.y102m.60-1-d12-1.1.O,15:41:26,15:41:26,8240DB000229,3,Sandymount,0,0,484.925289,1,229,1
3,3643.y102m.60-1-d12-1.1.O,15:42:30,15:42:30,8240DB000227,4,Sandymount,0,0,836.995679,1,227,1
4,3643.y102m.60-1-d12-1.1.O,15:43:09,15:43:09,8240DB000230,5,Sandymount,0,0,1066.461783,1,230,1


In [14]:
stoptime[['stop_sequence','stop_headsign','shape_dist_traveled','line','stopid','tripmark']].drop_duplicates().to_csv('stop_timeCC',index=False)

In [None]:
'''
I notice a situation that you cannot just use get unique to get all the unique route. This is because there are more than one
route for a bus line + direction. E.g.:
39aO has three combination, 
1. start from UCD(767), go to Ongar
2. start from UCD(767), go to Aston
3. start from Aston(328), go to Ongar.
Base on this kind of situation, I suggest we name route as 39a767Ongar (busline+startstopid+destinationStop).
This will guarantee we have all the routes and it would be clean and without dupilication.
'''

In [15]:
import pickle
def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        
def append_obj(obj,name):
    with open('obj/'+ name + '.pkl', 'ab') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        
def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [30]:
d = load_obj('stopDicts/39A_767_ Ongar')
d

{767: 0.0,
 768: 657.287066064875,
 769: 985.330520927881,
 770: 1429.24894221461,
 771: 1620.44696377974,
 772: 2127.48709498295,
 773: 3434.8578642479997,
 774: 3677.18821727873,
 775: 4001.74363072419,
 776: 4285.13185337461,
 777: 4493.83259606615,
 779: 4740.48732233394,
 780: 5009.04830237385,
 781: 5193.6587029476,
 782: 5357.09815442135,
 783: 5606.069766801032,
 784: 5776.637949866071,
 785: 6055.606335357479,
 786: 6545.589502681712,
 793: 7269.64182312836,
 7586: 7835.15763393275,
 7587: 8467.10980025078,
 7588: 8530.04627616102,
 328: 9243.322928124291,
 1443: 9826.568859010358,
 1444: 10107.2418629801,
 1445: 10417.1497368751,
 1647: 10939.5178555804,
 1648: 11272.5213698264,
 1649: 11469.3056185517,
 1911: 11995.4996735001,
 1913: 12337.7290265695,
 1914: 12592.8159362723,
 1805: 12806.0519246444,
 1806: 13196.947965981299,
 1660: 13713.2410577792,
 1661: 13997.7195279663,
 1662: 14322.374320031799,
 1664: 14788.0353517777,
 1665: 15079.2139210518,
 1666: 15297.9786348426

In [38]:
#Based on the discussion above, I changed this function into a function that run once and split all the routes into pickle file
# for later consideration:
#https://stackoverflow.com/questions/16497115/python-pickle-vs-sql-efficiency
#security issue? efficiency issue?...
@timeit
def getStops(file):
    
    '''get stops for a certain route from GTFS file.'''
    
    stoptime = pd.read_csv(file)
    '''
    #stoptime['line'] = stoptime.trip_id.str.split('-').str[1]
    #stoptime = stoptime[stoptime.stop_id.str.contains(':') == False]
    #stoptime['stopid'] = stoptime.stop_id.str.slice(8,)
    #stoptime.stopid = stoptime['stopid'].astype('int64')
    
    if outbound == True:
        routeDf = stoptime.loc[(stoptime.line == route) & (stoptime.trip_id.str.split('.').str[4]=='O')]
    elif outbound == False:
        routeDf = stoptime.loc[(stoptime.line == route) & (stoptime.trip_id.str.split('.').str[4]=='I')]
    else:
        return 'Error outbound must be True or False!'
    
    #Now there are many duplicate in the df, get unique.
    
    return routeDf[['stop_sequence','shape_dist_traveled', 'stopid']]
    
    header = ['stop_sequence','stop_headsign','shape_dist_traveled','line','stopid','tripmark']
    '''
    for line in stoptime.line.unique():
        for tm in stoptime.tripmark.unique():
            df = stoptime.loc[(stoptime.line == line) & (stoptime.tripmark == tm)]
            #print(df)
            if not df.empty and not df.loc[df.stop_sequence==1,'stopid'].empty:
                #IndexError: index 0 is out of bounds for axis 0 with size 0
                start = str(df.loc[df.stop_sequence==1,'stopid'].values[0]).strip()
                dest = str(df.loc[df.stop_sequence==1,'stop_headsign'].values[0]).strip()
                save_obj(df[['stopid','shape_dist_traveled']].set_index('stopid')['shape_dist_traveled'].to_dict(), \
                         'stopDicts/'+str(line)+'_'+start+'_'+ dest)

In [39]:
getStops('stop_timeCC')

'getStops'  355207.28 ms


In [7]:
def createStopDict(routeDf, name):
    stops = routeDf[['stop_sequence','stopid']]
    stops = stops.reset_index(drop = True)
    stops = stops.sort_values('stop_sequence')
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(stops.stopid.tolist(), f, pickle.HIGHEST_PROTOCOL)
    return

In [3]:
def checkstopSeq(routeDf):
    if routeDf.stopSeq.iloc[0] != 1:
        print('Starting stop is not 1')
        return False
    else:
        for i in routeDf.stopSeq:
            if routeDf.stopSeq.iloc[i+1] - routeDf.stopSeq.iloc[i] == 1:
                pass
                if routeDf.stopSeq.iloc[i+1] == routeDf.stopSeq.iloc[-1]:
                    break
            else:
                return False
            if routeDf.stopSeq.is_monotonic_increasing == True:
                return True
            elif routeDf.stopSeq.is_monotonic_increasing == False:
                return False
            else:
                return 'Error in monotonic method'