In [1]:
import numpy as np
import pandas as pd
import pickle
import os
from scipy.signal import savgol_filter

In [2]:
# Path of the input CSV, relative to this notebook.
filename = '../../data/input/new_feats_data.csv'

# The first row supplies the column names; no column is used as the index.
input_data = pd.read_csv(filename, header=0, index_col=None)
input_data.shape

(1643336, 60)

In [3]:
# Normalise the dtypes of the categorical and flag columns.

# Columns treated as categorical: cast to str.
categorical_cols = ['shapeId', 'routeFrequency', 'problem',
                    'alertSubtype', 'alertType', 'jamBlockType']
for col in categorical_cols:
    input_data[col] = input_data[col].astype(str)
# No "nan" -> "-" replacement is applied to alertSubtype, alertType or jamBlockType.

# Flag columns: 1 where the value equals True, 0 for anything else.
flag_cols = ['alertIsJamUnifiedAlert', 'alertInScale', 'busBunching']
for col in flag_cols:
    input_data[col] = input_data[col].eq(True).astype(int)

print(input_data.dtypes)

route                                 int64
tripNum                               int64
shapeId                              object
routeFrequency                       object
shapeSequence                         int64
shapeLat                            float64
shapeLon                            float64
distanceTraveledShape               float64
busCode                               int64
gpsPointId                          float64
gpsLat                              float64
gpsLon                              float64
distanceToShapePoint                float64
stopPointId                           int64
problem                              object
precipitation                       float64
alertSubtype                         object
alertType                            object
alertRoadType                       float64
alertConfidence                     float64
alertNComments                      float64
alertNImages                        float64
alertNThumbsUp                  

In [6]:
# Per-column flag: True when the column holds at least one missing value.
# NOTE(review): this cell's execution count (6) is higher than that of the
# null-replacement cell placed after it (5), so the recorded output presumably
# reflects the data after that replacement; a top-to-bottom run would evaluate
# this check before it.
input_data.isna().any()

route                               False
tripNum                             False
shapeId                             False
routeFrequency                      False
shapeSequence                       False
shapeLat                            False
shapeLon                            False
distanceTraveledShape               False
busCode                             False
gpsPointId                          False
gpsLat                              False
gpsLon                              False
distanceToShapePoint                False
stopPointId                         False
problem                             False
precipitation                       False
alertSubtype                        False
alertType                           False
alertRoadType                       False
alertConfidence                     False
alertNComments                      False
alertNImages                        False
alertNThumbsUp                      False
alertReliability                  

In [5]:
# Fill in missing values.

# GPS-derived columns fall back on the shape data: a missing GPS point id and a
# missing distance to the shape point become 0, and missing GPS coordinates take
# the shape coordinates of the same row.
input_data['gpsPointId'] = input_data['gpsPointId'].fillna(0)
input_data['gpsLat'] = input_data['gpsLat'].fillna(input_data['shapeLat'])
input_data['gpsLon'] = input_data['gpsLon'].fillna(input_data['shapeLon'])
input_data['distanceToShapePoint'] = input_data['distanceToShapePoint'].fillna(0)

# Every column whose dtype is neither object, datetime64[ns] nor bool counts as numeric.
col_dtypes = input_data.dtypes
not_object = col_dtypes != "object"
not_datetime = col_dtypes != "datetime64[ns]"
not_bool = col_dtypes != "bool"
numeric_feats = col_dtypes[not_object & not_datetime & not_bool].index


def _fill_with_median(values):
    """Return `values` with its missing entries replaced by its own median."""
    return values.fillna(values.median())


# Within each WEEKDAY(gps_datetime) group, missing numeric values are replaced
# by the median of that group. The groupby object is intentionally not bound to
# a name, so no extra reference to the frame outlives this statement.
input_data[numeric_feats] = (
    input_data
    .groupby(['WEEKDAY(gps_datetime)'])[numeric_feats]
    .transform(_fill_with_median)
)

input_data.head(5)

Unnamed: 0,route,tripNum,shapeId,routeFrequency,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,...,YEAR(alertDateTime),YEAR(jamUpdateDateTime),MONTH(gps_datetime),MONTH(precipitationTime),MONTH(alertDateTime),MONTH(jamUpdateDateTime),WEEKDAY(gps_datetime),WEEKDAY(precipitationTime),WEEKDAY(alertDateTime),WEEKDAY(jamUpdateDateTime)
0,68,1,Route_160,low_frequency,12573,-8.036592,-34.975683,785.0,12021,27.0,...,2018.0,2018.0,12,12,12.0,12.0,0,0,0.0,0.0
1,68,1,Route_160,low_frequency,12581,-8.03415,-34.960756,2794.0,12021,33.0,...,2018.0,2018.0,12,12,12.0,12.0,0,0,0.0,0.0
2,68,1,Route_160,low_frequency,12582,-8.032878,-34.959775,2971.0,12021,0.0,...,2018.0,2018.0,12,12,12.0,12.0,0,0,0.0,0.0
3,68,1,Route_160,low_frequency,12583,-8.031478,-34.958622,3171.0,12021,35.0,...,2018.0,2018.0,12,12,12.0,12.0,0,0,0.0,0.0
4,68,1,Route_160,low_frequency,12589,-8.03675,-34.946086,4838.0,12021,43.0,...,2018.0,2018.0,12,12,12.0,12.0,0,0,0.0,0.0


In [7]:
# One-hot encode the categorical columns (get_dummies converts the columns with
# object, string or category dtype when no column list is given).
# The default indicator dtype of get_dummies changed from uint8 to bool in
# pandas 2.0. Pinning uint8 keeps the indicators as 0/1, matching the recorded
# preview, regardless of the installed pandas version; on older versions this
# is exactly the previous default.
input_data = pd.get_dummies(input_data, dtype=np.uint8)
input_data.head()

Unnamed: 0,route,tripNum,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,...,alertType_-,alertType_ACCIDENT,alertType_CHIT_CHAT,alertType_HAZARD,alertType_JAM,alertType_POLICE,alertType_ROAD_CLOSED,jamBlockType_-,jamBlockType_ROAD_CLOSED_CONSTRUCTION,jamBlockType_ROAD_CLOSED_EVENT
0,68,1,12573,-8.036592,-34.975683,785.0,12021,27.0,-8.036673,-34.97555,...,1,0,0,0,0,0,0,1,0,0
1,68,1,12581,-8.03415,-34.960756,2794.0,12021,33.0,-8.033869,-34.960571,...,0,0,0,0,1,0,0,1,0,0
2,68,1,12582,-8.032878,-34.959775,2971.0,12021,0.0,-8.032878,-34.959775,...,0,0,0,0,1,0,0,1,0,0
3,68,1,12583,-8.031478,-34.958622,3171.0,12021,35.0,-8.031753,-34.958864,...,0,0,0,0,1,0,0,1,0,0
4,68,1,12589,-8.03675,-34.946086,4838.0,12021,43.0,-8.035829,-34.947773,...,0,0,0,0,1,0,0,1,0,0


In [8]:
# Persist the dummy-encoded frame; it is the input data for the models.
output_path = '../../data/output/integrated_data_done.csv'

# index=False: the row index is not written to the file.
input_data.to_csv(path_or_buf=output_path, index=False)