In [5]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
import seaborn as sns
from time import process_time
import dask.dataframe as dd
try:
    from uszipcode import SearchEngine
except:
    !pip install uszipcode
import json
import folium
from folium.plugins import HeatMapWithTime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import os.path
from os import path


import warnings
warnings.filterwarnings('ignore')

In [14]:
def parse_timestamp(df):
    """Derive time features from the raw trip timestamps and drop implausible trips.

    The function works on a copy, so the frame passed in by the caller is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Trip Start Timestamp' and
        'Trip End Timestamp' (anything ``pd.to_datetime`` can parse) and
        'Trip Seconds'.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. Only rows whose 'Trip Seconds'
        is below 7200 and not equal to 0 are kept (rows with a missing
        'Trip Seconds' fail the ``<`` comparison and are dropped too).
        The columns 'Start Time', 'End Time', 'Year', 'PU_Hour', 'DO_Hour',
        'Month' (full month name), 'Day' (full weekday name) and 'Trip_Time'
        ('Trip Seconds' / 60) are appended, in that order.
    """
    # Copy first: assigning new columns directly would otherwise modify the
    # caller's DataFrame as a hidden side effect.
    df = df.copy()

    df['Start Time'] = pd.to_datetime(df['Trip Start Timestamp'])
    df['End Time'] = pd.to_datetime(df['Trip End Timestamp'])
    df['Year'] = df['Start Time'].dt.year
    df['PU_Hour'] = df['Start Time'].dt.hour
    df['DO_Hour'] = df['End Time'].dt.hour

    # Remove trips lasting 2 hours or longer and trips with a zero duration.
    trip_seconds = df['Trip Seconds']
    keep = (trip_seconds < 7200) & (trip_seconds != 0)
    # Explicit copy so the assignments below write to an independent frame
    # instead of a slice of the unfiltered one (avoids SettingWithCopyWarning).
    df = df.loc[keep].copy()

    df['Month'] = df['Start Time'].dt.strftime('%B')
    df['Day'] = df['Start Time'].dt.strftime('%A')
    df['Trip_Time'] = df['Trip Seconds']/60
    df = df.reset_index(drop=True)
    return df

def filter_df(df):
    """Drop trips with a zero total or zero distance, and very short but costly trips.

    A row is removed when any of the following holds:

    * 'Trip Total' equals 0
    * 'Trip Miles' equals 0
    * 'Trip Miles' is below 0.25 while 'Trip Total' is above 5

    Rows with missing values in these columns satisfy none of the removal
    conditions and are therefore kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Trip Total' and 'Trip Miles'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows with a fresh 0..n-1 index;
        the input frame is not modified.
    """
    total = df['Trip Total']
    miles = df['Trip Miles']
    short_but_costly = (miles < 0.25) & (total > 5)

    # Select with a row-wise boolean mask rather than drop(<index labels>):
    # dropping by label also removes unrelated rows that happen to share a
    # duplicated index label with a flagged row.
    keep = (total != 0) & (miles != 0) & ~short_but_costly
    return df.loc[keep].reset_index(drop=True)

def add_CT_freq(df):
    """Attach per-census-tract occurrence counts to every row.

    For each of 'Pickup Census Tract' and 'Dropoff Census Tract', the number
    of rows in ``df`` sharing that tract value is computed with
    ``value_counts`` and left-joined back onto the frame as
    'Pickup_CT_Freq' and 'Dropoff_CT_Freq' respectively.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Pickup Census Tract' and 'Dropoff Census Tract'.

    Returns
    -------
    pandas.DataFrame
        The merged frame: all original columns followed by the two frequency
        columns.
    """
    tract_to_freq = (
        ('Pickup Census Tract', 'Pickup_CT_Freq'),
        ('Dropoff Census Tract', 'Dropoff_CT_Freq'),
    )

    # Build both lookup tables from the unmerged frame before joining anything.
    lookups = []
    for tract_col, freq_col in tract_to_freq:
        counts = df[tract_col].value_counts()
        # Name the index after the tract column so reset_index yields the join key.
        lookup = counts.rename_axis(tract_col).reset_index(name=freq_col)
        lookups.append((tract_col, lookup))

    merged = df
    for tract_col, lookup in lookups:
        merged = merged.merge(lookup, on=[tract_col], how='left')
    return merged

In [15]:
def haversine_np(lon1, lat1, lon2, lat2):
    """Great-circle distance between two points via the haversine formula.

    All four arguments are angles in degrees and may be scalars or anything
    numpy can operate on element-wise (arrays, pandas Series). Note the
    argument order: longitude first, then latitude, for each point.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : longitude and latitude of the second point(s), in degrees.

    Returns
    -------
    The central angle between the points multiplied by 3958.8 (the Earth's
    mean radius expressed in miles), i.e. the distance in miles.
    """
    lon1_rad, lat1_rad, lon2_rad, lat2_rad = (
        np.radians(angle) for angle in (lon1, lat1, lon2, lat2)
    )

    half_dlat = (lat2_rad - lat1_rad) / 2.0
    half_dlon = (lon2_rad - lon1_rad) / 2.0

    # Haversine of the central angle.
    hav = np.sin(half_dlat)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon)**2

    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return 3958.8 * central_angle

In [16]:

# The complete dataset is stored on my local hard drive.
# It can be downloaded from https://data.cityofchicago.org/Transportation/Transportation-Network-Providers-Trips/m6dm-c72p


def chunk_preprocessing(chunk, center_lat=41.8781, center_long=-87.6298):
    """Clean one raw chunk of the trips CSV and add engineered features.

    Steps, in order:

    1. drop the identifier / point-geometry columns,
    2. derive time features and drop implausible durations (``parse_timestamp``),
    3. shorten the centroid column names to PU_lat / PU_long / DO_lat / DO_long,
    4. drop zero-valued and short-but-costly trips (``filter_df``),
    5. drop the fare-breakdown and raw timestamp columns,
    6. drop rows with missing distance, total, latitude, census tract or
       community area values,
    7. add 'PU_Dist_CC' / 'DO_Dist_CC': haversine distance from the pickup /
       dropoff centroid to the reference point,
    8. cast the census tract columns to float and reset the index.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Raw rows as read from the trips CSV.
    center_lat, center_long : float, optional
        Latitude / longitude (degrees) of the reference point the distances
        are measured from. The defaults are the coordinates the notebook uses
        for the city center (downtown).

    Returns
    -------
    pandas.DataFrame
        The cleaned chunk with a fresh 0..n-1 index.

    Notes
    -----
    ``haversine_np`` expects (lon1, lat1, lon2, lat2). Earlier revisions of
    this function passed latitude and longitude in swapped order, which
    produced incorrect distances; any cached output generated that way must
    be regenerated to pick up the corrected values.
    """
    chunk = chunk.drop(columns=['Trip ID', 'Pickup Centroid Location', 'Dropoff Centroid Location'])
    chunk = parse_timestamp(chunk)

    chunk = chunk.rename(columns={"Pickup Centroid Latitude": "PU_lat", "Pickup Centroid Longitude": "PU_long",
                                  "Dropoff Centroid Latitude": "DO_lat", "Dropoff Centroid Longitude": "DO_long"})

    chunk = filter_df(chunk)
    chunk = chunk.drop(columns=['Tip', 'Fare', 'Additional Charges', 'Trip Seconds',
                                'Start Time', 'End Time', 'Trip Start Timestamp', 'Trip End Timestamp'])

    chunk = chunk.dropna(subset=['Trip Miles', 'Trip Total', 'PU_lat', 'DO_lat',
                                 'Pickup Census Tract', 'Dropoff Census Tract',
                                 'Pickup Community Area', 'Dropoff Community Area'])

    # Distance from the reference point (city center by default).
    # Argument order is longitude first, then latitude, for both points.
    chunk['PU_Dist_CC'] = haversine_np(center_long, center_lat, chunk['PU_long'], chunk['PU_lat'])
    chunk['DO_Dist_CC'] = haversine_np(center_long, center_lat, chunk['DO_long'], chunk['DO_lat'])
    chunk['Pickup Census Tract'] = chunk['Pickup Census Tract'].astype(float)
    chunk['Dropoff Census Tract'] = chunk['Dropoff Census Tract'].astype(float)

    # Census-tract frequency columns are intentionally not added here;
    # the frame is returned with a clean positional index.
    chunk = chunk.reset_index(drop=True)
    return chunk

In [17]:
# Sample the big CSV: read `chunk_size` rows out of every `skip_size` rows,
# preprocess each sample, cache it as a pickle and combine everything into `trips`.
chunk_size, skip_size = 250000 , 1000000
n_chunks = 101
dpath = 'c:/Machine Learning/Pickled Data/'
combined_path = "./Pickled Data/trips_25percent.pkl"

# Create the output directories once, up front (not on every iteration).
os.makedirs(dpath, exist_ok=True)
os.makedirs(os.path.dirname(combined_path), exist_ok=True)

# Collect the per-chunk frames in a list and concatenate once at the end:
# growing a DataFrame inside the loop copies all accumulated rows on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
parts = []
tic = process_time()
for n in range(n_chunks):

    fname = dpath + 'trips_part_' + str(n) + '.pkl'
    if os.path.exists(fname):
        # Reuse the cached, already preprocessed chunk.
        chunk_filter = pd.read_pickle(fname)
    else:
        # skiprows skips file lines 1 .. (n+1)*skip_size - 1 (line 0 is the header),
        # so chunk n starts at data row (n+1)*skip_size.
        # NOTE(review): this means the first `skip_size` rows are never sampled - confirm intended.
        chunk = pd.read_csv('c:/Machine Learning/TNP_Trips.csv',skiprows=np.arange(1, (n+1)*skip_size),
                            nrows=chunk_size, low_memory=False)

        chunk_filter = chunk_preprocessing(chunk)
        # Only write the cache file when the chunk was freshly computed.
        chunk_filter.to_pickle(fname)

    parts.append(chunk_filter)
    if n%10==0:
        toc = process_time()

        print("Processed chunk #",n, 'of',100, 'Time taken = ', int(toc-tic), 'seconds' )

# ignore_index=True already yields a clean 0..n-1 index, so no reset_index is needed.
trips = pd.concat(parts, sort=False, ignore_index=True)
del parts  # release the per-chunk references
trips.to_pickle(combined_path)

trips['Month'].value_counts()

Processed chunk # 0 of 100 Time taken =  63 seconds
Processed chunk # 10 of 100 Time taken =  784 seconds
Processed chunk # 20 of 100 Time taken =  1603 seconds
Processed chunk # 30 of 100 Time taken =  2567 seconds
Processed chunk # 40 of 100 Time taken =  3638 seconds
Processed chunk # 50 of 100 Time taken =  4789 seconds
Processed chunk # 60 of 100 Time taken =  6051 seconds
Processed chunk # 70 of 100 Time taken =  7427 seconds
Processed chunk # 80 of 100 Time taken =  8913 seconds
Processed chunk # 90 of 100 Time taken =  10503 seconds
Processed chunk # 100 of 100 Time taken =  12140 seconds


March        1757769
May          1600219
November     1532804
August       1521295
April        1520967
February     1509063
December     1504954
June         1495256
July         1470235
January      1468379
September    1429114
Name: Month, dtype: int64

In [18]:
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16810055 entries, 0 to 16810054
Data columns (total 20 columns):
Trip Miles                float64
Pickup Census Tract       float64
Dropoff Census Tract      float64
Pickup Community Area     float64
Dropoff Community Area    float64
Trip Total                float64
Shared Trip Authorized    object
Trips Pooled              object
PU_lat                    float64
PU_long                   float64
DO_lat                    float64
DO_long                   float64
Year                      int64
PU_Hour                   int64
DO_Hour                   int64
Month                     object
Day                       object
Trip_Time                 float64
PU_Dist_CC                float64
DO_Dist_CC                float64
dtypes: float64(13), int64(3), object(4)
memory usage: 2.5+ GB


In [87]:
trips['Month'].value_counts()

March        3414576
May          3130483
April        2941499
February     2915330
June         2871229
January      2802250
November     2632653
December     2601766
August        944754
July          908879
September     887702
Name: Month, dtype: int64