# Indego Bike Duration Prediction Using Pipeline

Very first step (optional): the raw files did not follow a consistent naming convention, so rename them to a common convention and collect them in a single folder for easier access. In our case, the 'data' folder contains all the files.

In [1]:
'''
Importing important packages required for operation..
'''
import matplotlib.pyplot as plt
import glob
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,Normalizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

Load all data files from a given folder path (local, or a path on a remote machine). The glob function is used to list the files in the directory; for simplicity, all files are assumed to be in .csv format.

In [2]:
def data_load(path):
    '''
    Load every .csv file found directly inside a directory into one data frame.

    @param :
           path : directory (local or mounted remote folder) that holds the .csv files.

    @return :
           df - data frame made of all the files concatenated row wise ( axis = 0 ).
                The row index of each file is kept as it was read.

    @raise :
           FileNotFoundError - when the directory contains no .csv file.

    '''
    import os

    # os.path.join picks the right separator for the platform (the previous
    # hard-coded backslash only matched on Windows); sorting makes the row
    # order independent of the order in which the file system lists the files.
    files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not files:
        raise FileNotFoundError("no .csv files found in %r" % (path,))

    # Read everything first and concatenate once instead of growing the
    # frame file by file.
    return pd.concat([pd.read_csv(file) for file in files], axis=0)

#load data....
df = data_load('data')

In [4]:
# Persist the combined frame as a single file. to_csv is called with its
# default settings, so the frame's row index is written as the first column.
df.to_csv('all_data.csv')

In [3]:
class Indego(BaseEstimator, TransformerMixin):
    '''
    Cleaning and feature-engineering transformer for the raw Indego trip data.

    The transformer is stateless: fit() learns nothing and transform() applies
    a fixed sequence of cleaning steps to whatever frame it is given.

    transform() reads the columns trip_id, start_station, start_station_id,
    end_station, end_station_id, bike_id, start_lat, start_lon, end_lat,
    end_lon, bike_type, passholder_type, start_time and end_time. Any other
    column is passed through untouched.
    '''

    # Raw columns removed at the very end of transform().
    _DROP_COLUMNS = ['bike_type', 'passholder_type', 'start_time', 'end_time',
                     'end_lat', 'end_lon', 'start_lat', 'start_lon']

    # Earth radius (kilometres) used by the haversine formula.
    _EARTH_RADIUS_KM = 6371.01

    def fit( self, df, y = None ):
        '''
        Nothing is learned from the data; present only to satisfy the
        scikit-learn transformer protocol.

        @return : self
        '''
        print("in indego fit")
        return self

    @staticmethod
    def _time_convert(df, time_col):
        '''Return a copy of df with time_col parsed as datetime (currently not used by transform).'''
        return df.assign(**{time_col: pd.to_datetime(df[time_col])})

    @staticmethod
    def _punctuation_cleaning(df, punc_col):
        '''Return df without the rows whose punc_col value contains an ASCII letter.'''
        try:
            has_letters = df[punc_col].str.contains('[A-Za-z]', na=False)
        except AttributeError:
            # .str only exists on text-like columns; a purely numeric column
            # cannot contain letters, so there is nothing to remove.
            return df
        return df.loc[~has_letters]

    @staticmethod
    def _remove_null(df, col_name):
        '''Return df without the rows where col_name is null.'''
        return df.loc[df[col_name].notna()]

    @staticmethod
    def _convert_data_type(df, col_list, data_type):
        '''Return a copy of df with every column of col_list cast to data_type.'''
        return df.astype({col: data_type for col in col_list})

    @staticmethod
    def _station_cleaning(df, station_col_list, final_station_col):
        '''
        Merge a (station, station_id) column pair into final_station_col.

        Rows where both columns are null are removed. The remaining nulls are
        replaced by '' and the two columns are concatenated as strings, after
        which the two source columns are dropped.
        NOTE(review): this assumes at most one of the two columns is filled per
        row; when both are filled the two texts are simply glued together.
        '''
        name_col, id_col = station_col_list[0], station_col_list[1]
        has_station = df[name_col].notna() | df[id_col].notna()
        df = df.loc[has_station].copy()
        parts = df[station_col_list].fillna('').astype(str)
        df[final_station_col] = parts[name_col] + parts[id_col]
        return df.drop(columns=station_col_list)

    @staticmethod
    def _lat_lon_cleaning(df, lat_lon_col):
        '''Return df without the rows where either of the two given columns is null.'''
        return df.dropna(subset=[lat_lon_col[0], lat_lon_col[1]])

    @staticmethod
    def _remove_lat_lon_outlier(df, lat_lon_list):
        '''
        Drop the rows whose (lat, lon) pair is exactly (0, 0), then drop the
        rows whose latitude value contains letters.
        '''
        lat_col, lon_col = lat_lon_list[0], lat_lon_list[1]
        keep = (df[lat_col] != 0) | (df[lon_col] != 0)
        return Indego._punctuation_cleaning(df.loc[keep], lat_col)

    @staticmethod
    def _change_lang_lat_value(df):
        '''Return a copy of df in which start_lat and end_lat are replaced by their absolute value.'''
        return df.assign(start_lat=df['start_lat'].abs(),
                         end_lat=df['end_lat'].abs())

    @staticmethod
    def _calculate_distance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):
        '''
        Haversine (great-circle) distance between two points given in degrees.

        Works element wise on scalars, numpy arrays and pandas Series.
        '''
        from_lat = np.radians(pickup_latitude)
        from_long = np.radians(pickup_longitude)
        to_lat = np.radians(dropoff_latitude)
        to_long = np.radians(dropoff_longitude)

        # The latitudes are already in radians here; converting them a second
        # time inside the cosines (as was done before) distorts the distance.
        a = (np.sin((to_lat - from_lat) / 2) ** 2
             + np.cos(from_lat) * np.cos(to_lat) * np.sin((to_long - from_long) / 2) ** 2)
        # Guard against rounding pushing `a` marginally outside [0, 1].
        a = np.clip(a, 0, 1)
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        return Indego._EARTH_RADIUS_KM * c

    @staticmethod
    def _add_new_date_time_features(dataset):
        '''
        Return a copy of dataset with hour / day / month / year / day_of_week
        derived from a datetime start_time column (currently not used by transform).
        '''
        start = dataset['start_time'].dt
        return dataset.assign(hour=start.hour, day=start.day, month=start.month,
                              year=start.year, day_of_week=start.dayofweek)

    def transform(self, df, y = None):
        '''
        Clean the raw trip frame and derive the 'distance' feature.

        @param :
               df : raw trip data frame (see the class description for the columns read).
               y  : ignored.

        @return :
               a new data frame indexed by trip_id; the frame passed in is left unmodified.
        '''
        print("in transformer method")
        # set_index returns a new frame, so none of the steps below touches the
        # caller's data and the same raw frame can be transformed again later.
        df = df.set_index('trip_id')
        df = self._station_cleaning(df, ['start_station', 'start_station_id'], 'start_station_complete')
        df = self._station_cleaning(df, ['end_station', 'end_station_id'], 'end_station_complete')
        df = self._punctuation_cleaning(df, 'bike_id')
        df = self._remove_null(df, 'bike_id')
        df = self._lat_lon_cleaning(df, ['end_lat', 'start_lat'])
        df = self._remove_lat_lon_outlier(df, ['end_lat', 'end_lon'])
        df = self._remove_lat_lon_outlier(df, ['start_lat', 'start_lon'])
        df = self._convert_data_type(df, ['start_lat', 'start_lon', 'end_lat', 'end_lon'], float)
        df = self._change_lang_lat_value(df)
        df['distance'] = self._calculate_distance(df.start_lat, df.start_lon, df.end_lat, df.end_lon)
        # Date/time features are currently disabled; to enable them call
        # _time_convert and _add_new_date_time_features here, before
        # start_time is dropped below.
        df = self._convert_data_type(df, ['start_station_complete', 'end_station_complete', 'bike_id'], float)
        return df.drop(columns=self._DROP_COLUMNS)

In [4]:
class custom_label_encoder(BaseEstimator,TransformerMixin):
    '''
    Replace the text column 'trip_route_category' by the integer column 'trip_route'.

    The category -> integer mapping is learned once in fit() and reused by
    transform(), so the same category always receives the same code no matter
    which frame is being transformed.
    '''

    def fit( self, df, y = None ):
        '''
        Learn the category -> integer mapping from df['trip_route_category'].

        @return : self
        '''
        print("in label encoder fit.....")
        self.label_ = LabelEncoder().fit(df['trip_route_category'])
        return self

    def transform(self, df, y = None):
        '''
        @param :
               df : data frame that contains a 'trip_route_category' column.
               y  : ignored.

        @return :
               a new data frame in which 'trip_route_category' is replaced by the
               encoded 'trip_route' column; the frame passed in is left unmodified.
        '''
        print("in label encoder transformer")
        encoder = getattr(self, 'label_', None)
        if encoder is None:
            # Backward compatibility: transform() used to work without a prior
            # fit(), so fall back to learning the mapping from this frame.
            encoder = LabelEncoder().fit(df['trip_route_category'])
        encoded = encoder.transform(df['trip_route_category'])
        return df.drop(columns='trip_route_category').assign(trip_route=encoded)

In [None]:
# End-to-end pipeline: cleaning -> label encoding -> row normalisation -> regressor.
# The last step is an estimator (DecisionTreeRegressor), not a transformer.
# obj = Indego().fit(df).transform(df)
pre_pipeline = Pipeline(steps=[('preprocess',Indego()),
                               ('label_encoder',custom_label_encoder()),
                               ('Normalizer',Normalizer()),
                               ('decision tree', DecisionTreeRegressor())
                              ])

In [None]:
# NOTE(review): the final step of pre_pipeline is a DecisionTreeRegressor, which
# has no transform(), and no target is passed here - this call does not look
# like it can succeed as written; confirm what was intended.
temp = pre_pipeline.fit_transform(df)

In [5]:
def model_build():
    '''
    Clean the module-level `df`, split it and fit the regression pipeline.

    The target is 'duration' divided by 60 and then passed through log1p, so
    predictions have to be mapped back with np.expm1.

    @return :
           model - fitted Pipeline ( Normalizer -> DecisionTreeRegressor ).
    '''
    cleaning_steps = [('preprocess',Indego()),('label',custom_label_encoder())]
    dataset = Pipeline(steps=cleaning_steps).fit_transform(df)

    features = dataset.drop(columns=['duration'])
    target = dataset['duration']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.20, random_state=42)

    # Same target transformation for both splits: scale by 1/60, then log1p.
    y_train = np.log1p(y_train / 60)
    y_test = np.log1p(y_test / 60)

    regression_steps = [('Normalizer',Normalizer()),
                        ('decision tree', DecisionTreeRegressor())]
    model = Pipeline(steps=regression_steps)
    model.fit(X_train,y_train)

    # X_test / y_test are held out for evaluation, e.g.
    #     np.sqrt(mean_squared_error(np.expm1(y_test),np.expm1(model.predict(X_test))))
    return model

In [6]:
model = model_build()

in indego fit
in transformer method
in label encoder fit.....
in label encoder transformer


In [8]:
import pickle
filename = 'finalized_model.pkl'
# A context manager guarantees the file is flushed and closed once the model
# has been written (the bare open() call never closed its handle).
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [9]:
# NOTE: only unpickle files you trust - loading a pickle can run arbitrary code.
# The context manager closes the file handle as soon as the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Same preparation as inside model_build(), run at module level so the
# intermediate frames can be inspected: clean + encode, separate the
# 'duration' target from the features, then hold out 20% of the rows.
preprocess_pipeline = Pipeline(steps=[('preprocess',Indego()),('label',custom_label_encoder())])
dataset = preprocess_pipeline.fit_transform(df)
X = dataset.drop(columns=['duration'])
y = dataset['duration']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [None]:
X_train.shape

In [None]:
# Normalizer.transform(X, copy=None) returns only the transformed feature
# matrix; its second positional parameter is `copy`, not the target, so the
# target must not be passed in and simply stays as it is.
a = Normalizer().fit(X_train).transform(X_train)
b = y_train

In [None]:
a.shape

In [None]:
def time_convert(df,time_col):
    '''
    Convert a string column into datetime format so its date/time attributes can be used later.

    The column is replaced in place in the frame that is passed in.

    @param:
        df : dataframe which contains the column to convert.
        time_col : name of the column which contains the datetime values.

    @return:
        df : the same dataframe, with time_col converted to datetime.

    '''
    # infer_datetime_format is deprecated in pandas 2.x (format inference is
    # the default behaviour there), so it is no longer passed.
    df[time_col] = pd.to_datetime(df[time_col])
    return df

df = time_convert(df,'start_time')

In [None]:
# set trip id as index which is unique as well as complete.
df.set_index('trip_id',inplace=True)

In [None]:
df_copy = df.copy()
df.shape
df.info()
df.isnull().sum()

In [None]:
# Parse start_time as datetime. infer_datetime_format is deprecated in pandas
# 2.x (format inference is the default there), so it is no longer passed.
df['start_time'] = pd.to_datetime(df['start_time'])

In [None]:
#sorting based on start time so we can check time wise trend and changes..
df.sort_values('start_time',inplace=True)

In [None]:
# drop bike type as almost more than 50% record are null and other 50% have biased towards specific value.. 
df.drop(columns=['bike_type'],inplace=True)

In [None]:
def drop_column(df, drop_col_name):
    '''
    Drop one or several columns from a dataframe.

    The columns are removed in place from the frame that is passed in.

    @param:
        df : dataframe which contains the column(s) to delete.
        drop_col_name : name of the column to drop, or a list of column names.

    @return:

        df : the same dataframe, without the deleted column(s).

    '''
    # Accept both a single label and a list of labels; a tuple is treated as a
    # single label because it can be a valid (MultiIndex) column name.
    if pd.api.types.is_list_like(drop_col_name) and not isinstance(drop_col_name, tuple):
        columns = list(drop_col_name)
    else:
        columns = [drop_col_name]
    df.drop(columns=columns,inplace=True)
    return df

For the station columns, we first look at how many null values each column has (taking start_station and start_station_id as the example). The two columns turn out to be complementary: when start_station has a value, start_station_id does not, and vice versa. This means both columns describe the same quantity, so we merge them into a single column.

The first step is to remove the rows in which both of these columns are null.
Next we fill the remaining nulls with empty strings, because we need to concatenate the two columns to obtain one complete column without changing its meaning.

Both columns are converted to str so their types match, then concatenated to produce the complete column; the two original columns are then removed, since the new column describes both of them.

same operation for end station as well.

In [None]:
def station_cleaning(df, station_col_list,final_station_col):
    '''
    Merge a station column and its station_id column into one complete column.

    Rows in which both columns are null are removed, the remaining nulls are
    replaced by '' and the two columns are concatenated as strings. All of it
    is done in place on the frame that is passed in.

    @param :
          df : a dataframe which contains all the columns.
          station_col_list : the two columns to merge, e.g. start_station and
                             start_station_id ( same idea for the end station ).
          final_station_col : name of the merged column to create.

    @return :
          df : the same dataframe, with the merged column added and the two
               source columns removed.

    '''
    name_col, id_col = station_col_list[0], station_col_list[1]

    both_missing = df[name_col].isnull() & df[id_col].isnull()
    df.drop(index=df.index[both_missing], inplace=True)

    df[station_col_list] = df[station_col_list].fillna('')
    name_text = df[name_col].astype(str)
    id_text = df[id_col].astype(str)
    df[final_station_col] = name_text + id_text

    df.drop(columns=station_col_list, inplace=True)
    return df

In [None]:
df = station_cleaning(df,['start_station','start_station_id'],'start_station_complete')
df = station_cleaning(df,['end_station','end_station_id'],'end_station_complete')

In [None]:
# drop_index = df.loc[(df.start_station.isnull()) & (df.start_station_id.isnull())].index
# df.drop(drop_index,inplace=True)

In [None]:
# df.loc[(~df.start_station.isnull()) & (df.start_station_id.isnull())].shape
# df.loc[(df.start_station.isnull()) & (~df.start_station_id.isnull())].shape
# df[['start_station','start_station_id']] = df[['start_station','start_station_id']].fillna('')
# df['start_station_complete'] = df['start_station'].astype(str) +df['start_station_id'].astype(str)
# df.drop(columns=['start_station','start_station_id'],inplace=True)

In [None]:
# df.loc[(~df.end_station.isnull()) & (df.end_station_id.isnull())].shape
# df.loc[(df.end_station.isnull()) & (~df.end_station_id.isnull())].shape
# df[['end_station','end_station_id']] = df[['end_station','end_station_id']].fillna('')
# df['end_station_complete'] = df['end_station'].astype(str) +df['end_station_id'].astype(str)

In [None]:
# df.drop(columns=['end_station','end_station_id'],inplace=True)

Punctuation cleaning is required because the dataset metadata says this is an integer column, and approximately 99% of the values are indeed float/int. In addition, a few rows contain values such as 'delete me'; since other text could appear in the future, to be on the safe side we remove every row whose value contains alphabetic characters.

In [None]:
def punctuation_cleaning(df,punc_col):
    '''
    Remove the rows whose value in an integer/float column contains alphabetic characters.

    The rows are removed in place from the frame that is passed in. Null
    values are kept. A column that holds no text at all ( e.g. a float
    column ) is left untouched.

    @param :
          df : a dataframe which contains all the columns.
          punc_col : name of the column which may contain alphabetic characters.

    @return :

          df : the same dataframe, without the rows that contained letters.

    '''
    try:
        has_letters = df[punc_col].str.contains('[A-Za-z]', na=False)
    except AttributeError:
        # The .str accessor only exists on text-like columns; a numeric column
        # cannot contain letters, so there is nothing to remove.
        return df
    df.drop(index=df.index[has_letters],inplace=True)
    return df

In [None]:
def remove_null(df,col_name):
    '''
    Keep only the rows that have a value in a specific column.

    @param :
            df - a dataframe which contains all columns.
            col_name - name of the column which contains null values.

    @return :
           df - a new dataframe without the rows where col_name is null;
                the frame passed in is not modified.

    '''
    has_value = ~df[col_name].isnull()
    return df.loc[has_value]

In [None]:
# df[df['bike_id'].str.contains('[A-Za-z]',na=False)]
# df = df.loc[df['bike_id'] !='delete me']
# char_index = df[df['bike_id'].str.contains('[A-Za-z]', na=False)].index
# df.drop(char_index,inplace=True)
# df = df.loc[~df.bike_id.isnull()]

In [None]:
df = punctuation_cleaning(df,'bike_id')

In [None]:
df = remove_null(df,'bike_id')

In [None]:
def lat_lon_cleaning(df,lat_lon_col):
    '''
    Remove the rows in which either of the two given coordinate columns is null.

    @param :
          df : a dataframe which contains all the columns.
          lat_lon_col : list with the names of the two columns to check.

    @return :
          df : a new dataframe holding only the rows where both columns have a
               value; the frame passed in is not modified.

    '''
    # A single dropna covers the "both null" and the "one of them null" cases,
    # and avoids editing the caller's frame in place while returning a
    # different object.
    return df.dropna(subset=[lat_lon_col[0], lat_lon_col[1]])

In [None]:
df = lat_lon_cleaning(df,['end_lat','start_lat'])

In [None]:
def remove_lat_lon_outlier(df,lat_lon_list):
    '''
    Remove the rows whose coordinate pair is exactly (0, 0), then remove the
    rows whose first ( latitude ) column contains alphabetic characters.

    @param :
          df : a dataframe which contains all the columns.
          lat_lon_list : list with the two column names, latitude first.

    @return :
          df : the filtered dataframe.

    '''
    lat_col, lon_col = lat_lon_list[0], lat_lon_list[1]
    lat_is_set = df[lat_col] != 0
    lon_is_set = df[lon_col] != 0
    kept_rows = df.loc[lat_is_set | lon_is_set]
    return punctuation_cleaning(kept_rows, lat_col)

In [None]:
df = remove_lat_lon_outlier(df,['end_lat','end_lon'])
df = remove_lat_lon_outlier(df,['start_lat','start_lon'])

In [None]:
def convert_data_type(df, col_list,data_type):
    '''
    Cast the given columns to another data type, in place.

    @param :
          df : a dataframe which contains all the columns.
          col_list : list of the columns to cast.
          data_type : target data type ( e.g. float ).

    @return :
          df : the same dataframe, with the columns cast to data_type.

    '''
    converted = df[col_list].astype(data_type)
    df[col_list] = converted
    return df

In [None]:
df = convert_data_type(df,['start_lat','start_lon','end_lat','end_lon'],float)

In [None]:
def change_lang_lat_value(df):
    '''
    Replace every non-positive start_lat / end_lat value by its absolute value, in place.

    @param :
          df : a dataframe with numeric start_lat and end_lat columns.

    @return :
          df : the same dataframe, with the two latitude columns updated.

    '''
    for lat_col in ('start_lat', 'end_lat'):
        non_positive = df[lat_col] <= 0
        df.loc[non_positive, lat_col] = df[lat_col].abs()
    return df

In [None]:
df = change_lang_lat_value(df)

In [None]:
def degree_to_radion(degree):
    '''Convert an angle ( scalar, numpy array or pandas Series ) from degrees to radians.'''
    return degree*(np.pi/180)

def calculate_distance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):
    '''
    Haversine ( great-circle ) distance between a pickup and a dropoff point.

    Works element wise on scalars, numpy arrays and pandas Series.

    @param :
          pickup_latitude, pickup_longitude : start point, in degrees.
          dropoff_latitude, dropoff_longitude : end point, in degrees.

    @return :
          distance along the sphere, in the unit of `radius` ( kilometres ).

    '''
    from_lat = degree_to_radion(pickup_latitude)
    from_long = degree_to_radion(pickup_longitude)
    to_lat = degree_to_radion(dropoff_latitude)
    to_long = degree_to_radion(dropoff_longitude)

    radius = 6371.01

    lat_diff = to_lat - from_lat
    long_diff = to_long - from_long

    # from_lat / to_lat are already in radians; converting them a second time
    # inside the cosines ( as was done before ) distorts every distance that
    # has a longitude component.
    a = np.sin(lat_diff / 2)**2 + np.cos(from_lat) * np.cos(to_lat) * np.sin(long_diff / 2)**2
    # Guard against rounding pushing `a` marginally outside [0, 1].
    a = np.clip(a, 0, 1)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

In [None]:
df['distance'] = calculate_distance(df.start_lat, df.start_lon, df.end_lat, df.end_lon)

In [None]:
def add_new_date_time_features(dataset):
    '''
    Add hour, day, month, year and day_of_week columns derived from start_time, in place.

    @param :
          dataset : a dataframe whose start_time column has a datetime type.

    @return :
          dataset : the same dataframe, with the five new columns appended.

    '''
    start = dataset['start_time'].dt
    # ( new column name, attribute of the datetime accessor )
    features = (('hour', 'hour'),
                ('day', 'day'),
                ('month', 'month'),
                ('year', 'year'),
                ('day_of_week', 'dayofweek'))
    for column, attribute in features:
        dataset[column] = getattr(start, attribute)

    return dataset

df = add_new_date_time_features(df)

In [None]:
df = convert_data_type(df,['start_station_complete','end_station_complete','bike_id'],float)

In [None]:
# Raw columns that are no longer needed once the features have been derived.
drop_list = ['passholder_type','start_time','end_time','end_lat','end_lon','start_lat','start_lon']
# drop_column() wraps its argument in a list, i.e. it expects a single column
# name, so the whole list is dropped directly with DataFrame.drop instead.
df = df.drop(columns=drop_list)

In [None]:
# can use drop column function..
# df.drop(columns=['passholder_type'],inplace=True)
# df.drop(columns=['start_time','end_time'],inplace=True)
# df.drop(columns=['end_lat','end_lon','start_lat','start_lon'],inplace=True)

In [None]:
# df.start_station_complete = df.start_station_complete.astype(float)
# df.end_station_complete = df.end_station_complete.astype(float)
# df.bike_id = df.bike_id.astype(float)

In [None]:
# lat_null = df.loc[(df.end_lat.isnull()) & (df.start_lat.isnull())].index
# df.drop(lat_null,inplace=True)
# df.loc[(~df.end_lat.isnull()) & (df.start_lat.isnull())]
######
# df.loc[df['start_station_complete']=='3000.0']
# df = df.loc[~df['start_lat'].isnull()]
# df.loc[df['end_lat'].isnull()]['end_station_complete'].value_counts()
# df.loc[df['end_station_complete'] == '90018.0']
# df = df.loc[~df['end_lat'].isnull()]

In [None]:
# df = df.loc[(df['end_lat']!=0) | (df['end_lon']!=0)]
# df = df.loc[(df['start_lat']!=0) | (df['start_lon']!=0)]
# lat_special_char = df[df['start_lat'].str.contains('[A-Za-z]', na=False)].index
# df.drop(lat_special_char,inplace=True)

In [None]:
# df['start_lat'] = df['start_lat'].astype(float)
# df['start_lon'] = df['start_lon'].astype(float)
# df['end_lat'] = df['end_lat'].astype(float)
# df['end_lon'] = df['end_lon'].astype(float)

In [None]:
# df.loc[df.start_lat <=0,'start_lat'] = abs(df.start_lat)
# df.loc[df.end_lat <=0,'end_lat'] = abs(df.end_lat)

In [None]:
# lon_special_character = df[df['end_lat'].str.contains('[A-Za-z]', na=False)].index
# df.drop(lon_special_character,inplace=True)

In [None]:
# df.drop(columns=['start_time','end_time'],inplace=True)
# df.drop(columns=['end_lat','end_lon','start_lat','start_lon'],inplace=True)

In [None]:
# df.start_station_complete = df.start_station_complete.astype(float)
# df.end_station_complete = df.end_station_complete.astype(float)
# df.bike_id = df.bike_id.astype(float)

In [None]:
label = LabelEncoder()
df['trip_route'] = label.fit_transform(df['trip_route_category'])
df.drop(columns='trip_route_category',inplace=True)

In [None]:
df['duration'] = df['duration']/60

In [None]:
X = df.drop(columns=['duration'])
y = df['duration']

In [None]:
from sklearn.preprocessing import Normalizer
X = Normalizer().fit_transform(X)

In [None]:
# Skewness of the target before the transformation. The value is not stored,
# and as this is not the last expression of the cell it is not displayed either.
y.skew()
# log1p compresses the target; predictions have to be mapped back with np.expm1.
y = np.log1p(y)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version, or switch to histplot / displot.
sns.distplot(y, color='blue')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error

In [None]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X,y)

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(X,y)

In [None]:

# RMSE on the original ( expm1 ) scale of the target. The predictions are made
# on the same X the tree was fitted on, so this is the training error and not
# a hold-out estimate.
print(np.sqrt(mean_squared_error(np.expm1(y),np.expm1(dt.predict(X)))))
#lin_reg.predict(X)