# Indego Bike Duration Prediction Using Pipeline

## Table of content:

1. loading all required packages.
2. Loading data and converting into master file.
3. Creation of preprocessing class ( Indengo ) which contain all operation in functional way.
4. Creation of custom label encoder class for label encoding.
5. Creattion of pipeline, train-test split, modelling and prediction.
6. Saving and loading model into serializable object ( pickle )
7. Last part some intial code from where i have arrived into some decisions - commented for now.

**This is just rough notebook for operation which shows my steps and opearion, I have deployed model using Flask app so please refer flask app code for deployment, I have made powerBI dashboard for certain insight as well so please check that as well** 

Very First step ( optional ) , as data was not in proper nomenclature, so convert it into same nomenclature and collate in a single folder for easier access. In our case, 'data' folder contain all the files.

In [1]:
'''
Importing important packages required for operation..
'''
import matplotlib.pyplot as plt
import glob
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,Normalizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

Import all data files from the specific folder path or could be from remote machine path. Glob function is used to get all files list from directory, assuming all files is in .csv format for simplicity.

In [3]:
'''
Data load function is used for loading data at a given directory or path.

@param :
       path : path could be directory of local or remote folder.
       
@return :
      
      df - data frame concatenated of all the files together in row wise ( axis = 0 )

'''
def data_load(path):
    
    files = glob.glob("%s\*.csv"%(path))
    
    for i,file in enumerate(files):
        if i == 0 :
            df = pd.read_csv(file)
        else:
            df = pd.concat([df,pd.read_csv(file)],axis=0)
    
    return df

#load data....
df = data_load('data')

In [4]:
df.to_csv('whole_data.csv')
df_copy = df.copy()

A preprocessing class inherited transformer and estimator which basically gives flexibility to act as a transformer or estimator class when required.

class contain fit method for now skipped ( later we can use to implement logical part )
class contain transformer method which internally contained various method for preprocessing part ( will explain each method function and working at down )

lastly this class return a cleaned dataframe which is used for modelling part.

In [22]:
class Indego(BaseEstimator, TransformerMixin):
    
    def fit( self, df, y = None ):
        print("in indego fit")
        return self
    
    def transform(self, df, y = None):
    
        def time_convert(df,time_col):
            df[time_col] = pd.to_datetime(arg=df[time_col], infer_datetime_format=True)
            return df
    
        def drop_column(df, drop_col_name):
            df.drop(columns=[drop_col_name],axis=1,inplace=True)
            return df
        
        def punctuation_cleaning(df,punc_col):
            char_index = df[df[punc_col].str.contains('[A-Za-z]', na=False)].index
            df.drop(char_index,inplace=True)
            return df
        
        def remove_null(df,col_name):
            df = df.loc[~df[col_name].isnull()]
            return df
        
        def convert_data_type(df, col_list,data_type):
            df[col_list] = df[col_list].astype(data_type)
            return df
    
        def station_cleaning(df, station_col_list,final_station_col):
            drop_index = df.loc[(df[station_col_list[0]].isnull()) & (df[station_col_list[1]].isnull())].index
            df.drop(drop_index,inplace=True)
            df[station_col_list] = df[station_col_list].fillna('')
            df[final_station_col] = df[station_col_list[0]].astype(str) +df[station_col_list[1]].astype(str)
            df.drop(columns=station_col_list,inplace=True)
            return df
    
        def lat_lon_cleaning(df,lat_lon_col):
            lat_lon_null = df.loc[(df[lat_lon_col[0]].isnull()) & (df[lat_lon_col[1]].isnull())].index
            df.drop(lat_lon_null,inplace=True)
            df =  remove_null(df,lat_lon_col[1])
            df = remove_null(df,lat_lon_col[0])
            return df
    
        def remove_lat_lon_outlier(df,lat_lon_list):
            df = df.loc[(df[lat_lon_list[0]]!=0) | (df[lat_lon_list[1]]!=0)]
            df = punctuation_cleaning(df,lat_lon_list[0])
            return df
    
        def change_lang_lat_value(df):
            df.loc[df.start_lat <=0,'start_lat'] = abs(df.start_lat)
            df.loc[df.end_lat <=0,'end_lat'] = abs(df.end_lat)
            return df
    
        def degree_to_radion(degree):
            return degree*(np.pi/180)

        def calculate_distance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):

            from_lat = degree_to_radion(pickup_latitude)
            from_long = degree_to_radion(pickup_longitude)
            to_lat = degree_to_radion(dropoff_latitude)
            to_long = degree_to_radion(dropoff_longitude)

            radius = 6371.01

            lat_diff = to_lat - from_lat
            long_diff = to_long - from_long

            a = np.sin(lat_diff / 2)**2 + np.cos(degree_to_radion(from_lat)) * np.cos(degree_to_radion(to_lat)) * np.sin(long_diff / 2)**2
            c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

            return radius * c
    
        def add_new_date_time_features(dataset):
            dataset['hour'] = dataset.start_time.dt.hour
            dataset['day'] = dataset.start_time.dt.day
            dataset['month'] = dataset.start_time.dt.month
            dataset['year'] = dataset.start_time.dt.year
            dataset['day_of_week'] = dataset.start_time.dt.dayofweek
            dataset['quarter'] = dataset.start_time.dt.quarter

            return dataset
        
        def add_temp_variable(df):
            # Aug -nov : summer
            # may - june -july : rainy 
            # dec - april - winter 
            df['temprature'] = 0
            df.loc[(df['month']>=1) & (df['month']<=4),'temprature'] = 1
            df.loc[(df['month']>=5) & (df['month']<=8),'temprature'] = 2
            df.loc[(df['month']>=9) & (df['month']<=12),'temprature'] = 3
            return df
       
        print("in Indengo transformer method")
        df = time_convert(df,'start_time')
        df.set_index('trip_id',inplace=True)
        df = station_cleaning(df,['start_station','start_station_id'],'start_station_complete')
        df = station_cleaning(df,['end_station','end_station_id'],'end_station_complete')
        df = punctuation_cleaning(df,'bike_id')
        df = remove_null(df,'bike_id')
        df = lat_lon_cleaning(df,['end_lat','start_lat'])
        df = remove_lat_lon_outlier(df,['end_lat','end_lon'])
        df = remove_lat_lon_outlier(df,['start_lat','start_lon'])
        df = convert_data_type(df,['start_lat','start_lon','end_lat','end_lon'],float)
        df = change_lang_lat_value(df)
        df['distance'] = calculate_distance(df.start_lat, df.start_lon, df.end_lat, df.end_lon)
        df = add_new_date_time_features(df)
        df = add_temp_variable(df)
        df = convert_data_type(df,['start_station_complete','end_station_complete','bike_id'],float)
        drop_list = ['bike_type','passholder_type','start_time','end_time','end_lat','end_lon','start_lat','start_lon']
        df.drop(columns = drop_list,axis=1,inplace=True)        
        return df


Custom label encoder is required as label encoder in sklearn can act in 1 column at a time and when crossing in pipeline we dont have flexibility to specify particular column which encoding is required.

Custom label encoder also conatin estimator and transformer which will required during pipeline operation as pipeline will only accept estimator and transformer.

In [6]:
class custom_label_encoder(BaseEstimator,TransformerMixin):
    def fit( self, df, y = None ):
        print("in label encoder fit.....")
        return self
    
    def transform(self, df, y = None):
        print("in label encoder transformer")
        label = LabelEncoder()
        df['trip_route'] = label.fit_transform(df['trip_route_category'])
        df.drop(columns='trip_route_category',inplace=True)
        return df

Once we are done with function we need to create a model, first this should go through a pipeline. A pipeline is basically a sequence of steps arranged automatically.

In our case for demo purpose i m breaking into 2 pipeline as we don't have external **test dataset** but we could have single pipeline as well.

Here first divided into data preprocessing pipeline which is basically my main class ( **Indengo ) and custom label encode**.
because these pipeline will preprocess data and gives a clean modelling data.

Later on we have separated X and y variable, used train/test split (80-20%) for now, then as we have to predict in **minutes** and duration has given in seconds need to convert into minutes by dividing 60.

We have notices that our **Target column** is skewed which need to be transformed so using log transformation here.

once done we use our second pipeline, which is modelling pipeline and convert data into normalized first ( to scale it in 1 scale) so it should not crate any dominance.

Then try with Model ( I had tried with (linear regression) baseline model) but i couldn't achieve and from there it clear this is not linear data. so directly trying with tree based model.

Note : here we could check different model and compare best model with their metrics, for keeping it simpler using just decision tree. one more thing i have not performed hyperparameter tunning for now but we can do it.

In [7]:
def model_build():
    preprocess_pipeline = Pipeline(steps=[('preprocess',Indego()),('label',custom_label_encoder())])
    dataset = preprocess_pipeline.fit_transform(df) 
                                   
    X = dataset.drop(columns=['duration'])
    y = dataset['duration']
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=42)
    
    y_train = y_train /60
    y_test =  y_test / 60
    
    y_train = np.log1p(y_train)
    y_test = np.log1p(y_test)
    model = Pipeline(steps=[('Normalizer',Normalizer()),
                               ('decision tree', DecisionTreeRegressor())
                            ])
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    #print(np.expm1(model.predict(X_test)))
    print(np.sqrt(mean_squared_error(np.expm1(y_test),np.expm1(model.predict(X_test)))))
    return model

Once modelling is done next step to store serialize model ( pickle or any serializable format) . Here using pickling to store model and then saving as finalized_model.pkl.

This is done because when we deploy and hit rest api ex : www.web.com/predict it will load model and perform predicton and return response.

**can check flask app code where i have performed it**

In [8]:
import pickle

model = model_build()
filename = 'finalized_model.pkl'
pickle.dump(model, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))

in indego fit
in transformer method
in label encoder fit.....
in label encoder transformer
58.631065229797066


## Function Wise explanation

Adding temprature variable in dataset because temprature is also a important parameter to ride or not. when checked in internet basically summer season is on ( aug-nov) and (may-june-july) is rainy season, other as winters. 
for time being divided in 4 months and assigned as 1,2,3 which is respectively winter, rainy and summer.

In [12]:
'''
Add temprature column in dataframe.

@param :
     df - a dataframe contained month column.

@return :
     df - a dataframe with temprature column along with other column.
'''

def add_temp_variable(df):
    # Aug -nov : summer
    # may - june -july : rainy 
    # dec - april - winter 
    df['temprature'] = 0
    df.loc[(df['month']>=1) & (df['month']<=4),'temprature'] = 1
    df.loc[(df['month']>=5) & (df['month']<=8),'temprature'] = 2
    df.loc[(df['month']>=9) & (df['month']<=12),'temprature'] = 3
    return df

Punctation cleaning is required because dataset metadata shows its integer column and as well approx 99% value are float/int only. secondly few rows have value like 'delete me' , it could be anything in future so for safer side remove all alpha value.

In [13]:
'''
Punctuation cleaning is used to clean any character present in integer or float column for now removing alphabets.

@param :
      df : a dataframe contain all the columns.
      punc_col : a column contain special or alphabet character.
      
@return :
      
      df : a dataframe with removed puctutation from specific column.
      
'''
def punctuation_cleaning(df,punc_col):
    char_index = df[df[punc_col].str.contains('[A-Za-z]', na=False)].index
    df.drop(char_index,inplace=True)
    return df

Convrsion of datetime column ( start_time ) to datetime datatype because it has string type, converting in datetime because we need to use property of date and time like day, month, year, quarter etc. 

This function is used only for **start_time** and not touching **end_time** because we won't have end_time at test data, i mean start_time is look ahead variable and end_time is your outcome variable

In [14]:
'''
Convert string type column into datetime format so we can use further its attribute.

@param:
    time_col : name of column which contain datetime value.

@return:
    df : dataframe which contain date time column.

'''
def time_convert(df,time_col):
    df[time_col] = pd.to_datetime(arg=df[time_col], infer_datetime_format=True)
    return df

Drop a column from data frame which is not required or correlated with some other column. we need to create a simpler function at the end so we need to choose as minimal and effective column at the end.

In [15]:
'''
Drop a column from dataframe and return new dataframe without deleted column.

@param:
    df : dataframe contain all column including which need to be deleted.
    drop_col_name : name of the column which need to be dropped.
    
@return:

    df : a dataframe all column except deleted column.
    
'''
def drop_column(df, drop_col_name):
    df.drop(columns=[drop_col_name],inplace=True)
    return df

As we station column, first we see how much null values in each columns ( taking start_station & start_station_id ). I figured out both have contradictory in term of data. i.e. if station has value than station_id doesn't have and vice versa. that means this column correspond to same value so try to reduce in 1 column.

first thing to remove those rows in which both of these column contain null values.
later we fill blanks because we need to concatenate togther this column to get a complete new column without change of meaning.

converted into str type to match type and added together and finally achieved a complete column, remove these two column as our new column is decribing two columns.

same operation for end station as well.

In [16]:
'''
Station cleaning is used to clean station and station_id.

@param :
      df : a dataframe contain all the colunmns
      station_col_list: It contain start_station and start_station_id similarly for end station as well.
      
@return :
      df : return a dataframe

'''
def station_cleaning(df, station_col_list,final_station_col):
    drop_index = df.loc[(df[station_col_list[0]].isnull()) & (df[station_col_list[1]].isnull())].index
    df.drop(drop_index,inplace=True)
    df[station_col_list] = df[station_col_list].fillna('')
    df[final_station_col] = df[station_col_list[0]].astype(str) +df[station_col_list[1]].astype(str)
    df.drop(columns=station_col_list,inplace=True)
    return df

We need to remove null which is beyond imputation, as i haven't been gone to business level of this usecase initial judgement of imputing would be wrong, however someplace dropping null values is good because they will unwantedely creates anamoly.

In [17]:
'''
Remove null values from a specific column.

@param :
        df - a dataframe which contain all columns
        col_name - name of column which contain null values.
        
@return :
       df - a dataframe with removed null values from specific column.

'''

def remove_null(df,col_name):
    df = df.loc[~df[col_name].isnull()]
    return df

Latitude and longitude cleaning is similar to station cleaning, first we need to check if both start and end have 0 then we have to remove those rows.

And as we are calculating distance at later stage we have to remove any null value in both start and end latitude as it will correspond to 0 distance which will be anamoly.

I had tried to impute these but when you see for those null value station id = 3000 which is a virtual station, so whereever virtualstation present there is no lat and longitude which is ok also. for now remove this rows but we can have impute based on some business logic

In [23]:
'''
cleaning latitude and longitude which is enable for both ( start and end ). 

@param :
        df - a dataframe which contain all columns
        lat_lon_col - list of latitude and longitude column ex [start_lat, start_lon].
        
@return :
       df - a dataframe with clean values from lat, long column.

'''
def lat_lon_cleaning(df,lat_lon_col):
    lat_lon_null = df.loc[(df[lat_lon_col[0]].isnull()) & (df[lat_lon_col[1]].isnull())].index
    df.drop(lat_lon_null,inplace=True)
    df = remove_null(df,lat_lon_col[1])
    df = remove_null(df,lat_lon_col[0])
    return df

In latitude and longitude column , noticed some 0 value rows which is anamoly because our area of region lies in ( 39 , -75). secondly some of string value also mentioned in the some rows "//n" which need to be removed. This operation used for both start and end field.

In [24]:
'''
cleaning latitude and longitude which is enable for both ( start and end ) which have 0 values as well as some 
non lat,long values like here it is "\\n" which should be removed. 

@param :
        df - a dataframe which contain all columns
        lat_lon_col - list of latitude and longitude column ex [start_lat, start_lon].
        
@return :
       df - a dataframe with clean values from lat, long column.
'''

def remove_lat_lon_outlier(df,lat_lon_list):
    df = df.loc[(df[lat_lon_list[0]]!=0) | (df[lat_lon_list[1]]!=0)]
    df = punctuation_cleaning(df,lat_lon_list[0])
    return df

Convert data type from one data type to other data type, this is required at end because for modelling we required either float or integer value, but if we see some of our column , we found out string column which need to be converted

In [26]:
'''
convert data type of column. 

@param :
        df - a dataframe which contain all columns
        col_list - list of column to be converted.
        data_type - data type in which column need to be converted.
        
@return :
       df - a dataframe with changed data type for specified field.
'''

def convert_data_type(df, col_list,data_type):
    df[col_list] = df[col_list].astype(data_type)
    return df

In latitude column some of values by mistakely given as -ve which is actually a +ve so convert that neg into positive.

Example we have a latitude approx 39.2 - 39.7 etc but some places bymistakenly they have done -39.2 that should be 39.2 which need to be corrected

In [27]:
'''
anamoly cleaning latitude which is enable for both ( start and end ). 

@param :
        df - a dataframe which contain all columns
        
@return :
       df - a dataframe with clean values from lat column.

'''

def change_lang_lat_value(df):
    df.loc[df.start_lat <=0,'start_lat'] = abs(df.start_lat)
    df.loc[df.end_lat <=0,'end_lat'] = abs(df.end_lat)
    return df

generating distance attribute based on start and end (latitude,longitude). here basically converting longitude and latitude in radian and then using manhatten metrics to convert into distance.

converting into distance will give a huge bump up because distance is entirely related to duration to reach.

In [28]:
'''
Generating distance using manhatten distance formula using start and end latitude and longitude point. 

@param :
        df - a dataframe which contain all columns
        pickup_latitude - start latitude
        pickup_longitude - start longitude
        dropoff_latitude - end latitude
        dropoff_longitude - end longitude
        
@return :
       distance -  calculated distance.
'''

def degree_to_radion(degree):
    return degree*(np.pi/180)

def calculate_distance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):
    
    from_lat = degree_to_radion(pickup_latitude)
    from_long = degree_to_radion(pickup_longitude)
    to_lat = degree_to_radion(dropoff_latitude)
    to_long = degree_to_radion(dropoff_longitude)
    
    radius = 6371.01
    
    lat_diff = to_lat - from_lat
    long_diff = to_long - from_long

    a = np.sin(lat_diff / 2)**2 + np.cos(degree_to_radion(from_lat)) * np.cos(degree_to_radion(to_lat)) * np.sin(long_diff / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    
    return radius * c

Converting datetime column into different attribute of it like day,month,year,dayofweek,quarter etc. each variable has its significant like hour basis we can check whether what time of day, ride frequency increases/decreases.
converting into quarters which lead you to convert into temprature as quarterly we are specifing temprature. 
dayofweek and day wise we can check whether holiday or not, how much ride has been take off in weekend etc. info we can achieve from this attributes so improtant to unflod this.

In [29]:
'''
Generating datetime attribute using date time column. 

@param :
        df - a dataframe which contain all columns
        
@return :
       df -  dataframe with hour,day,month,year etc.
'''

def add_new_date_time_features(dataset):
    dataset['hour'] = dataset.start_time.dt.hour
    dataset['day'] = dataset.start_time.dt.day
    dataset['month'] = dataset.start_time.dt.month
    dataset['year'] = dataset.start_time.dt.year
    dataset['day_of_week'] = dataset.start_time.dt.dayofweek
    dataset['quarter'] = dataset.start_time.dt.quarter
    return dataset

## Some Intial work and Rough work - commented

In [None]:
# pre_pipeline = Pipeline(steps=[('preprocess',Indego())])
# temp = pre_pipeline.fit_transform(df_copy)

In [None]:
# temp = add_temp_variable(temp)

In [None]:
# temp.month.value_counts()

In [None]:
# station = temp.start_station_complete.value_counts()

In [None]:
# temp.day_of_week.value_counts()

In [None]:
# temp.groupby('hour').count()['distance']

In [None]:
#sns.barplot(df['hour'],df['duration'])

In [11]:
#sns.barplot(df['hour'],df['distance'])

In [None]:
# BB = (-75.5, -76.8, 39.5, 40.8)
# nyc_map = plt.imread('phiadelphia.PNG')

In [None]:
# this function will be used more often to plot data on the NYC map
# df.groupby('start_lat').count()

# def plot_on_map(df, BB, nyc_map, s=10, alpha=0.2):
#     fig, axs = plt.subplots(1, 2, figsize=(16,10))
#     axs[0].scatter(df.start_lon, df.start_lat, zorder=1, alpha=alpha, c='r', s=s)
#     axs[0].set_xlim((BB[0], BB[1]))
#     axs[0].set_ylim((BB[2], BB[3]))
#     axs[0].set_title('Pickup locations')
#     axs[0].imshow(nyc_map, zorder=0, extent=BB)

# plot_on_map(df, BB, nyc_map, s=1, alpha=0.3)

In [None]:
# df = punctuation_cleaning(df,'start_lat')
# df = punctuation_cleaning(df,'end_lat')

In [None]:
# set trip id as index which is unique as well as complete.
# df.set_index('trip_id',inplace=True)

In [None]:
#sorting based on start time so we can check time wise trend and changes..
# df.sort_values('start_time',inplace=True)

In [None]:
# drop bike type as almost more than 50% record are null and other 50% have biased towards specific value.. 
# df.drop(columns=['bike_type'],inplace=True)

In [None]:
# df = station_cleaning(df,['start_station','start_station_id'],'start_station_complete')
# df = station_cleaning(df,['end_station','end_station_id'],'end_station_complete')

In [None]:
# drop_index = df.loc[(df.start_station.isnull()) & (df.start_station_id.isnull())].index
# df.drop(drop_index,inplace=True)

In [None]:
# df.loc[(~df.start_station.isnull()) & (df.start_station_id.isnull())].shape
# df.loc[(df.start_station.isnull()) & (~df.start_station_id.isnull())].shape
# df[['start_station','start_station_id']] = df[['start_station','start_station_id']].fillna('')
# df['start_station_complete'] = df['start_station'].astype(str) +df['start_station_id'].astype(str)
# df.drop(columns=['start_station','start_station_id'],inplace=True)

In [None]:
# df.loc[(~df.end_station.isnull()) & (df.end_station_id.isnull())].shape
# df.loc[(df.end_station.isnull()) & (~df.end_station_id.isnull())].shape
# df[['end_station','end_station_id']] = df[['end_station','end_station_id']].fillna('')
# df['end_station_complete'] = df['end_station'].astype(str) +df['end_station_id'].astype(str)

In [None]:
# df.drop(columns=['end_station','end_station_id'],inplace=True)

In [None]:
# df[df['bike_id'].str.contains('[A-Za-z]',na=False)]
# df = df.loc[df['bike_id'] !='delete me']
# char_index = df[df['bike_id'].str.contains('[A-Za-z]', na=False)].index
# df.drop(char_index,inplace=True)
# df = df.loc[~df.bike_id.isnull()]

In [None]:
# df = punctuation_cleaning(df,'bike_id')

In [None]:
# df = remove_null(df,'bike_id')

In [None]:
# df = remove_lat_lon_outlier(df,['end_lat','end_lon'])
# df = remove_lat_lon_outlier(df,['start_lat','start_lon'])

In [None]:
# df = convert_data_type(df,['start_lat','start_lon','end_lat','end_lon'],float)

In [None]:
# df = change_lang_lat_value(df)

In [None]:
# df['distance'] = calculate_distance(df.start_lat, df.start_lon, df.end_lat, df.end_lon)

In [None]:
# df = convert_data_type(df,['start_station_complete','end_station_complete','bike_id'],float)

In [None]:
# drop_list = ['passholder_type','start_time','end_time','end_lat','end_lon','start_lat','start_lon']
# df = drop_column(df,drop_list)

In [None]:
# can use drop column function..
# df.drop(columns=['passholder_type'],inplace=True)
# df.drop(columns=['start_time','end_time'],inplace=True)
# df.drop(columns=['end_lat','end_lon','start_lat','start_lon'],inplace=True)

In [None]:
# df.start_station_complete = df.start_station_complete.astype(float)
# df.end_station_complete = df.end_station_complete.astype(float)
# df.bike_id = df.bike_id.astype(float)

In [None]:
# lat_null = df.loc[(df.end_lat.isnull()) & (df.start_lat.isnull())].index
# df.drop(lat_null,inplace=True)
# df.loc[(~df.end_lat.isnull()) & (df.start_lat.isnull())]
######
# df.loc[df['start_station_complete']=='3000.0']
# df = df.loc[~df['start_lat'].isnull()]
# df.loc[df['end_lat'].isnull()]['end_station_complete'].value_counts()
# df.loc[df['end_station_complete'] == '90018.0']
# df = df.loc[~df['end_lat'].isnull()]

In [None]:
# df = df.loc[(df['end_lat']!=0) | (df['end_lon']!=0)]
# df = df.loc[(df['start_lat']!=0) | (df['start_lon']!=0)]
# lat_special_char = df[df['start_lat'].str.contains('[A-Za-z]', na=False)].index
# df.drop(lat_special_char,inplace=True)

In [None]:
# df['start_lat'] = df['start_lat'].astype(float)
# df['start_lon'] = df['start_lon'].astype(float)
# df['end_lat'] = df['end_lat'].astype(float)
# df['end_lon'] = df['end_lon'].astype(float)

In [None]:
# df.loc[df.start_lat <=0,'start_lat'] = abs(df.start_lat)
# df.loc[df.end_lat <=0,'end_lat'] = abs(df.end_lat)

In [None]:
# lon_special_character = df[df['end_lat'].str.contains('[A-Za-z]', na=False)].index
# df.drop(lon_special_character,inplace=True)

In [None]:
# df.drop(columns=['start_time','end_time'],inplace=True)
# df.drop(columns=['end_lat','end_lon','start_lat','start_lon'],inplace=True)

In [None]:
# df.start_station_complete = df.start_station_complete.astype(float)
# df.end_station_complete = df.end_station_complete.astype(float)
# df.bike_id = df.bike_id.astype(float)

In [None]:
# df['duration'] = df['duration']/60

In [None]:
# X = df.drop(columns=['duration'])
# y = df['duration']

In [None]:
# y.skew()
# y = np.log1p(y)
# sns.distplot(y, color='blue')

In [None]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score,mean_squared_error

In [None]:
# from sklearn.tree import DecisionTreeRegressor
# dt = DecisionTreeRegressor()
# dt.fit(X,y)

In [None]:
# lin_reg = LinearRegression()
# lin_reg.fit(X,y)

In [None]:
# print(np.sqrt(mean_squared_error(np.expm1(y),np.expm1(dt.predict(X)))))
#lin_reg.predict(X)