In [16]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
import seaborn as sns
import pickle

### file loading

In [2]:
def file_loading(file_path:str):
    df = pd.read_parquet(file_path)
    return df

In [3]:
df = file_loading('../Data/green_tripdata_2021-01.parquet')

In [4]:
df.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge'],
      dtype='object')

### preprocessing and feature eng

In [10]:
def preprocess(df:pd.DataFrame):
    # create new var duration for y, exclude outlier
    df['duration'] = (df['lpep_dropoff_datetime']-df['lpep_pickup_datetime']).dt.total_seconds()/60
    df = df[(df['duration']>=1)&(df['duration']<=60)]
    
    # combine 2 var to a new var as x
    df['PULocationID'] = df['PULocationID'].astype(str)
    df['DOLocationID'] = df['DOLocationID'].astype(str)
    df['PU_DO'] = df['PULocationID'] + "_" + df['DOLocationID']
    
    return df
        

In [8]:
df_p = preprocess(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PULocationID'] = df['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOLocationID'] = df['DOLocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PU_DO'] = df['PULocationID'] + "_" + df['DOLocationID']


In [9]:
df_p.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,PU_DO
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0,3.933333,43_151
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,0.5,0.5,2.81,0.0,,0.3,16.86,1.0,1.0,2.75,8.75,166_239
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,0.5,0.5,1.0,0.0,,0.3,8.3,1.0,1.0,0.0,5.966667,41_42
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0,7.083333,168_75
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.5,0.5,0.5,0.96,0.0,,0.3,5.76,1.0,1.0,0.0,2.316667,75_75


### Vectorizer

In [13]:
def Vectorizer(df:pd.DataFrame, dv:DictVectorizer = None):
    
    dict= df[['PU_DO', 'trip_distance']].to_dict(orient='records')
    if dv:
        x = dv.transform(dict)

    else:
        dv = DictVectorizer()
        x = dv.fit_transform(dict)        
        
    y = df['duration'].values
    
    return dv, x, y

In [14]:
dv, x_train, y_train = Vectorizer(df_p)

### model training

In [20]:
def model_training(model, x, y):
    
    training_model = model()
    training_model.fit(x,y)
    y_pred = training_model.predict(x)
    
    rmse = root_mean_squared_error(y, y_pred)
    
    print(f'RMSE: {rmse}')
    
    return model, y_pred

In [21]:
model, y_pred = model_training(LinearRegression, x_train, y_train)

RMSE: 5.699564118198979


### model valdiation

In [22]:
def validation(model, x, y):
    y_pred = model.predict(x)
    rmse = root_mean_squared_error(y, y_pred)
    
    print(f'RMSE: {rmse}')
    
    return y_pred