In [41]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as pyplot
import seaborn as sns

In [42]:
def prepare_data(path, min_duration=1, max_duration=60):
    """Load a yellow-taxi trip parquet file and return trips ready for modelling.

    Adds a ``duration`` column (drop-off time minus pick-up time, in minutes),
    keeps only trips whose duration lies in ``[min_duration, max_duration]``
    and casts the pickup / drop-off location IDs to ``str``.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.
    min_duration, max_duration : float, default 1 and 60
        Inclusive bounds, in minutes, on the trip durations that are kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, all original columns plus
        ``duration`` as the last column.
    """
    categorical = ['PULocationID', 'DOLocationID']

    data = pd.read_parquet(path)

    # Vectorised timedelta -> minutes conversion; avoids calling a Python
    # lambda once per row.
    minutes = (
        data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']
    ).dt.total_seconds() / 60
    data = data.assign(duration=minutes)

    # between() is inclusive on both ends, matching the original >= / <= test.
    in_range = data['duration'].between(min_duration, max_duration)

    # astype() returns a fresh frame, so the categorical cast never writes
    # into a slice of the unfiltered data (no SettingWithCopyWarning).
    data = data.loc[in_range].astype({column: str for column in categorical})

    return data

In [43]:
# January 2023 trips are used for training, February 2023 trips for validation.
train_path = './dataset/yellow_tripdata_2023-01.parquet'
val_path = './dataset/yellow_tripdata_2023-02.parquet'

df_train = prepare_data(train_path)
df_val = prepare_data(val_path)

In [44]:
# Number of rows in the prepared training and validation frames.
df_train.shape[0], df_val.shape[0]

(3009173, 2855951)

### Part 1

In [87]:
# Read the January file again without any filtering so the raw schema
# (column names and dtypes) can be inspected.
jan_path = './dataset/yellow_tripdata_2023-01.parquet'
data_jan = pd.read_parquet(jan_path)
data_jan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

### Part 2

In [46]:
# Preview the first rows of the prepared January trips, including the
# derived `duration` column.
df_train.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,8.433333
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,6.316667
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,12.75
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,9.616667
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,10.833333


In [47]:
# Standard deviation of trip duration (minutes) over the prepared January
# trips, i.e. after prepare_data's duration filter has been applied.
df_train.duration.std()

9.939385620151036

### Part 3



In [49]:
# Raw (unfiltered) trip duration, stored as a timedelta column on the January frame.
data_jan['duration'] = data_jan['tpep_dropoff_datetime'] - data_jan['tpep_pickup_datetime']

# Number of non-null durations before any filtering. Series.count() returns
# the same figure as describe()['count'] without also computing the mean,
# std and quantiles over every row.
orig_count = data_jan['duration'].count()

In [50]:
# Number of January trips left after prepare_data's duration filter.
# Series.count() gives the same value as describe()['count'] (as an integer
# rather than a float) without computing the other summary statistics.
filtered_count = df_train['duration'].count()

In [51]:
# Share of the raw January trips that remain after the duration filter.
fraction_left=filtered_count/orig_count

In [52]:
# Show the retained fraction (a value between 0 and 1, not a percentage).
print(fraction_left)

0.9812202822125979


### Part 4

In [73]:
# Only the pickup and drop-off location IDs are used as model inputs.
categorical = ['PULocationID', 'DOLocationID']

# DictVectorizer takes one {column: value} mapping per row.
train_data_list = df_train.loc[:, categorical].to_dict(orient='records')

# Learn the feature vocabulary from the training rows and encode them in the
# same pass; the result is a sparse matrix.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_data_list)

In [74]:
# Container type and (rows, features) shape of the encoded training matrix,
# printed on separate lines.
print(type(X_train), X_train.shape, sep='\n')

<class 'scipy.sparse._csr.csr_matrix'>
(3009173, 515)


In [75]:
# Inspect the feature names learned by the DictVectorizer; each entry has the
# form '<column>=<value>', e.g. 'DOLocationID=1'.
vectorizer.feature_names_

['DOLocationID=1',
 'DOLocationID=10',
 'DOLocationID=100',
 'DOLocationID=101',
 'DOLocationID=102',
 'DOLocationID=106',
 'DOLocationID=107',
 'DOLocationID=108',
 'DOLocationID=109',
 'DOLocationID=11',
 'DOLocationID=111',
 'DOLocationID=112',
 'DOLocationID=113',
 'DOLocationID=114',
 'DOLocationID=115',
 'DOLocationID=116',
 'DOLocationID=117',
 'DOLocationID=118',
 'DOLocationID=119',
 'DOLocationID=12',
 'DOLocationID=120',
 'DOLocationID=121',
 'DOLocationID=122',
 'DOLocationID=123',
 'DOLocationID=124',
 'DOLocationID=125',
 'DOLocationID=126',
 'DOLocationID=127',
 'DOLocationID=128',
 'DOLocationID=129',
 'DOLocationID=13',
 'DOLocationID=130',
 'DOLocationID=131',
 'DOLocationID=132',
 'DOLocationID=133',
 'DOLocationID=134',
 'DOLocationID=135',
 'DOLocationID=136',
 'DOLocationID=137',
 'DOLocationID=138',
 'DOLocationID=139',
 'DOLocationID=14',
 'DOLocationID=140',
 'DOLocationID=141',
 'DOLocationID=142',
 'DOLocationID=143',
 'DOLocationID=144',
 'DOLocationID=145',

In [76]:
# Regression target: trip duration in minutes as a NumPy array, one entry
# per training row.
y_train = df_train['duration'].to_numpy()
print(y_train.shape)

(3009173,)


### Part 5

In [77]:
# Ordinary least-squares regression with default settings, fitted on the
# encoded location features against trip duration.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [78]:
# Predict on the same rows the model was fitted on, so the error computed
# from these predictions is a training error.
y_pred=model.predict(X_train)

In [79]:
# Root-mean-squared error on the training data, in minutes.
# The square root is taken explicitly instead of passing squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
# it raises a TypeError. This form gives the same value on every version.
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print(rmse)

7.6492618170231745


### Part 6

In [80]:
# Encode the validation rows with the vectorizer already fitted on the
# training data (transform only, no refit) so both matrices share the same
# feature columns.
val_data_list = df_val.loc[:, categorical].to_dict(orient='records')
X_val = vectorizer.transform(val_data_list)

In [83]:
# (rows, features) of the encoded validation matrix; the feature count should
# equal that of X_train because the same fitted vectorizer produced both.
print(X_val.shape)

(2855951, 515)


In [84]:
# Validation target: trip duration in minutes as a NumPy array, one entry
# per validation row.
y_val = df_val['duration'].to_numpy()
print(y_val.shape)

(2855951,)


In [85]:
# Predict durations for the validation rows using the model fitted on the
# training data.
y_val_pred=model.predict(X_val)

In [86]:
# Root-mean-squared error on the validation data, in minutes.
# The square root is taken explicitly instead of passing squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
# it raises a TypeError. This form gives the same value on every version.
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(rmse_val)

7.811820944560843
