In [19]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

In [39]:
def process_data(filename):
    """Load a trip-data parquet file and add a ``duration`` column in minutes.

    Parameters
    ----------
    filename : str or path-like
        Passed straight to ``pandas.read_parquet``. The file must provide
        ``pickup_datetime`` and ``dropOff_datetime`` columns of datetime dtype.

    Returns
    -------
    pandas.DataFrame
        Every row of the file (no filtering is done here) plus a float
        ``duration`` column holding ``dropOff_datetime - pickup_datetime``
        expressed in minutes.
    """
    data = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion. Avoids a Python-level call
    # per row and yields a float column even when the frame is empty.
    elapsed = data["dropOff_datetime"] - data["pickup_datetime"]
    data["duration"] = elapsed.dt.total_seconds() / 60

    return data

In [72]:
# Load the 2021-01 trip file; process_data adds the `duration` column (minutes).
train_path = "../data/fhv_tripdata_2021-01.parquet"
data = process_data(train_path)
data.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667


In [73]:
# Q1: number of rows loaded, before any duration filtering
data.shape[0]

1154112

In [74]:
# Q2: mean trip duration in minutes across all loaded rows
data["duration"].mean()

19.1672240937939

In [75]:
# Remove "outliers": keep only trips lasting between 1 and 60 minutes (inclusive).
# .copy() detaches the filtered rows from the original frame; without it pandas
# treats the result as a slice and emits SettingWithCopyWarning as soon as a
# column of `data` is assigned.
data = data[(data.duration >= 1) & (data.duration <= 60)].copy()

In [76]:
# Number of rows currently held in `data`
data.shape[0]

1109826

In [77]:
# Q3: share of rows whose pickup location ID is missing (True = missing)
pickup_is_missing = data["PUlocationID"].isna()
pickup_is_missing.value_counts(normalize=True)

True     0.835273
False    0.164727
Name: PUlocationID, dtype: float64

In [78]:
# Replace missing pickup location IDs with the sentinel -1.
# .assign returns a new frame instead of writing a column into `data` in
# place, so this is safe even when `data` is a filtered slice of another
# frame (no SettingWithCopyWarning) and the cell can be re-run freely.
data = data.assign(PUlocationID=data["PUlocationID"].fillna(-1))

In [79]:
# Sanity check: fraction of rows whose pickup location ID equals -1
n_sentinel = len(data.loc[data["PUlocationID"] == -1])
n_sentinel / len(data)

0.8352732770722617

In [80]:
categorical = ["PUlocationID", "DOlocationID"]

# Cast the location IDs to str. This is only the preparation step: the actual
# one-hot encoding is done by the DictVectorizer, which needs string values to
# treat a feature as categorical rather than numeric.
# astype with a column->dtype mapping returns a new frame, so nothing is
# written into a (possibly sliced) `data` in place.
data = data.astype({column: str for column in categorical})

In [83]:
# Fit the vectorizer on the training records and build the training matrix
dv = DictVectorizer()

train_dicts = data.loc[:, categorical].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)

In [84]:
# (rows, feature columns) of the training matrix produced by the vectorizer
X_train.shape

(1109826, 525)

In [86]:
# Training: fit a plain linear regression on the vectorized features and
# report the RMSE on the training data itself.
target = "duration"
Y_train = data[target].values

lr = LinearRegression()
lr.fit(X_train, Y_train)

Y_pred = lr.predict(X_train)

# RMSE = sqrt(MSE). The `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the root explicitly gives the
# same number on every version.
mean_squared_error(Y_train, Y_pred) ** 0.5

10.528519107211688

In [89]:
# Build the validation set from the 2021-02 file: 1-60 minute duration filter,
# missing pickup IDs filled with -1, categorical columns cast to str.
val_data = process_data("../data/fhv_tripdata_2021-02.parquet")

# .copy() detaches the filtered rows so the column assignments below modify an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
val_data = val_data[(val_data.duration >= 1) & (val_data.duration <= 60)].copy()

val_data["PUlocationID"] = val_data["PUlocationID"].fillna(-1)

val_data[categorical] = val_data[categorical].astype(str)

# transform only, no fit: the validation matrix must use the feature columns
# the vectorizer already learned from the training records.
val_dicts = val_data[categorical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [90]:
# Validate: score the already-fitted model on the validation matrix.
Y_val = val_data[target].values

Y_pred = lr.predict(X_val)

# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so the root is taken explicitly.
mean_squared_error(Y_val, Y_pred) ** 0.5

11.01428319067211