In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [2]:
# Monthly FHV trip files: the 2021-01 file is used for training, 2021-02 for validation.
train_data_path, val_data_path = (
    "../data/fhv_tripdata_2021-01.parquet",
    "../data/fhv_tripdata_2021-02.parquet",
)

In [3]:
# Read the training-month trips and preview the first five rows.
df = pd.read_parquet(path=train_data_path)
df.head(n=5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


# Q1: How many records in January

In [4]:
# Q1: How many records in January

# Number of rows in the frame loaded from the training file.
df.shape[0]

1154112

# Q2: Computing duration

In [5]:
# Q2: Computing duration

# Trip duration in minutes (drop-off minus pickup). The vectorized `.dt`
# accessor replaces a per-row Python lambda applied to every trip.
df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60

# Mean trip duration in minutes, before any filtering.
df['duration'].mean()

19.1672240937939

# Data preparation

In [6]:
# Data preparation

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
MIN_DURATION_MIN, MAX_DURATION_MIN = 1, 60

rows_before = len(df)

# .copy() yields an independent frame, so the column assignments made in the
# following cells do not operate on a slice of the unfiltered data.
df = df[df['duration'].between(MIN_DURATION_MIN, MAX_DURATION_MIN)].copy()

print(len(df))                # rows kept
print(rows_before - len(df))  # rows dropped, computed from the data rather than hard-coded

1109826
44286


# Q3: Missing values

In [12]:
# Q3: Missing values

# Replace missing pickup AND drop-off location IDs with the sentinel -1.
# The drop-off column is filled here as well so both features are encoded
# the same way as in load_stuff() further down.
# fillna returns a new frame, so this cell can be re-run safely.
df = df.fillna({'PUlocationID': -1, 'DOlocationID': -1})

# Percentage of trips whose pickup location is missing (i.e. carries the sentinel).
(df['PUlocationID'] < 0).mean() * 100

83.52732770722618

In [13]:
# Disabled draft (not executed): the same missing-value share for the drop-off location.
#
# df['DOlocationID'] = df['DOlocationID'].apply(lambda x: x if not np.isnan(x) else -1)
#
# B = len(df[df['DOlocationID'] >= 0])
# A = len(df['DOlocationID'])
#
# 100 - (B / (A/100))

# Q4: One-hot encoding

In [14]:
# Q4: One-hot encoding

# Cast both location ID columns to strings, then confirm the resulting dtypes.
categorical = ['PUlocationID', 'DOlocationID']
df = df.astype({column: str for column in categorical})
df[categorical].dtypes

PUlocationID    object
DOlocationID    object
dtype: object

In [15]:
# One {column: value} dict per trip, restricted to the categorical columns.
train_dicts = df.loc[:, categorical].to_dict(orient='records')

In [16]:
# Fit the vectorizer on the training records and encode them in the same call.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [17]:
# Dimensions of the encoded training matrix.
X_train.shape

(1109826, 525)

# Q5: Training model

In [15]:
# Q5: Training model

# Target: trip duration in minutes.
y_train = df['duration'].values

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_train)

# RMSE on the training data. Taking the square root of the MSE explicitly
# avoids the `squared=False` argument, which newer scikit-learn releases
# deprecate and then remove.
np.sqrt(mean_squared_error(y_train, y_pred))

10.528519107204405

# Q6: Evaluating the Model

In [18]:
def load_stuff(dataset_path):
    """Load one trip file and prepare it for vectorization.

    Steps:
      1. Read the parquet file at ``dataset_path``.
      2. Add a ``duration`` column: drop-off minus pickup time, in minutes.
      3. Keep only trips lasting between 1 and 60 minutes (inclusive).
      4. Replace missing pickup / drop-off location IDs with -1 and cast
         both columns to strings.

    Parameters
    ----------
    dataset_path : str or path-like
        Location of the parquet file to read.

    Returns
    -------
    df : pandas.DataFrame
        The filtered frame, including the ``duration`` column.
    dicts : list of dict
        One ``{'PUlocationID': ..., 'DOlocationID': ...}`` record per
        remaining row, with string values.
    """
    categorical = ['PUlocationID', 'DOlocationID']

    df = pd.read_parquet(dataset_path)

    # Vectorized duration in minutes (no per-row Python lambda).
    df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60

    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame.
    df = df[df['duration'].between(1, 60)].copy()

    # Missing location IDs become the sentinel -1 before the string cast.
    df[categorical] = df[categorical].fillna(-1).astype(str)

    dicts = df[categorical].to_dict(orient='records')

    return df, dicts

In [19]:
# Prepare the training and validation sets with the shared loader.
train_df, train_dicts = load_stuff(train_data_path)
val_df, val_dicts = load_stuff(val_data_path)

In [20]:
# Targets: the duration column of each prepared frame.
train_y = train_df['duration'].to_numpy()
val_y = val_df['duration'].to_numpy()

# Fit the vectorizer on the training records only; the validation records
# are encoded with that same fitted vectorizer.
dv = DictVectorizer()
train_X = dv.fit_transform(train_dicts)
val_X = dv.transform(val_dicts)

In [21]:
# Train on train_X / train_y: they were encoded by the same `dv` that produced
# val_X, so the feature columns of the training and validation matrices line up.
# (Fitting on X_train / y_train from the Q5 section relied on a different
# vectorizer and on leftover notebook state.)
model = LinearRegression()
model.fit(train_X, train_y)

val_y_pred = model.predict(val_X)

# RMSE on the validation data. The explicit square root avoids the
# `squared=False` argument, which newer scikit-learn releases deprecate and
# then remove.
np.sqrt(mean_squared_error(val_y, val_y_pred))

11.014283137481941

In [22]:
# Marker printed once every cell above has run.
print('all done')

all done
