In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037
...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285


# Q1

In [4]:
len(df)

1154112

# Q2

In [5]:
df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)

In [6]:
print('Average duration: {:.2f}'.format(df['duration'].mean()))

Average duration: 19.17


# Data preparation

In [7]:
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
len(df)

1109826

In [8]:
1154112 - 1109826

44286

# Q3

In [9]:
df['PUlocationID'] = df['PUlocationID'].fillna(-1)
df['DOlocationID'] = df['DOlocationID'].fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PUlocationID'] = df['PUlocationID'].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOlocationID'] = df['DOlocationID'].fillna(-1)


In [10]:
sum(df['PUlocationID'] == -1)/len(df)

0.8352732770722617

# Q4

In [11]:
categorical = ['PUlocationID', 'DOlocationID']
df[categorical] = df[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype(str)


In [12]:
train_dicts = df[categorical].to_dict(orient = 'records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [13]:
X_train.shape[1]

525

# Q5

In [14]:
target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

In [15]:
print('RMSE: {:.2f}'.format(mean_squared_error(y_train, y_pred, squared=False)))

RMSE: 10.53


# Q6

In [16]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)
    
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
    
    df['PUlocationID'] = df['PUlocationID'].fillna(-1)
    df['DOlocationID'] = df['DOlocationID'].fillna(-1)
    
    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [17]:
df_val = read_dataframe('./data/fhv_tripdata_2021-02.parquet')
val_dicts = df_val[categorical].to_dict(orient = 'records')

X_val = dv.transform(val_dicts)
y_val = df_val[target].values

y_pred_on_val = lr.predict(X_val)

In [18]:
print('RMSE: {:.2f}'.format(mean_squared_error(y_val, y_pred_on_val, squared=False)))

RMSE: 11.01
