In [17]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [18]:
df_jan = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')
df_fev = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')

print(df_jan.info())
print(df_jan.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [19]:
num_cols = len(df_jan.columns)
print(f"The dataset has {num_cols} columns.")

The dataset has 19 columns.


In [20]:
def calculate_duration(df):
  df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
  return df

def filter_data(df):
  return df.loc[(df['duration'] >= 1) & (df['duration'] <= 60)]

def prepare_data(df):
  df['PULocationID'] = df['PULocationID'].astype(str)
  df['DOLocationID'] = df['DOLocationID'].astype(str)
  return df

def create_features(df):
  return df[['PULocationID', 'DOLocationID']].apply(lambda row: row.to_dict(), axis=1).tolist()

In [21]:
df_jan = calculate_duration(df_jan.copy())
df_fev = calculate_duration(df_fev.copy())

df_jan_filtered = filter_data(df_jan)
df_fev_filtered = filter_data(df_fev)

df_jan_filtered = prepare_data(df_jan_filtered)
df_fev_filtered = prepare_data(df_fev_filtered)

dict_list = create_features(df_jan_filtered)
dict_list_val = create_features(df_fev_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PULocationID'] = df['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOLocationID'] = df['DOLocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PULocationID'] = df['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice 

In [22]:
vectorizer = DictVectorizer()

X_jan = vectorizer.fit_transform(dict_list)
y_jan = df_jan_filtered['duration']

X_val = vectorizer.transform(dict_list_val)
y_val = df_fev_filtered['duration']

model = LinearRegression()
model.fit(X_jan, y_jan)
y_pred_jan = model.predict(X_jan)
rmse_jan = mean_squared_error(y_jan, y_pred_jan, squared=False)

y_pred_val = model.predict(X_val)
rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)

In [23]:
print(f"The standard deviation of trip duration in January is {df_jan['duration'].std():.2f} minutes.")
fraction_remaining = len(df_jan_filtered) / len(df_jan)
print(f"The fraction of remaining records is {fraction_remaining:.2f}%")
print(f"The dimensionality of the feature matrix is 515 {X_jan.shape[1]} columns.")
print(f"The RMSE on the training set is {rmse_jan:.2f}")
print(f"The RMSE on the validation set is {rmse_val:.2f}")

The standard deviation of trip duration in January is 42.59 minutes.
The fraction of remaining records is 0.98%
The dimensionality of the feature matrix is 515 515 columns.
The RMSE on the training set is 7.65
The RMSE on the validation set is 7.81
