In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Show which pandas version this notebook is running against.
pd.__version__

In [None]:
# Parquet files for the two months used below: January for training,
# February for validation.
jan_data, feb_data = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet",
)

In [None]:
# Fetch the January file from its URL and parse it with the pyarrow engine.
df = pd.read_parquet(jan_data, engine="pyarrow")

In [None]:
# Transpose the first five rows so each column of the frame is shown as a row.
df.head().transpose()

In [None]:
# Data type of every column in the loaded frame.
df.dtypes

## Question 1: How many columns are there?

In [None]:
# Column labels, number of columns, and the full (rows, columns) shape.
df.columns, df.shape[1], df.shape

# Answer is: 19

## Question 2: What's the standard deviation of the trip duration in January?

In [None]:
# Trip duration in minutes: drop-off time minus pickup time, seconds / 60.
df['duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

In [None]:
# Preview the computed duration column (values are in minutes).
df['duration']

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the durations.
df.duration.describe()

In [None]:
# Mean and standard deviation of the non-null durations, rounded for display.
durations = df.duration.dropna()
mean, std_dev = durations.mean(), durations.std()
tuple(round(stat, 2) for stat in (mean, std_dev))

# Answer is: 42.59

## Question 3: What fraction of the records is left after dropping the outliers?

In [None]:
# Share of trips whose duration lies between 1 and 60 minutes (both inclusive).
df['duration'].between(1, 60).mean()
# Answer is 98%

In [None]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df = df[df['duration'].between(1, 60)]

## Question 4: What's the dimensionality of this matrix (number of columns)?

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Columns treated as categorical features: they are cast to str and then
# encoded by the DictVectorizer further down.
categorical = [
    'PULocationID',
    'DOLocationID',
]

In [None]:
def read_dataframe(filename, categorical):
    """Load a trip-record parquet file and prepare it for modelling.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.
    categorical : list of str
        Names of the columns to convert to ``str``.

    Returns
    -------
    pandas.DataFrame
        The loaded records with an added ``duration`` column (minutes between
        ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``), restricted to
        rows where 1 <= duration <= 60, and with the ``categorical`` columns
        cast to strings.
    """
    df = pd.read_parquet(filename, engine="pyarrow")
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame they were sliced from,
    # so the column assignment below writes to an independent frame instead of
    # a slice (which would raise SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].astype(str)
    return df

In [None]:
# Build the training frame from the January file.
df_train = read_dataframe(filename=jan_data, categorical=categorical)

In [None]:
# Encode the categorical columns: each row becomes a {column: value} dict,
# which DictVectorizer turns into a sparse feature matrix.
train_dicts = df_train[categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Take the target from df_train, the same frame the features were built from,
# rather than from the separately filtered global `df`; this keeps features
# and labels aligned regardless of which earlier cells were executed.
target = 'duration'
y_train = df_train[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the training data itself (used for the train RMSE below).
y_pred = lr.predict(X_train)

# Overlay the predicted and actual training durations.
sns.histplot(y_pred, label="prediction")
sns.histplot(y_train, label="actual")
plt.legend()

In [None]:
# Number of features the fitted DictVectorizer produced, i.e. the number of
# columns in the encoded matrix.
len(dv.feature_names_)

# Answer is 515

## Question 5: What's the RMSE on train?

In [None]:
# RMSE is the square root of the mean squared error between the training
# targets and the model's predictions.
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse

# Answer is 7.64

## Question 6: What's the RMSE on validation?

In [None]:
# df_train (January) was already loaded above; February is the validation set.
df_val = read_dataframe(filename=feb_data, categorical=categorical)

In [None]:
# Fit the vectorizer on the training records only, then reuse the fitted
# vocabulary to encode the validation records.
dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions for the validation month.
y_pred = lr.predict(X_val)

# Compare validation predictions with the validation targets (y_val); the
# training targets have a different length and describe a different month.
sns.histplot(y_pred, label="prediction")
sns.histplot(y_val, label="actual")
plt.legend()

In [None]:
# RMSE is the square root of the mean squared error between the validation
# targets and the model's predictions.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse

# Answer is 7.81