# Download dataset

!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet

!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

# Homework 1

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [2]:
def read_dataframe_with_some_info(filename):
    """Load a taxi-trip file, report basic statistics and prepare it for modelling.

    The function reads a ``.csv`` or ``.parquet`` file, prints its shape, adds
    a ``duration`` column (drop-off time minus pickup time, in minutes),
    prints the standard deviation of that column, keeps only trips whose
    duration is between 1 and 60 minutes inclusive, prints the fraction of
    rows that survived the filter, and casts the pickup/drop-off location IDs
    to strings.

    Parameters
    ----------
    filename : str
        Path to the input file. It must contain the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame with the extra ``duration`` column and
        string-typed location ID columns.

    Raises
    ------
    ValueError
        If ``filename`` does not end with ``.csv`` or ``.parquet``.
    """
    pickup_col = 'tpep_pickup_datetime'
    dropoff_col = 'tpep_dropoff_datetime'

    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV stores timestamps as text. Parse the same columns the duration
        # computation below relies on (the original parsed `lpep_*` columns,
        # which never matched the `tpep_*` columns used afterwards).
        df[dropoff_col] = pd.to_datetime(df[dropoff_col])
        df[pickup_col] = pd.to_datetime(df[pickup_col])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `df`.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected .csv or .parquet"
        )

    initial_rows, initial_cols = df.shape
    print("Number of rows: ", initial_rows, "number of columns: ", initial_cols)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    df['duration'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60

    duration_std = df.duration.std()
    print("standard deviation of duration: ", duration_std)

    # `.copy()` detaches the filtered rows from the original frame so the
    # column assignment below does not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    filtered_rows = len(df)

    # Guard against an empty input file, which would otherwise divide by zero.
    if initial_rows:
        remaining_rows_percentage = filtered_rows / initial_rows
    else:
        remaining_rows_percentage = float('nan')
    print("records left percentage: ", remaining_rows_percentage)

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# Load and preprocess the January 2023 trips; this frame is used to fit the model.
train_data = read_dataframe_with_some_info(filename="yellow_tripdata_2023-01.parquet")

Number of rows:  3066766 number of columns:  19
standard deviation of duration:  42.594351241920904
records left percentage:  0.9812202822125979


In [4]:
# Modelling setup: the two location-ID columns are the features and the
# trip duration column is the regression target.
target = 'duration'
feature_cols = ['PULocationID', 'DOLocationID']
y_train = train_data.loc[:, target].values

In [5]:
# Turn each training row into a {column: value} dict and vectorize it; the
# fitted `dv` is reused later so validation data gets the same columns.
train_dicts = train_data[feature_cols].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

print('Dimensionality of matrix: ', X_train.shape[1])

lr = LinearRegression()
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
# RMSE = sqrt(MSE). The `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the root explicitly; this works
# on every version. NOTE: despite its name, `train_mse` holds the RMSE (the
# name is kept so any other cell that reads it keeps working).
train_mse = mean_squared_error(y_train, y_train_pred) ** 0.5
print("RMSE on train: ", train_mse)


Dimensionality of matrix:  515
RMSE on train:  7.649261027750482


In [6]:
# Load and preprocess the February 2023 trips; this frame is used for validation.
val_data = read_dataframe_with_some_info(filename="yellow_tripdata_2023-02.parquet")

Number of rows:  2913955 number of columns:  19
standard deviation of duration:  42.84210176105097
records left percentage:  0.9800944077722545


In [7]:
# Encode the validation rows with the vectorizer fitted on the training data
# (`transform`, not `fit_transform`) so the feature columns line up.
val_dicts = val_data[feature_cols].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = val_data[target].values
y_val_pred = lr.predict(X_val)
# RMSE = sqrt(MSE). The `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the root explicitly; this works
# on every version. NOTE: despite its name, `val_mse` holds the RMSE (the
# name is kept so any other cell that reads it keeps working).
val_mse = mean_squared_error(y_val, y_val_pred) ** 0.5
print("RMSE on val: ", val_mse)

RMSE on val:  7.811832565322446
