In [1]:
# Q1. Downloading the data
# We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records".
# Download the data for January and February 2024.
# Read the data for January. How many columns are there?

import pandas as pd

# January 2024 Yellow Taxi trips, read straight from the remote parquet file.
january_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet'
df = pd.read_parquet(january_url)

# .shape is (rows, columns); the second entry answers the question.
df.shape[1]

19

In [2]:
# Q2. Computing duration
# Now let's compute the duration variable. It should contain the duration of a ride in minutes.
# What's the standard deviation of the trips duration in January?

# Drop-off minus pick-up gives a timedelta per trip; convert it to minutes.
trip_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df['duration'] = trip_length.dt.total_seconds() / 60

# Standard deviation of the trip duration (in minutes) before any filtering.
df['duration'].std()

34.851053592192876

In [3]:
# Q3. Dropping outliers
# Next, we need to check the distribution of the duration variable. There are some outliers.
# Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).
# What fraction of the records is left after dropping the outliers?

# between() is inclusive on both ends by default, i.e. 1 <= duration <= 60.
is_valid_duration = df.duration.between(1, 60)

# Share of rows that survive the filter, expressed as a percentage.
len(df[is_valid_duration]) / len(df) * 100

97.78326020432945

In [4]:
# Keep only trips lasting 1 to 60 minutes (both bounds inclusive); .copy() detaches
# the result from the unfiltered frame so later column assignments are safe.
df = df[df.duration.between(1, 60)].copy()

In [5]:
# Q4. One-hot encoding
# Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.
# Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
# Fit a dictionary vectorizer
# Get a feature matrix from it
# What's the dimensionality of this matrix (number of columns)?

from sklearn.feature_extraction import DictVectorizer

# The two categorical columns used as model features.
target_columns = ['PULocationID', 'DOLocationID']

# Cast the IDs to strings so the vectorizer treats them as categories,
# then turn every row into a {column: value} dictionary.
df[target_columns] = df[target_columns].astype(str)
data_dicts = df[target_columns].to_dict(orient='records')

# Fit the vectorizer on the training rows and build the feature matrix in one step.
vectorizer = DictVectorizer()
feature_matrix = vectorizer.fit_transform(data_dicts)

# Report the (rows, columns) shape of the feature matrix.
print(f'Feature matrix size: {feature_matrix.shape}')


Feature matrix size: (2898906, 518)


In [6]:
# Q5. Training a model
# Now let's use the feature matrix from the previous step to train a model.
# Train a plain linear regression model with default parameters, where duration is the response variable
# Calculate the RMSE of the model on the training data
# What's the RMSE on train?

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Response variable: trip duration in minutes, as a NumPy array.
target_column = 'duration'
labels_train = df[target_column].values

# Fit a plain linear regression (default parameters) on the one-hot feature matrix.
linear_model = LinearRegression()
linear_model.fit(feature_matrix, labels_train)

# Predict on the same rows the model was trained on.
predictions_train = linear_model.predict(feature_matrix)

# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the root explicitly; this works on every scikit-learn version.
train_rmse = mean_squared_error(labels_train, predictions_train) ** 0.5
print(f'Train RMSE: {train_rmse}')

Train RMSE: 7.946173359625103


In [7]:
# Q6. Evaluating the model
# Now let's apply this model to the validation dataset (February 2024).
# What's the RMSE on validation?

# Load the February data and apply the same preprocessing as for training:
# duration in minutes, then keep only trips lasting 1 to 60 minutes (inclusive).
validation_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet')
validation_df['duration'] = validation_df.tpep_dropoff_datetime - validation_df.tpep_pickup_datetime
validation_df['duration'] = validation_df.duration.dt.total_seconds() / 60
validation_df = validation_df[(validation_df.duration >= 1) & (validation_df.duration <= 60)].copy()

# Encode the location IDs exactly as in training (strings -> list of dicts).
target_columns = ['PULocationID', 'DOLocationID']
validation_df[target_columns] = validation_df[target_columns].astype(str)
validation_data_dicts = validation_df[target_columns].to_dict(orient='records')

# Reuse the vectorizer fitted on the training data; do NOT refit it, so the
# columns line up with what the model was trained on. Location IDs that did not
# occur during fitting are silently ignored by DictVectorizer.transform.
validation_feature_matrix = vectorizer.transform(validation_data_dicts)

# Ground-truth durations for the validation rows.
target_column = 'duration'
validation_labels = validation_df[target_column].values

# Predict with the already trained linear model.
validation_pred = linear_model.predict(validation_feature_matrix)

# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the root explicitly; this works on every scikit-learn version.
validation_rmse = mean_squared_error(validation_labels, validation_pred) ** 0.5
print(f'Validation RMSE: {validation_rmse}')





Validation RMSE: 8.12338297606338
