# Homework - Week 1

In [20]:
# Load dependencies
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Import paths
from config import PATH_DATA_FHV_JAN, PATH_DATA_FHV_FEB

In [10]:
# Read the January trip records from the configured parquet path
df = pd.read_parquet(path=PATH_DATA_FHV_JAN)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [11]:
# Row count of the raw frame, kept in `old_len` before any filtering is applied
old_len = df.shape[0]
print(f"Number of records in the dataset is {old_len}")

Number of records in the dataset is 1154112


In [12]:
# Trip duration in minutes: (drop-off - pickup) as seconds, divided by 60
df['duration'] = (
    (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60
)

# Mean trip duration over every record (no filtering yet)
print(f"Average duration is {df.duration.mean()}")

Average duration is 19.167224093791006


In [13]:
# Missing values per column
display(df.isnull().sum())

# Transform null values into -1.
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on `df.<column>`: an inplace call on a column selected from the frame is a
# chained assignment, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is active.
df['PUlocationID'] = df['PUlocationID'].fillna(-1)
df['DOlocationID'] = df['DOlocationID'].fillna(-1)

# Calculate the fraction of missing values: after the fill above, a missing
# pickup location is exactly a row whose PUlocationID equals -1
fraction = (len(df[df.PUlocationID==-1]) / len(df))*100
print(f"Fractions of missing values for the pickup location ID is {fraction}%")

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               958267
DOlocationID               162220
SR_Flag                   1154112
Affiliated_base_number        885
duration                        0
dtype: int64

Fractions of missing values for the pickup location ID is 83.03067639882438%


In [14]:
# Data preparation: keep trips lasting between 1 and 60 minutes (inclusive)
df = df[df.duration.between(1, 60)].copy()

# The two location IDs are the categorical features.
# Missing IDs become -1; the int cast drops the float ".0" suffix and the
# str cast turns each ID into a string value for the vectorizer.
categorical = ['PUlocationID', 'DOlocationID']
df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')

# Turn every row into a {column: value} dict and fit the vectorizer on them
dv = DictVectorizer()
train_dicts = df[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
print(f"Shape of X_train is {X_train.shape}")

# Target: trip duration in minutes
y_train = df.duration.values
print(f"Number of features {len(dv.feature_names_)}")

Shape of X_train is (1109826, 525)
Number of features 525


In [15]:
# Fit a linear regression on the vectorized training features and durations
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

In [16]:
# Make predictions on the training data itself
y_pred = lin_reg.predict(X_train)

# Evaluate the model performance with RMSE (same unit as the target: minutes).
# The square root is taken explicitly rather than passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# original call raises a TypeError on current releases.
mean_squared_error(y_train, y_pred) ** 0.5

10.5285194284187

In [21]:
def read_data(filename):
    """
    Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column (minutes between ``pickup_datetime`` and
    ``dropOff_datetime``), keeps only trips lasting from 1 to 60 minutes
    inclusive, and converts the pickup/drop-off location IDs to strings,
    with missing IDs encoded as ``'-1'``.

    Args:
        filename(str): File path for the dataset

    Returns:
        pd.DataFrame: Filtered copy of the data, including ``duration``
    """
    location_cols = ['PUlocationID', 'DOlocationID']

    # Load the dataset
    trips = pd.read_parquet(filename)

    # Trip length in minutes
    elapsed = trips.dropOff_datetime - trips.pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep trips between 1 and 60 minutes (both bounds included)
    trips = trips[trips.duration.between(1, 60)].copy()

    # Missing IDs -> -1, drop the float decimals, then store as text
    filled_ids = trips[location_cols].fillna(-1)
    trips[location_cols] = filled_ids.astype('int').astype('str')

    return trips

In [22]:
# Validation set: the February file, run through the read_data preparation
df_val = read_data(filename=PATH_DATA_FHV_FEB)

In [23]:
# Target for validation: trip duration in minutes
y_val = df_val['duration'].values

# Features: reuse the vectorizer fitted on the training data (transform only,
# no refit) so X_val has the same columns as X_train
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [24]:
# Make predictions by using the trained model
y_pred = lin_reg.predict(X_val)

# Evaluate the model on validation data with RMSE (minutes).
# The square root is taken explicitly rather than passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# original call raises a TypeError on current releases.
mean_squared_error(y_val, y_pred) ** 0.5

11.014285518755779