In [1]:
import pandas as pd

In [2]:
!pip install pyarrow



In [69]:
df = pd.read_parquet('yellow_tripdata_2022-01.parquet')

Q1 Downloading the data Read the data for January. How many columns are there?

In [70]:
df.shape[1]

19

In [71]:
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

Q2 Computing duration
What's the standard deciation of the trips in January

In [72]:
df['duration'] = df['tpep_dropoff_datetime']-df['tpep_pickup_datetime']
df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)

In [73]:
df['duration'].std()

46.44530513776499

Q3 Dropping outliers 

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records left after you dropped the outliers?

In [74]:
df.query('duration >=1 and duration <= 60').shape[0] / df.shape[0]

0.9827547930522406

In [75]:
df = df.query('duration >=1 and duration <= 60')

Q4 One-hot encoding

In [76]:
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']

In [77]:
df[categorical] = df[categorical].astype(str)

In [78]:
train_dicts = df[categorical + numerical].to_dict(orient='records')

In [79]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()

In [80]:
X_train = dv.fit_transform(train_dicts)

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

Turn the dataframe into a list of dictionaries
Fit a dictionary vectorizer
Get a feature matrix from it
What's the dimensionality of this matrix (number of columns)?

In [81]:
X_train

<2421440x516 sparse matrix of type '<class 'numpy.float64'>'
	with 7264320 stored elements in Compressed Sparse Row format>

In [82]:
target = 'duration'
y_train = df[target].values

In [83]:
from sklearn.linear_model import LinearRegression

In [84]:
from sklearn.metrics import mean_squared_error

In [85]:
lr = LinearRegression()
lr.fit(X_train, y_train)

Q5. Training a model

In [86]:
y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

7.001496179445599

In [89]:
X_train

<2421440x516 sparse matrix of type '<class 'numpy.float64'>'
	with 7264320 stored elements in Compressed Sparse Row format>

In [90]:
X_train.shape

(2421440, 516)

In [91]:
lr.predict(X_train)

array([ 9.4096893 , 16.43549296, 13.95556019, ..., 10.45664443,
       10.29179032, 27.21893733])

Refactoring

In [93]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)
    
    df = df.query('duration >=1 and duration <= 60')
    
    categorical = ['PULocationID','DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [94]:
df_train = read_dataframe('yellow_tripdata_2022-01.parquet')
df_val = read_dataframe('yellow_tripdata_2022-02.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype(str)


In [97]:
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']

df_train[categorical] = df_train[categorical].astype(str)

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [98]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [99]:
from sklearn.metrics import mean_squared_error

In [100]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

7.001496179445599

Q6. Evaluating the model

In [101]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

7.795498122351478