In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Question 1

In [2]:
# Input files used in this notebook:
#   File 1: yellow_tripdata_2022-01.parquet  (January 2022 - loaded here, used for training)
#   File 2: yellow_tripdata_2022-02.parquet  (February 2022 - loaded later, used for validation)

df = pd.read_parquet("../data/yellow_tripdata_2022-01.parquet")
df.shape

(2463931, 19)

# Question 2

In [3]:
# Trip duration in minutes (drop-off minus pick-up).
# The vectorised `.dt.total_seconds()` accessor replaces the per-row
# `apply(lambda td: ...)`, which invokes a Python function once per row
# and is much slower on a frame of this size for the same result.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

In [4]:
# Summary statistics for the numeric columns, including the newly added
# `duration` column (minutes).
df.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
count,2463931.0,2392428.0,2463931.0,2392428.0,2463931.0,2463931.0,2463931.0,2463931.0,2463931.0,2463931.0,2463931.0,2463931.0,2463931.0,2463931.0,2392428.0,2392428.0,2463931.0
mean,1.707819,1.389453,5.372751,1.415507,166.0768,163.5807,1.194449,12.94648,1.00671,0.4914539,2.385696,0.3749773,0.2967234,19.16937,2.282322,0.08249935,14.2122
std,0.5021375,0.9829686,547.8714,5.917573,65.46806,70.79016,0.5001778,255.8149,1.236226,0.0829156,2.830698,1.680538,0.04374741,255.9641,0.743204,0.3125554,46.44531
min,1.0,0.0,0.0,1.0,1.0,1.0,0.0,-480.0,-4.5,-0.5,-125.22,-31.4,-0.3,-480.3,-2.5,-1.25,-3442.4
25%,1.0,1.0,1.04,1.0,132.0,113.0,1.0,6.5,0.0,0.5,0.72,0.0,0.3,11.3,2.5,0.0,6.316667
50%,2.0,1.0,1.74,1.0,162.0,162.0,1.0,9.0,0.5,0.5,2.0,0.0,0.3,14.69,2.5,0.0,10.18333
75%,2.0,1.0,3.13,1.0,234.0,236.0,1.0,14.0,2.5,0.5,3.0,0.0,0.3,20.02,2.5,0.0,16.16667
max,6.0,9.0,306159.3,99.0,265.0,265.0,5.0,401092.3,33.5,16.59,888.88,193.3,0.3,401095.6,2.5,1.25,8513.183


# Question 3

In [5]:
# Keep only trips that lasted between 1 and 60 minutes (both inclusive).
# `.copy()` makes the result an independent frame, so the later column
# assignment (`df[categorical] = ...`) does not write into a slice of the
# unfiltered frame and raise pandas' SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

In [6]:
# Row/column count after the duration filter; the extra column compared
# with the raw file is `duration`.
df.shape

(2421440, 20)

In [7]:
# Fraction of records kept by the duration filter:
# rows after filtering (2421440) / rows originally loaded (2463931).
# NOTE: both counts are copied by hand from the `df.shape` outputs above,
# so they must be updated if the input file or the filter changes.
2421440/2463931

0.9827547930522406

# Question 4

In [8]:
# Pickup / drop-off location IDs are labels rather than quantities, so
# they are stored as strings before being encoded.
categorical = ['PULocationID', 'DOLocationID']

df[categorical] = df[categorical].astype(dict.fromkeys(categorical, str))

In [9]:
# Encode the location IDs: every row becomes a {column: value} dict and the
# vectorizer learns its feature vocabulary from the training rows.
dv = DictVectorizer()

train_dicts = df[categorical].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)

In [10]:
# Display the encoded training matrix; its repr reports the shape
# (rows x feature columns) and the sparse storage format.
X_train

<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
	with 4842880 stored elements in Compressed Sparse Row format>

# Question 5

In [11]:
# Target: trip duration in minutes.
target = 'duration'
y_train = df[target].values

# Fit a plain linear regression on the encoded location features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE on the training data.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root is taken
# explicitly; this form works on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

6.986190814952337

# Question 6

In [12]:
def read_dataframe(filename):
    """Load a trip-record file and prepare it for modelling.

    Steps:
      1. Read the file (``.csv`` or ``.parquet``, chosen by extension).
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips lasting between 1 and 60 minutes (inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Parameters
    ----------
    filename : str
        Path to the input file; must end with ``.csv`` or ``.parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra ``duration`` column.

    Raises
    ------
    ValueError
        If ``filename`` has an unsupported extension. (Previously this
        surfaced as an UnboundLocalError because ``df`` was never assigned.)
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV timestamps are parsed explicitly; the parquet branch uses the
        # columns exactly as loaded.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorised conversion to minutes instead of a per-row apply().
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the
    # column assignment below does not trigger SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [13]:
# Validation set: the February 2022 file, passed through the same
# duration / filtering / string-cast steps via read_dataframe.
df_val = read_dataframe("../data/yellow_tripdata_2022-02.parquet")
df_val.shape

(2918187, 20)

In [14]:
# Feature columns for the validation set - the same two columns that were
# used to fit the vectorizer on the training data.
categorical = ['PULocationID', 'DOLocationID']

In [15]:
# Encode the validation rows with the vectorizer that was already fitted on
# the training data: `transform` (not `fit_transform`) is used so X_val gets
# the same feature columns the model was trained on.
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [16]:
# Validation target: trip duration in minutes, as a NumPy array.
target = 'duration'
y_val = df_val[target].values

In [17]:
# Predict on the validation features and report the RMSE.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root is taken
# explicitly; this form works on every scikit-learn version.
y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred) ** 0.5

7.786407163179794