#### Retrieve Traffic Dataset

In [1]:
!wget "https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet"
!wget "https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet"

--2022-05-19 19:40:51--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.216.147.108
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.216.147.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11886281 (11M) [binary/octet-stream]
Saving to: ‘fhv_tripdata_2021-01.parquet’


2022-05-19 19:40:51 (101 MB/s) - ‘fhv_tripdata_2021-01.parquet’ saved [11886281/11886281]

--2022-05-19 19:40:51--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.216.147.108
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.216.147.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10645466 (10M) [binary/octet-stream]
Saving to: ‘fhv_tripdata_2021-02.parquet’


2022-05-19 19:40:51 (69.5 MB/s) - ‘fhv_tripdata_2021-02.parquet’ saved [10645466/10

#### Import Libraries/Packages

In [21]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer

#### Segregate train and test datasets

In [37]:
df_train=pd.read_parquet('/content/fhv_tripdata_2021-01.parquet')
df_val=pd.read_parquet('/content/fhv_tripdata_2021-02.parquet')

#### Check number of records in both datasets

In [38]:
print('Records in Jan 2021 dataset ',df_train.shape[0])
print('Records in Feb 2021 dataset ',df_val.shape[0])

Records in Jan 2021 dataset  1154112
Records in Feb 2021 dataset  1037692


### Question 1: Number of records in Jan 2021 FHV data

---

### Answer : 1154112

#### Function to calculate trip duration

In [39]:
def compute_duration(dataframe):
  dataframe['duration'] = dataframe.dropOff_datetime - dataframe.pickup_datetime
  dataframe.duration = dataframe.duration.apply(lambda td: td.total_seconds() / 60)
  return dataframe

#### Calculate duration in both datasets

In [40]:
df_train=compute_duration(df_train)
df_val=compute_duration(df_val)

#### Calculate mean duration for the January dataset

In [41]:
df_train['duration'].mean()

19.1672240937939

### Question 2: Average duration in Jan 2021 FHV


---

### Answer: 19.16


#### Identify how many outlier records exist

In [42]:
outliers = df_train[(df_train.duration < 1) | (df_train.duration > 60)]
print(outliers.shape[0])

44286


#### Function to filter rows where trip duration is greater than and equal to one minute and less than and equal to 60 minutes. Also fill rows with Nan value with -1 for PUlocationID and DOlocationID columns

In [43]:
def data_processing(dataframe):
  dataframe=dataframe[(dataframe.duration >= 1) & (dataframe.duration <= 60)]
  dataframe['PUlocationID'].fillna((-1), inplace=True)
  dataframe['DOlocationID'].fillna((-1), inplace=True)
  return dataframe

### Perform data processing

In [44]:
df_train=data_processing(df_train)
df_val=data_processing(df_val)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [45]:
print('fractions of missing values for the pickup location ID',((df_train[df_train.PUlocationID == -1.0].shape[0])/(df_train.shape[0]))*100)

fractions of missing values for the pickup location ID 83.52732770722618


### Question 3: Fraction of missing values
---
### Answer: 83% approx.




### Perform One Hot Encoding

In [53]:
categorical = ['PUlocationID', 'DOlocationID']
numerical = ['duration']

df_train[categorical] = df_train[categorical].astype(str)
dicts = df_train[categorical].to_dict(orient='records')
dv = DictVectorizer()

X_train = dv.fit_transform(dicts)
y_train = df_train['duration'].values

In [54]:
print(X_train.shape)

(1109826, 525)


### Question 4: Dimensionality after OHE


---

### Answer:525

### Train a basic Linear Regression model and calculate its RMSE 

In [52]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

rmse=mean_squared_error(y_train, y_pred, squared=False)

print(rmse)

10.528519107210744


### Question 5: RMSE on train


---

### 10.52

### Compute the RMSE on the validation dataset

In [51]:
df_val[categorical] = df_val[categorical].astype(str)
dicts_val = df_val[categorical].to_dict(orient='records')

X_val = dv.transform(dicts_val)
y_val = df_val['duration'].values

y_val_pred = lr.predict(X_val)

rmse=mean_squared_error(y_val, y_val_pred, squared=False)

print(rmse)


11.014283196111764


### Question 6: RMSE on validation


---

### Answer: 11.01
