In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.2.2
scikit-learn-intelex==2021.20220215.212715


In [1]:
import os
import pickle
import pandas as pd
import numpy

In [3]:
# Load the pre-trained artifacts: the pickle holds a pair that is unpacked
# into `dv` (used below via dv.transform) and `model` (used via model.predict).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load a model.bin that comes from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [4]:
# Columns that are treated as categorical features (cast to strings on load).
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip-record Parquet file and prepare it for prediction.

    Steps:
      1. Compute ``duration`` in minutes from the pickup/dropoff timestamps.
      2. Keep only rows whose duration lies in the closed range [1, 60].
      3. Replace missing values in the ``categorical`` columns with -1 and
         cast those columns to int and then to str.

    Args:
        filename: Path or URL accepted by ``pd.read_parquet``.

    Returns:
        A new DataFrame (a copy of the filtered rows) with the extra
        ``duration`` column and string-typed categorical columns.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a float number of minutes.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    within_range = trips['duration'].between(1, 60)
    trips = trips.loc[within_range].copy()

    # Missing location ids become the sentinel -1 before the int -> str cast.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [5]:
# Load and clean the yellow-taxi trip file for February 2022; the Parquet
# file is read straight from the URL by read_data.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet')

In [6]:
# One dict per row holding only the categorical columns, vectorised with the
# loaded `dv` and passed to the loaded `model`.
# NOTE(review): y_pred is presumably the predicted trip duration in minutes
# (same unit as df.duration) -- confirm against how model.bin was trained.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

# Q1:

In [7]:
# Q1 answer: standard deviation of the predictions.
# NOTE(review): 'Standatad' is a typo for 'Standard'; the message is left
# untouched so it keeps matching the recorded outputs.
print(f'Standatad deviation equals {numpy.std(y_pred)}')

Standatad deviation equals 5.28140357655334


# Q2:

In [29]:
def enrich_with_id(df: pd.DataFrame, year: int, month: int, y_pred: list):
    """Return a copy of ``df`` with ``ride_id`` and ``predictions`` columns.

    ``ride_id`` is built as ``"<YYYY>/<MM>_<index label>"`` using the frame's
    own index, so labels are whatever index values ``df`` carries.

    Args:
        df: Source frame; it is not modified.
        year: Year used in the id prefix (zero-padded to 4 digits).
        month: Month used in the id prefix (zero-padded to 2 digits).
        y_pred: Values stored in the ``predictions`` column, one per row.

    Returns:
        The enriched copy of ``df``.
    """
    id_prefix = f'{year:04d}/{month:02d}_'

    enriched = df.copy()
    index_labels = enriched.index.astype('str')
    enriched['ride_id'] = id_prefix + index_labels
    enriched['predictions'] = y_pred
    return enriched

In [31]:
# Attach ride ids (prefix 2022/02) and the predictions to the February data.
df_result = enrich_with_id(df, 2022, 2, y_pred)

In [32]:
# Preview the first rows to check the new ride_id and predictions columns.
df_result.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration,ride_id,predictions
0,1,2022-02-01 00:06:58,2022-02-01 00:19:24,1.0,5.4,1.0,N,138,252,1,...,0.5,3.9,0.0,0.3,23.45,0.0,1.25,12.433333,2022/02_0,18.527783
1,1,2022-02-01 00:38:22,2022-02-01 00:55:55,1.0,6.4,1.0,N,138,41,2,...,0.5,0.0,6.55,0.3,30.1,0.0,1.25,17.55,2022/02_1,23.065782
2,1,2022-02-01 00:03:20,2022-02-01 00:26:59,1.0,12.5,1.0,N,138,200,2,...,0.5,0.0,6.55,0.3,44.6,0.0,1.25,23.65,2022/02_2,33.686359
3,2,2022-02-01 00:08:00,2022-02-01 00:28:05,1.0,9.88,1.0,N,239,200,2,...,0.5,0.0,3.0,0.3,34.8,2.5,0.0,20.083333,2022/02_3,23.757436
4,2,2022-02-01 00:06:48,2022-02-01 00:33:07,1.0,12.16,1.0,N,138,125,1,...,0.5,8.11,0.0,0.3,48.66,2.5,1.25,26.316667,2022/02_4,21.492904


In [33]:
# Keep only the two columns that go into the output file.
# NOTE(review): `df_results` differs from `df_result` by a single letter --
# easy to mix up when editing cells.
df_results = df_result[['ride_id', 'predictions']]

In [34]:
def save_to_parquet(data, output_file: str, path_to_output_dir: str='output') -> None:
    """Write ``data`` to ``<path_to_output_dir>/<output_file>`` as Parquet.

    The file is written with the pyarrow engine, without compression and
    without the index. The output directory is created first when it does not
    exist, so the call also works on a fresh checkout.

    Args:
        data: Object exposing ``to_parquet`` (e.g. a pandas DataFrame).
        output_file: Name of the file to create inside the output directory.
        path_to_output_dir: Directory that receives the file. An empty string
            means "do not prepend a directory".

    Returns:
        None.
    """
    # os.makedirs('') raises, so only create the directory when one is given.
    if path_to_output_dir:
        os.makedirs(path_to_output_dir, exist_ok=True)

    destination = os.path.join(path_to_output_dir, output_file)
    data.to_parquet(
        destination,
        engine='pyarrow',
        compression=None,
        index=False)

In [36]:
# Write the ride_id/predictions frame using the default output directory
# ('output').
save_to_parquet(df_results, 'yellow_tripdata_2022-02_with_predictions.parquet')

In [40]:
!ls -alh ./output

total 58M
drwxrwxr-x 2 taras taras 4,0K Jun 15 18:01 .
drwxrwxr-x 4 taras taras 4,0K Jun 15 18:03 ..
-rw-rw-r-- 1 taras taras  58M Jun 15 18:04 yellow_tripdata_2022-02_with_predictions.parquet


The size is 58M

Q3: Convert to a python script: 
```bash
jupyter nbconvert --to script starter.ipynb
```

Q4: What's the first hash for the Scikit-Learn dependency?

In [45]:
!grep "scikit-learn" -A 2 Pipfile.lock

        "scikit-learn": {
            "hashes": [
                "sha256:065e9673e24e0dc5113e2dd2b4ca30c9d8aa2fa90f4c0597241c93b63130d233",


Q5: Run the script for March 2022. What is the mean of predictions?

In [2]:
!python starter.py --path_to_pickle model.bin --year 2022 --month 3

Standatad deviation equals 5.556602554785001
Mean of predictions 12.758556818790902
