In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
import pickle

with open("cleaned_df_features.pkl", "rb") as f:
    df, numerics, curated_cat, other_cat = pickle.load(f)

In [3]:
# first we split trials

from sklearn.model_selection import train_test_split, KFold

# 10% for testing
[df_full_train,df_test] = train_test_split(df,test_size=0.1,random_state=42)
# 72% and 18% for train/val
[df_train,df_val] = train_test_split(df_full_train,test_size=0.2,random_state=42)
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(14514552, 13)
(3628639, 13)
(2015911, 13)


In [None]:
# very much challenged by RAM size right now
# try to not hold all data at the same time
del df

with open("df_full_train.pkl", "wb") as f:
    pickle.dump(df_full_train, f)
del df_full_train

with open("df_train.pkl", "wb") as f:
    pickle.dump(df_train, f)
del df_train
    
with open("df_val.pkl", "wb") as f:
    pickle.dump(df_val, f)
del df_val

with open("df_test.pkl", "wb") as f:
    pickle.dump(df_test, f)
del df_test

import gc
gc.collect()

In [5]:
# get response variable out

y_val = df_val.base_passenger_fare.values
y_train = df_train.base_passenger_fare.values

df_val = df_val.drop(columns=['base_passenger_fare'])
df_train = df_train.drop(columns=['base_passenger_fare'])

In [6]:
# one-hot encoding: fit from training set

from sklearn.feature_extraction import DictVectorizer
import time

# all features
dv_full = DictVectorizer(sparse=True)

# this takes some time
t0 = time.time()

# curated features only
X_full_train = dv_full.fit_transform(df_train.to_dict(orient='records'))
X_full_val = dv_full.transform(df_val.to_dict(orient='records'))

t1 = time.time()

print(f"DV took {(t1-t0)/60:5.2f} min")
print(X_full_train.shape)

MemoryError: 

In [2]:
import psutil
import gc
gc.collect()

mem = psutil.virtual_memory()
print(f"Total:     {mem.total/1e9:.2f} GB")
print(f"Available: {mem.available/1e9:.2f} GB")
print(f"Used:      {mem.used/1e9:.2f} GB")

Total:     16.88 GB
Available: 8.35 GB
Used:      8.52 GB


In [8]:
import pympler.asizeof as aso
print(aso.asizeof(dv_full) / 1e9)
print(aso.asizeof(df_train) / 1e9)
size_bytes = (
    X_full_train.data.nbytes +
    X_full_train.indices.nbytes +
    X_full_train.indptr.nbytes
)
print(size_bytes / 1e9)

6.24e-07
8.064982664


NameError: name 'X_full_train' is not defined

In [None]:
from sklearn.linear_model import SGDRegressor



l1_ratios = [0.0, 0.05, 0.1, 0.2, 0.5]
SGDalphas = [1e-5, 1e-4, 1e-3, 1e-2]

print("\n=== ElasticNet hyperparameter search ===")
SGD_results = []

for alpha in tqdm(SGDalphas, desc="SGD ElasticNet alphas"):
    for l1r in tqdm(l1_ratios, desc="l1 ratio", leave=False):
    
        model_SGD = Pipeline([
            ("scaler", StandardScaler(with_mean=False)),
            ("reg", SGDRegressor( \
                penalty="elasticnet",
                alpha=alpha,
                l1_ratio=l1r,
                max_iter=100,
                verbose=1)
            )
        ])
        
        t0 = time.time()
        model_SGD.fit(X_full_train, y_train)
    
        y_pred = model_SGD.predict(X_full_train)
        rmse_train = root_mean_squared_error(y_train, y_pred)
        
        y_pred = model_SGD.predict(X_full_val)
        rmse_val = root_mean_squared_error(y_val, y_pred)
        
        t1 = time.time()
        train_time = t1 - t0
        
        print(f"alpha={alpha:8.6f} | l1r={l1r:8.4f} | train_RMSE={rmse_train:8.4f} | val_RMSE={rmse_val:8.4f} | time={train_time/60:5.2f} min")
        SGD_results.append((alpha, l1r, rmse_train, rmse_val, train_time, model_lasso))

import pickle

with open("full_SGDEN_results.pkl", "wb") as f:
    pickle.dump((SGD_results, l1_ratios, SGDalphas), f)

best_SGD_alpha, best_SGD_l1r, best_SGD_rmse_train, best_SGD_rmse_val, best_SGD_time, best_SGD_model = min(
    SGD_results,
    key=lambda x: x[3]
)

print(f"\nBest SGD alpha: {best_SGD_alpha}")
print(f"Best SGD L1 ratio: {best_SGD_l1r:.4f}")
print(f"Best SGD training RMSE: {best_SGD_rmse_train:.4f}")
print(f"Best SGD validation RMSE: {best_SGD_rmse_val:.4f}")
print(f"Best SGD training time: {best_SGD_time/60:.2f} minutes")