In [1]:
import pandas as pd
import numpy as np
from time import time
import math
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import cudf



In [2]:
# df_raw = cudf.read_csv('df_train_clean.csv').to_pandas()
df_raw = pd.read_csv('df_train_clean.csv')

In [3]:
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'distance', 'dow', 'month', 'if_night']
df_features = df_raw[features]
df_features['if_night'] = df_features['if_night'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features['if_night'] = df_features['if_night'].astype('int')


In [4]:
x = df_features.to_numpy()
y = df_raw['fare_amount'].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2784)
x_train = np.column_stack((x_train, np.ones_like(y_train)))
x_test = np.column_stack((x_test, np.ones_like(y_test)))
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(42138129, 10) (42138129,) (10534533, 10) (10534533,)


In [5]:
def evaluate(pred, target):
    return np.sqrt(np.sum((pred - target) ** 2) / len(pred))

In [6]:
from numba import jit

@jit(nopython=True, parallel = True)
def evaluate(pred, target):
    return np.sqrt(np.mean(((pred - target) ** 2)))

In [7]:
from numba import jit

@jit(nopython=True, parallel = True)
def numba_poly(x, degree):
    m, n = x.shape
    x_poly = np.ones((m, degree*n + 1))
    
    for d in range(1, degree + 1):
        x_poly[:, (d-1)*n:d*n] = np.power(x, d)
    
    return x_poly

In [8]:
# closed-form solution
def cfs(x_train, y_train, x_test, y_test):
    A = np.linalg.inv(x_train.T.dot(x_train)).dot(x_train.T).dot(y_train)
    pred_train = A @ x_train.T
    pred_test = A @ x_test.T
    return A, pred_train, pred_test

A, pred_train, pred_test = cfs(x_train, y_train, x_test, y_test)
print('train RMSE: {}, test RMSE: {}'.format(evaluate(pred_train, y_train), evaluate(pred_test, y_test)))

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


train RMSE: 3.888225323375803, test RMSE: 3.8863474694461924


In [9]:
def ridge_cfs(x_train, y_train, x_test, y_test, alpha):
    m, n = x_train.shape
    alphaI = alpha * np.identity(n)
    
    A = np.linalg.inv(x_train.T.dot(x_train) + alphaI).dot(x_train.T.dot(y_train))
    pred_train = A @ x_train.T
    pred_test = A @ x_test.T
    return A, pred_train, pred_test

### Bagging

In [10]:
num_model = 10
start_time = time()
size = x_train.shape[0] // num_model
x_train_bag = []
y_train_bag = []
for i in range(num_model):
    if i != (num_model-1):
        x_train_bag.append(x_train[i*size: (i+1)*size, :])
        y_train_bag.append(y_train[i*size: (i+1)*size])
    else:
        x_train_bag.append(x_train[i*size:, :])
        y_train_bag.append(y_train[i*size:])
print('Time:', time()-start_time)

Time: 0.0001761913299560547


In [11]:
start_time = time()
x_train_bag = np.array_split(x_train, num_model)
y_train_bag = np.array_split(y_train, num_model)
print('Time:', time()-start_time)

Time: 0.0001811981201171875


**Without Pooling**

In [12]:
start_time = time()

degrees = range(1, 5)
alphas = [0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9]
models = []
degree = []


for index, x in enumerate(x_train_bag):
    y = y_train_bag[index]
    best_model = None
    best_d = None
    rmse_min = np.inf 
    
    for i, d in enumerate(degrees):
        x_train_poly = numba_poly(x, d)
        x_test_poly = numba_poly(x_test, d)
        for j, alpha in enumerate(alphas):
            A, pred_train, pred_test = ridge_cfs(x_train_poly, y, x_test_poly, y_test, alpha)
            test_rmse = evaluate(pred_test, y_test)
            if test_rmse < rmse_min:
                best_model = A
                best_d = d
                rmse_min = test_rmse
    models.append(best_model)
    degree.append(best_d)
print('Time:', time()-start_time)

preds_train = []
preds_test = []
for i, m in tqdm(enumerate(models)):
    d = degree[i]
    x_train_poly = numba_poly(x_train, d)
    x_test_poly = numba_poly(x_test, d)
    preds_train.append(x_train_poly.dot(m))
    preds_test.append(x_test_poly.dot(m))
    
preds_train = np.array(preds_train)
preds_train = np.mean(preds_train, axis=0)
preds_test = np.array(preds_test)
preds_test = np.mean(preds_test, axis=0)
print(evaluate(preds_train, y_train), evaluate(preds_test, y_test))

Time: 63.302993297576904


10it [00:47,  4.80s/it]


3.7941910505352165 3.792306243166858


**With multiprocessing pool**

In [13]:
def build(xy):
    x = xy[:, :-1]
    y = xy[:, -1]
    best_model = None
    best_d = None
    rmse_min = np.inf
    print('-')
    for i, d in enumerate(degrees):
        x_train_poly = numba_poly(x, d)
        x_test_poly = numba_poly(x_test, d)
        for j, alpha in enumerate(alphas):
            A, pred_train, pred_test = ridge_cfs(x_train_poly, y, x_test_poly, y_test, alpha)
            test_rmse = evaluate(pred_test, y_test)
            if test_rmse < rmse_min:
                best_model = A
                best_d = d
                rmse_min = test_rmse
    return best_model, best_d


degrees = range(1, 5)
alphas = [0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9]

train_combined = np.hstack((x_train, y_train.reshape((y_train.shape[0], 1))))
num_model = 10
size = train_combined.shape[0] // num_model
xy_train_bag = np.array_split(train_combined, num_model)

from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool, cpu_count

start_time = time()
with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
    results = list(executor.map(build, tqdm(xy_train_bag)))
print('Time:', time()-start_time)

preds_train = []
preds_test = []
for i, result in tqdm(enumerate(results)):
    m = result[0]
    d = result[1]
    x_train_poly = numba_poly(x_train, d)
    x_test_poly = numba_poly(x_test, d)
    preds_train.append(x_train_poly.dot(m))
    preds_test.append(x_test_poly.dot(m))

preds_train = np.array(preds_train)
preds_train = np.mean(preds_train, axis=0)
preds_test = np.array(preds_test)
preds_test = np.mean(preds_test, axis=0)
print(evaluate(preds_train, y_train), evaluate(preds_test, y_test))

100%|██████████| 10/10 [00:00<00:00, 651.83it/s]


-
-
-
-
-
-
-
-
-
-
Time: 100.72308254241943


10it [00:50,  5.01s/it]


3.7941910505352165 3.792306243166858


**Threading & numpy optimization**

In [15]:
def build(xy):
    x = xy[:, :-1]
    y = xy[:, -1]
    A = np.linalg.inv(x.T.dot(x)).dot(x.T).dot(y)
    return A

train_combined = np.hstack((x_train, y_train.reshape((y_train.shape[0], 1))))
num_model = 100
size = x_train.shape[0] // num_model
xy_train_bag = []
for i in range(num_model):
    if i != (num_model-1):
        xy_train_bag.append(train_combined[i*size: (i+1)*size, :])
    else:
        xy_train_bag.append(train_combined[i*size:, :])

In [16]:
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool, cpu_count
start_time = time()

with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
    models = list(executor.map(build, tqdm(xy_train_bag)))

pred_train = np.mean(x_train.dot(np.array(models).T), axis=1)
pred_test = np.mean(x_test.dot(np.array(models).T), axis=1)
print(evaluate(pred_train, y_train), evaluate(pred_test, y_test))
print('Time:', time()-start_time)

100%|██████████| 100/100 [00:03<00:00, 27.88it/s]


3.8874843046946705 3.8893123081804886
Time: 42.552006006240845


**xgboost**

In [7]:
import xgboost as xgb
from multiprocessing import Pool, cpu_count
start_time = time()
dtrain = xgb.DMatrix(x_train, label=y_train)
params = {
    'objective': 'reg:squarederror',
    'max_depth': 4,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'rmse',
    'n_jobs': -1,
    'tree_method': 'gpu_hist',  # GPU acceleration
}

num_boost_round = 200
model = xgb.train(params, dtrain, num_boost_round)
print(time()-start_time)

29.77746605873108


In [8]:
dtest = xgb.DMatrix(x_test)
pred_train = model.predict(dtrain)
pred_test = model.predict(dtest)
evaluate(pred_train, y_train), evaluate(pred_test, y_test)

(3.43670333520457, 3.4361313508607387)