In [None]:
!pip install gpytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gpytorch
  Downloading gpytorch-1.8.1-py2.py3-none-any.whl (361 kB)
[K     |████████████████████████████████| 361 kB 6.7 MB/s 
Installing collected packages: gpytorch
Successfully installed gpytorch-1.8.1


In [None]:
import math
import torch
import gpytorch
from matplotlib import pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np

In [None]:
# Synthetic demo data: 100 regularly spaced points in [0, 1] inclusive.
# NOTE(review): train_x / train_y are reassigned from the CSV data in a later
# cell before the model is constructed, so these values look unused — confirm
# and consider removing this cell.
train_x = torch.linspace(0, 1, 100)
# True function is sin(2*pi*x) plus Gaussian noise with variance 0.04 (std 0.2).
# The noise is drawn without a fixed seed, so it differs on every run.
train_y = torch.sin(train_x * (2 * math.pi)) + torch.randn(train_x.size()) * math.sqrt(0.04)

In [None]:
import pandas as pd
train = pd.read_csv('train_x.csv')
test = pd.read_csv('test_x.csv')
target = pd.read_csv('train_y.csv')

x = train.to_numpy()
t = test.to_numpy()
yy = target.to_numpy().flatten()

In [None]:
tt = torch.tensor(t, dtype=torch.float32)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, yy, test_size=0.2, random_state=42)

# torch.from_numpy keeps the dtype of the NumPy arrays, whereas the tensors
# below are explicitly float32.
# NOTE(review): xt / yt are presumably float64 — confirm before mixing them
# with the float32 tensors.
xt = torch.from_numpy(X_train)
yt = torch.from_numpy(y_train)
xtest = torch.tensor(X_test, dtype=torch.float32)
ytest = torch.tensor(y_test, dtype=torch.float32)

# Full (unsplit) data as float32 tensors; the GP model is built on these.
train_x = torch.tensor(x, dtype=torch.float32)
train_y = torch.tensor(yy, dtype=torch.float32)

In [None]:
# Exact-inference GP: the simplest kind of GP model.
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and an output-scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        # Alternative base kernel: gpytorch.kernels.MaternKernel(nu=1.5)
        base_kernel = gpytorch.kernels.RBFKernel()
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(base_kernel)

    def forward(self, x):
        """Return the multivariate normal given by the mean and kernel evaluated at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

In [None]:
# initialize likelihood and model
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(train_x, train_y, likelihood)

# train_x = train_x.cuda()
# train_y = train_y.cuda()
# model = model.cuda()
# likelihood = likelihood.cuda()

In [None]:
# Find optimal model hyperparameters
model.train()
likelihood.train()

GaussianLikelihood(
  (noise_covar): HomoskedasticNoise(
    (raw_noise_constraint): GreaterThan(1.000E-04)
  )
)

In [None]:
# Load a saved state dict onto the CPU and apply it to the model.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints that come from a trusted source.
loaded = torch.load('model_state_improved4.pth', map_location = torch.device('cpu'))
model.load_state_dict(loaded)

<All keys matched successfully>

In [None]:
# Get into evaluation (predictive posterior) mode
model.eval()
likelihood.eval()

GaussianLikelihood(
  (noise_covar): HomoskedasticNoise(
    (raw_noise_constraint): GreaterThan(1.000E-04)
  )
)

In [None]:
# Evaluate the GP on the training inputs; the mean and stddev of f_preds are
# stored as feature columns in the following cells.
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    # Output of the model itself (before the likelihood is applied).
    f_preds = model(train_x)
    # Model output passed through the likelihood. It is computed once and
    # shared by both names instead of re-running the model for each of them.
    y_preds = likelihood(f_preds)
    observed_pred = y_preds



In [None]:
n_bins = 100

In [None]:
train['mean'] = f_preds.mean.cpu().numpy()
train['std'] = f_preds.stddev.cpu().detach().numpy()

In [None]:
# Discretise lat / lon into equal-width bins on [0, 1].
# np.linspace(0, 1, n_bins) yields n_bins edges, i.e. n_bins - 1 bins, which is
# why there are n_bins - 1 labels.
bin_edges = np.linspace(0, 1, n_bins)
bin_labels = list(range(n_bins - 1))

# pd.cut uses right-closed intervals, so without include_lowest=True a
# coordinate exactly equal to 0.0 would fall outside the first bin and get NaN.
train['latbin'] = pd.cut(train.lat, bins=bin_edges, labels=bin_labels, include_lowest=True)
train['lonbin'] = pd.cut(train.lon, bins=bin_edges, labels=bin_labels, include_lowest=True)

In [None]:
train['y'] = target.values

In [None]:
# Target encoding: each row receives the mean of 'y' over all rows that share
# its latitude bin (latenc) or longitude bin (lonenc).
# NOTE(review): the means are taken over every row of `train`, including rows
# that later cells hold out with train_test_split, so validation scores of
# models using these features may be optimistic — confirm this is acceptable.
train['latenc'] = train.groupby('latbin')['y'].transform('mean')
train['lonenc'] = train.groupby('lonbin')['y'].transform('mean')

In [None]:
# NOTE(review): this writes Parquet data to a file with a '.csv' extension, so
# it must be read back with pd.read_parquet, not pd.read_csv. Consider renaming
# the file (or switching to to_csv) once all readers are known.
train.to_parquet('traineng.csv', index=False)

In [None]:
def cost_function(ground_truth: np.ndarray, predictions: np.ndarray) -> np.ndarray:
    """
    Returns the cost weight that applies to each individual prediction.

    Despite its name, this function does not return a cost. It returns the
    per-sample weight of the asymmetric cost, which the notebook uses to label
    every prediction as normal, under-predicted or over-predicted.

    :param ground_truth: Ground truth pollution levels as a 1d NumPy float array
    :param predictions: Predicted pollution levels as a 1d NumPy float array
    :return: 1d array with the same shape as the inputs, holding
        COST_W_OVERPREDICT where ``predictions >= 1.2 * ground_truth``,
        COST_W_UNDERPREDICT where ``predictions < ground_truth`` (and the
        previous case does not apply), and COST_W_NORMAL everywhere else
    """
    assert ground_truth.ndim == 1 and predictions.ndim == 1 and ground_truth.shape == predictions.shape

    # Start from the normal weight; the difference is only used to obtain an
    # array of the right shape and dtype.
    weights = np.ones_like(ground_truth - predictions) * COST_W_NORMAL

    # Case i): underprediction
    weights[predictions < ground_truth] = COST_W_UNDERPREDICT

    # Case ii): significant overprediction (assigned last, so it takes
    # precedence if both masks are true for a sample)
    weights[predictions >= 1.2 * ground_truth] = COST_W_OVERPREDICT

    return weights

In [None]:
def cost_function3(ground_truth: np.ndarray, predictions: np.ndarray) -> float:
    """
    Mean asymmetrically weighted squared error of a set of predictions.

    :param ground_truth: Ground truth values as a NumPy array
    :param predictions: Predicted values as a NumPy array of the same shape
    :return: Mean of the squared errors, each multiplied by its cost weight
    """
    squared_error = (ground_truth - predictions) ** 2

    underpredicted = predictions < ground_truth
    overpredicted = predictions >= 1.2*ground_truth

    # Every sample starts at the normal weight; the over-prediction weight is
    # written last, so it wins if both masks are true for a sample.
    penalty = np.ones_like(squared_error) * COST_W_NORMAL
    penalty[underpredicted] = COST_W_UNDERPREDICT
    penalty[overpredicted] = COST_W_OVERPREDICT

    weighted_error = squared_error * penalty
    return np.mean(weighted_error)


In [None]:
def custom_asymmetric_train(y_pred, y_true):
    """
    Custom LightGBM objective for the asymmetric squared-error cost.

    The per-sample weights follow the same rules as cost_function3:
    COST_W_OVERPREDICT where ``y_pred >= 1.2 * label``, COST_W_UNDERPREDICT
    where ``y_pred < label``, COST_W_NORMAL otherwise.

    :param y_pred: Current predictions as a 1d NumPy array
    :param y_true: Object exposing the labels via ``get_label()`` (this is how
        lgb.train passes the training data to ``fobj``)
    :return: Tuple ``(grad, hess)`` of arrays with one entry per sample
    """
    labels = y_true.get_label()
    residual = (labels - y_pred).astype("float")

    # The over-prediction test must compare against the labels; it previously
    # compared y_pred with 1.2 * y_pred, which is only true for y_pred <= 0.
    weights = np.where(y_pred < labels, COST_W_UNDERPREDICT, COST_W_NORMAL)
    weights = np.where(y_pred >= 1.2 * labels, COST_W_OVERPREDICT, weights)

    # d/d(y_pred) of weight * (label - y_pred)^2
    grad = -2 * residual * weights

    # NOTE(review): the Hessian is kept exactly as before. It does not equal
    # 2 * weights (the exact second derivative), which changes the effective
    # step size — confirm whether that is intentional before altering it.
    hess = np.where(residual < 0, 2, 2 * 1.15)
    return grad, hess

def custom_asymmetric_valid(y_pred, y_true):
    """
    Custom LightGBM evaluation metric: mean asymmetric squared-error cost.

    The per-sample weights follow the same rules as cost_function3:
    COST_W_OVERPREDICT where ``y_pred >= 1.2 * label``, COST_W_UNDERPREDICT
    where ``y_pred < label``, COST_W_NORMAL otherwise.

    :param y_pred: Predictions as a 1d NumPy array
    :param y_true: Object exposing the labels via ``get_label()`` (this is how
        lgb.train passes the evaluated data to ``feval``)
    :return: Tuple ``(metric name, metric value, is_higher_better)``; the last
        element is False because a lower cost is better
    """
    labels = y_true.get_label()
    residual = (labels - y_pred).astype("float")

    # The over-prediction test must compare against the labels; it previously
    # compared y_pred with 1.2 * y_pred, which is only true for y_pred <= 0.
    weights = np.where(y_pred < labels, COST_W_UNDERPREDICT, COST_W_NORMAL)
    weights = np.where(y_pred >= 1.2 * labels, COST_W_OVERPREDICT, weights)

    loss = weights * residual ** 2
    return "custom_asymmetric_eval", np.mean(loss), False

In [None]:
EXTENDED_EVALUATION = False
EVALUATION_GRID_POINTS = 300  # Number of grid points used in extended evaluation
EVALUATION_GRID_POINTS_3D = 50  # Number of points displayed in 3D during evaluation


# Cost function constants
COST_W_UNDERPREDICT = 25.0
COST_W_NORMAL = 1.0
COST_W_OVERPREDICT = 10.0

In [None]:
cost_function3(yy, train['mean'].to_numpy())

25.382101186882302

In [None]:
train['outcome'] = cost_function(yy, train['mean'].to_numpy())

In [None]:
def classify(x):
    """Map a cost weight to an outcome class: 25.0 -> 2, 1.0 -> 0, anything else -> 1."""
    if x == 25.0:
        return 2
    return 0 if x == 1.0 else 1

In [None]:
train['outcome'] = train['outcome'].apply(classify)

In [None]:
# 0/1 indicator columns: 'under' marks outcome class 2, 'over' marks class 1.
train['under'] = (train['outcome'] == 2).astype('int64')
train['over'] = (train['outcome'] == 1).astype('int64')

In [None]:
train[train.outcome == 1]

Unnamed: 0,lon,lat,mean,std,latbin,lonbin,y,latenc,lonenc,outcome,under,over,lightgbm1,over_predict,prediction
24,0.13750,0.11750,2.787493,4.199947,11,13,1.525908,16.237673,16.596188,1,0,1,0.968044,1,4.887466
27,0.13625,0.31375,7.384171,4.208464,31,13,6.101633,17.371968,16.596188,1,0,1,0.963469,1,9.488402
29,0.16500,0.15000,2.584280,4.155226,14,16,1.316842,15.698029,17.185169,1,0,1,0.936452,1,4.661893
38,0.01750,0.55125,4.038328,4.204961,54,1,2.677605,26.439575,27.373991,1,0,1,0.971666,1,6.140809
72,0.14500,0.03750,2.557619,4.211265,3,14,1.426685,13.389095,18.991625,1,0,1,0.939641,1,4.663251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15022,0.13750,0.92250,2.949808,4.180389,91,13,0.805654,36.853762,16.596188,1,0,1,0.966127,1,5.040003
15045,0.06500,0.10625,3.174505,4.153405,10,6,1.459521,16.685854,29.063866,1,0,1,0.967660,1,5.251208
15082,0.93875,0.66375,4.933876,4.172325,65,92,4.066125,39.946386,28.131684,1,0,1,0.957244,1,7.020038
15137,0.47125,0.98375,7.110817,4.169038,97,46,5.899046,21.547225,38.620667,1,0,1,0.972859,1,9.195336


In [None]:
train[train.outcome == 2]

Unnamed: 0,lon,lat,mean,std,latbin,lonbin,y,latenc,lonenc,outcome,under,over,lightgbm1,over_predict,prediction
0,0.85750,0.68625,35.508850,3.959634,67,84,36.203168,41.447538,36.532480,2,1,0,0.979027,1,37.488667
4,0.46875,0.11625,10.258566,4.218989,11,46,11.136404,16.237673,38.620667,2,1,0,0.959630,1,12.368061
5,0.89625,0.72875,28.533716,3.934246,72,88,29.275946,38.613180,34.766182,2,1,0,0.976839,1,30.500839
7,0.74500,0.46125,11.503793,4.186938,45,73,12.247759,27.317678,27.920998,2,1,0,0.970570,1,13.597262
8,0.38875,0.90250,54.875893,3.343747,89,38,55.417773,42.399862,40.805870,2,1,0,0.980185,1,56.547766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15183,0.34000,0.77375,57.231979,2.376330,76,33,57.536278,45.217468,43.777419,2,1,0,0.981296,1,58.420145
15184,0.39500,0.52375,28.101591,4.184958,51,39,28.158460,33.092367,41.461661,2,1,0,0.980039,1,30.194070
15185,0.29000,0.48125,35.522850,3.919003,47,28,36.928889,28.728491,43.010980,2,1,0,0.976273,1,37.482351
15186,0.13500,0.64000,35.978584,4.159695,63,13,36.641309,37.067451,16.596188,2,1,0,0.981050,1,38.058432


In [None]:
train[train.outcome == 0]

Unnamed: 0,lon,lat,mean,std,latbin,lonbin,y,latenc,lonenc,outcome,under,over,lightgbm1,over_predict,prediction
1,0.41125,0.67500,56.236191,3.079334,66,40,55.946348,40.539971,38.607726,0,0,0,0.977580,1,57.775858
2,0.86250,0.90625,12.601891,4.204191,89,85,12.042066,42.399862,38.135750,0,0,0,0.974998,1,14.703986
3,0.81125,0.81000,52.124763,3.237166,80,80,48.804589,50.701784,34.862988,0,0,0,0.980600,1,53.743347
6,0.43875,0.48500,16.022964,4.152954,48,43,15.184169,31.222883,37.886449,0,0,0,0.967388,1,18.099441
9,0.30875,0.62375,46.328178,3.890174,61,30,46.253389,35.903674,45.789751,0,0,0,0.976810,1,48.273266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15171,0.62500,0.83000,32.043709,3.964493,82,61,31.863254,42.193048,25.272235,0,0,0,0.961655,1,34.025955
15172,0.32125,0.80625,58.794563,2.590908,79,31,58.570414,50.959837,44.775352,0,0,0,0.964286,1,60.090017
15176,0.31875,0.19375,32.753857,4.150734,19,31,31.601010,31.956235,44.775352,0,0,0,0.972766,1,34.829224
15177,0.98625,0.99000,9.093245,4.195194,98,97,7.688593,26.151663,18.769895,0,0,0,0.972775,1,11.190842


In [None]:
test

Unnamed: 0,lon,lat
0,0.24250,0.88125
1,0.21000,0.91375
2,0.15750,0.89000
3,0.74000,0.49000
4,0.83250,0.28250
...,...,...
1650,0.23375,0.80125
1651,0.32625,0.87750
1652,0.59875,0.49125
1653,0.68125,0.36500


In [None]:
train.under.sum()

7557

In [None]:
train_features = train.loc[:, ['lat', 'lon', 'mean', 'std', 'latenc', 'lonenc']]

In [None]:
train.to_csv('predicted.csv', index = False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train['under'], test_size=0.2, random_state=42)

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM parameters for the binary classifier trained on train['under'].
params = {
        'objective': 'binary',
        # 'accuracy' is not a LightGBM metric name; 'binary_error'
        # (1 - accuracy) is the built-in classification-error metric.
        'metric': ['auc', 'binary_error'],
        'boosting': 'gbdt',
        'max_depth': 8,
        'seed': 42,
        'scale_pos_weight': 5,
        'num_leaves': 100,
        'max_bin': 100,
        'learning_rate': 0.01,
        # 'feature_fraction': 0.20,
        'bagging_freq': 5,
        'bagging_fraction': 0.8,
        'n_jobs': -1,
        # 'lambda_l2': 2,
        # 'min_data_in_leaf': 10
        }

In [None]:
train[['under', 'outcome']]

Unnamed: 0,under,outcome
0,1,2
1,0,0
2,0,0
3,0,0
4,1,2
...,...,...
15184,1,2
15185,1,2
15186,1,2
15187,1,2


In [None]:
# Fit the binary classifier for train['under'] on ALL rows of train_features,
# for a fixed 200 boosting rounds and without a validation set.
# NOTE(review): the rows held out by the preceding train_test_split are part of
# this training set, so scoring clf on X_test later is not an out-of-sample
# check — confirm this is intended.
lgb_train = lgb.Dataset(train_features, train.under)
# lgb_valid = lgb.Dataset(X_test, y_test)
clf = lgb.train(
    params = params,
    train_set = lgb_train,
    num_boost_round = 200,
    # valid_sets = [lgb_train, lgb_valid],
    # callbacks=[lgb.early_stopping(stopping_rounds=100)],
    # verbose_eval = 25,
    )

In [None]:
# !pip install -U imbalanced-learn

In [None]:
train['lightgbm1'] = clf.predict(train_features)

In [None]:
train[train.lightgbm1 < 0.6]

Unnamed: 0,lon,lat,mean,std,latbin,lonbin,y,latenc,lonenc,outcome,under,over,lightgbm1,over_predict,prediction
51,0.72500,0.99125,58.022491,3.622193,98,71,55.311882,26.151663,31.117236,0,0,0,0.545004,0,58.022491
308,0.04250,0.58625,16.494669,4.193117,58,4,16.043844,29.688505,19.972018,0,0,0,0.546366,1,18.591227
338,0.94500,0.99375,10.551656,4.212098,98,93,9.879269,26.151663,21.685201,0,0,0,0.514440,1,12.657705
371,0.06125,0.36375,14.677321,4.176701,36,6,14.641935,24.205007,29.063866,0,0,0,0.559777,1,16.765672
616,0.00875,0.08625,2.807066,4.170551,8,0,1.407584,13.669001,21.006022,1,0,1,0.590099,1,4.892341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14208,0.00125,0.52375,4.731562,4.215725,51,0,3.960592,33.092367,21.006022,0,0,0,0.560814,1,6.839424
14397,0.00625,0.01000,2.064196,4.185112,0,0,1.614460,14.041169,21.006022,1,0,1,0.494041,0,2.064196
14491,0.98875,0.34125,5.538363,4.204209,33,97,4.828708,21.452617,18.769895,0,0,0,0.599096,1,7.640467
14954,0.01125,0.21500,61.072037,3.691821,21,1,59.806403,33.993507,27.373991,0,0,0,0.537713,1,62.917947


In [None]:
train['over_predict'] =  train['lightgbm1'].apply(lambda x: int(x > 0.7))
train['prediction'] = train['mean'] + train['over_predict']*train['std']*0.5

In [None]:
cost_function3(yy, train['prediction'].to_numpy().flatten())

20.170209456837814

In [None]:
train_features

Unnamed: 0,lat,lon,mean,std,latenc,lonenc
0,0.68625,0.85750,35.508850,3.959634,41.447538,36.532480
1,0.67500,0.41125,56.236191,3.079334,40.539971,38.607726
2,0.90625,0.86250,12.601891,4.204191,42.399862,38.135750
3,0.81000,0.81125,52.124763,3.237166,50.701784,34.862988
4,0.11625,0.46875,10.258566,4.218989,16.237673,38.620667
...,...,...,...,...,...,...
15184,0.52375,0.39500,28.101591,4.184958,33.092367,41.461661
15185,0.48125,0.29000,35.522850,3.919003,28.728491,43.010980
15186,0.64000,0.13500,35.978584,4.159695,37.067451,16.596188
15187,0.78875,0.07125,16.108635,4.156893,49.926753,28.616813


In [None]:
# Work on a copy so that adding columns never writes into a slice of the frame
# X_test was split from.
X_test = X_test.copy()

# Predict only on the columns the classifier was trained with. This keeps the
# cell re-runnable after 'lgb' / 'over_predict' / 'prediction' have been added.
X_test['lgb'] = clf.predict(X_test[clf.feature_name()])
X_test['over_predict'] = (X_test['lgb'] > 0.7).astype('int64')

# NOTE(review): the shift is 0.25 * std here, but 0.5 * std in the cell that
# applies the same rule to `train` — confirm which factor is intended.
X_test['prediction'] = X_test['mean'] + X_test['over_predict']*X_test['std']*0.25

In [None]:
cost_function3(target.loc[y_test.index].to_numpy().flatten(), X_test['prediction'].to_numpy().flatten())

15.13451908476907

In [None]:
target.loc[y_test.index]

Unnamed: 0,pm25
13105,50.243044
4046,54.726749
14532,46.308703
2807,8.072060
5568,46.137911
...,...
10385,56.376260
9939,22.420527
1498,55.590047
9438,32.949724


In [None]:
clf.save_model('clf3.txt')

<lightgbm.basic.Booster at 0x7ffa5e9eba90>

In [None]:
X_test['over_predict'].mean()

0.9440421329822252

In [None]:
pd.DataFrame(X_test[['lat', 'lon']].to_numpy(), columns = ['lat', 'lon'])

Unnamed: 0,lat,lon
0,0.88875,0.42250
1,0.81250,0.58875
2,0.51875,0.06750
3,0.14625,0.53375
4,0.47125,0.95125
...,...,...
3033,0.78125,0.34000
3034,0.86625,0.17750
3035,0.89875,0.37750
3036,0.52000,0.04625


In [None]:
lgb.__version__

'2.2.3'

In [None]:
params = {
        'objective': 'regression',
        # 'metric': ['auc', 'accuracy'],
        'boosting': 'gbdt',
        'max_depth': 8,
        'seed': 42,
        # 'scale_pos_weight': 50,
        # 'num_leaves': 100,
        # 'max_bin': 100,
        'learning_rate': 0.001,
        # 'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.8,
        'n_jobs': -1,
        # 'lambda_l2': 2,
        # 'min_data_in_leaf': 10
        }

In [None]:
train['dist'] = (train['lat'] - train['lon'])**2

In [None]:
train_features = train.loc[:, ['lat', 'lon', 'mean', 'std', 'latenc', 'lonenc', 'lightgbm1', 'dist']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train['y'], test_size=0.2, random_state=42)

In [None]:
lgb_train = lgb.Dataset(X_train, y_train)
# lgb_train = lgb.Dataset(train_features, train['y'])
lgb_valid = lgb.Dataset(X_test, y_test)
reg = lgb.train(
    params = params,
    train_set = lgb_train,
    num_boost_round = 10000,
    valid_sets = [lgb_train, lgb_valid],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
    verbose_eval = 25,
    fobj = custom_asymmetric_train, 
    feval = custom_asymmetric_valid
    )

Training until validation scores don't improve for 100 rounds.
[25]	training's custom_asymmetric_eval: 7707.4	valid_1's custom_asymmetric_eval: 7817.89
[50]	training's custom_asymmetric_eval: 2595.84	valid_1's custom_asymmetric_eval: 2630.82
[75]	training's custom_asymmetric_eval: 890.876	valid_1's custom_asymmetric_eval: 902.269
[100]	training's custom_asymmetric_eval: 321.531	valid_1's custom_asymmetric_eval: 326.106
[125]	training's custom_asymmetric_eval: 128.744	valid_1's custom_asymmetric_eval: 132.141
[150]	training's custom_asymmetric_eval: 60.8247	valid_1's custom_asymmetric_eval: 64.1799
[175]	training's custom_asymmetric_eval: 34.1526	valid_1's custom_asymmetric_eval: 37.4176
[200]	training's custom_asymmetric_eval: 22.2886	valid_1's custom_asymmetric_eval: 25.3837
[225]	training's custom_asymmetric_eval: 16.2283	valid_1's custom_asymmetric_eval: 19.2275
[250]	training's custom_asymmetric_eval: 12.81	valid_1's custom_asymmetric_eval: 15.707
[275]	training's custom_asymmetric

In [None]:
reg.save_model('dart.txt')

In [None]:
train.to_pickle('train2.pkl')