In [None]:
!pip install gpytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gpytorch
  Downloading gpytorch-1.8.1-py2.py3-none-any.whl (361 kB)
[K     |████████████████████████████████| 361 kB 4.8 MB/s 
Installing collected packages: gpytorch
Successfully installed gpytorch-1.8.1


In [None]:
import math
import torch
import gpytorch
from gpytorch.kernels import *
from matplotlib import pyplot as plt
from itertools import product

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np

In [None]:
# NOTE(review): synthetic toy data. train_x / train_y are reassigned from the CSV-based
# split in a later cell, so on a top-to-bottom run the model is not built from these values.
# Training data is 100 points in [0,1] inclusive regularly spaced
train_x = torch.linspace(0, 1, 100)
# True function is sin(2*pi*x) with Gaussian noise (standard deviation sqrt(0.04) = 0.2)
train_y = torch.sin(train_x * (2 * math.pi)) + torch.randn(train_x.size()) * math.sqrt(0.04)

In [None]:
import pandas as pd
# Read the feature and target tables from CSV files in the working directory.
train = pd.read_csv('train_x.csv')
test = pd.read_csv('test_x.csv')
target = pd.read_csv('train_y.csv')

# NumPy versions of the tables, consumed by the split / tensor-conversion cells.
x = train.to_numpy()
t = test.to_numpy()
# The target table is flattened into a 1-D array.
yy = target.to_numpy().flatten()

In [None]:
tt = torch.tensor(t, dtype=torch.float32)

In [None]:
from sklearn.model_selection import *

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, yy, test_size=0.2, random_state=42)

# from_numpy keeps the NumPy dtype and shares memory with the source arrays.
xt = torch.from_numpy(X_train)
yt = torch.from_numpy(y_train)
xtest = torch.tensor(X_test, dtype=torch.float32)
ytest = torch.tensor(y_test, dtype=torch.float32)

# Independent float32 copies used to build the GP model.
# torch.tensor(<Tensor>) also copies but emits a UserWarning; clone().detach()
# is the recommended way to copy-construct from an existing tensor.
train_x = xt.clone().detach().to(torch.float32)
train_y = yt.clone().detach().to(torch.float32)

In [None]:
# Exact-inference GP regression model (the simplest GPyTorch model type)
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a scaled ``RBF * Linear + Matern`` kernel."""

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()

        # The RBF factor is created with NormalPrior(30, 15) on its lengthscale.
        rbf = RBFKernel(lengthscale_prior=gpytorch.priors.NormalPrior(30, 15))
        base_kernel = rbf * LinearKernel() + MaternKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(base_kernel)

    def forward(self, x):
        """Build the multivariate normal for ``x`` from the mean and kernel modules."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

In [None]:
# initialize likelihood and model
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(train_x, train_y, likelihood)

# train_x = train_x.cuda()
# train_y = train_y.cuda()
# model = model.cuda()
# likelihood = likelihood.cuda()

In [None]:
# # Find optimal model hyperparameters
# model.train()
# likelihood.train()

In [None]:
# Restore a previously saved state dict; map_location places the loaded tensors on the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
loaded = torch.load('model_state_scaledextra.pth', map_location = torch.device('cpu'))
model.load_state_dict(loaded)

<All keys matched successfully>

In [None]:
# Get into evaluation (predictive posterior) mode
model.eval()
likelihood.eval()

GaussianLikelihood(
  (noise_covar): HomoskedasticNoise(
    (raw_noise_constraint): GreaterThan(1.000E-04)
  )
)

In [None]:
# Posterior predictions at the training inputs (train_x), with gradient tracking disabled.
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    # Latent-function posterior: evaluate the GP once and reuse the result below
    # instead of calling model(train_x) three times.
    f_preds = model(train_x)
    # Feeding the latent posterior through the likelihood gives the predictive
    # distribution for observations.
    observed_pred = likelihood(f_preds)
    # Same distribution under the second name the original cell defined.
    y_preds = observed_pred



In [None]:
# Attach the latent posterior mean / standard deviation and the observed targets to the frame.
# NOTE(review): f_preds is evaluated on train_x, which an earlier cell builds from the 80%
# part of train_test_split, while `train` and `yy` are read from the full CSV files. The
# lengths look inconsistent, so this cell presumably relied on different kernel state —
# TODO confirm on a fresh Restart & Run All.
train['mean'] = f_preds.mean.cpu().numpy()
train['std'] = f_preds.stddev.cpu().detach().numpy()
train['y'] = yy

In [None]:
# NOTE(review): `train2` is read here before any visible cell assigns it; on a fresh kernel
# this line would raise NameError unless it is defined elsewhere — TODO confirm its origin.
train2 = train2.copy()
# Predictions are made at the training inputs train_x (not on a regular grid).
# Make predictions by feeding model through likelihood
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    f_preds = model(train_x)
    observed_pred = likelihood(model(train_x))

# These columns come from the likelihood output (observed_pred), whereas the cell that
# fills `train` uses the latent posterior f_preds.
train2['mean'] = observed_pred.mean.cpu().numpy()
train2['std'] = observed_pred.stddev.cpu().detach().numpy()

In [None]:
train = pd.concat([train, train2])

In [None]:
# NOTE: n_bins is the number of bin EDGES on [0, 1]; it yields n_bins - 1 = 19 intervals.
n_bins = 20
bin_edges = np.linspace(0, 1, n_bins)
bin_labels = list(range(n_bins - 1))

# Discretise both coordinates with the same grid. include_lowest=True makes the first
# interval closed on the left, so a coordinate of exactly 0 lands in bin 0 instead of NaN.
for coord in ('lat', 'lon'):
    train[coord + 'bin'] = pd.cut(train[coord],
                                  bins = bin_edges,
                                  labels = bin_labels,
                                  include_lowest = True)

In [None]:
# Row count per (latbin, lonbin) grid cell; reset_index leaves the count in a column named 0.
traing = train.groupby(['latbin', 'lonbin']).size().reset_index()
# Join the two bin ids with a separator: plain concatenation is ambiguous
# (bins (1, 11) and (11, 1) would both produce '111').
traing['cats'] = traing['latbin'].astype(str) + '_' + traing['lonbin'].astype(str)
traing

Unnamed: 0,latbin,lonbin,0,cats
0,0,0,102,00
1,0,1,87,01
2,0,2,96,02
3,0,3,84,03
4,0,4,93,04
...,...,...,...,...
356,18,14,78,1814
357,18,15,63,1815
358,18,16,54,1816
359,18,17,48,1817


In [None]:
categories = traing['cats']

In [None]:
# Grid-cell key built from both bin ids. The separator keeps the key unique: without it
# bins (1, 11) and (11, 1) would both become '111' and be pooled in the per-cell encoding.
train['cats'] = train['latbin'].astype(str) + '_' + train['lonbin'].astype(str)

In [None]:
train['latenc'] = train.groupby('latbin')['y'].transform('mean')
train['lonenc'] = train.groupby('lonbin')['y'].transform('mean')
train['catenc'] = train.groupby('cats')['y'].transform('mean')

In [None]:
def cost_function(ground_truth: np.ndarray, predictions: np.ndarray) -> np.ndarray:
    """
    Computes the cost WEIGHT assigned to each prediction.

    Despite its name, this function does not return a cost: it returns the
    per-sample weights only (neither multiplied by the squared error nor averaged).

    :param ground_truth: Ground truth pollution levels as a 1d NumPy float array
    :param predictions: Predicted pollution levels as a 1d NumPy float array
    :return: 1d array with the same shape as the inputs, holding COST_W_NORMAL,
        COST_W_UNDERPREDICT or COST_W_OVERPREDICT for each prediction
    """
    assert ground_truth.ndim == 1 and predictions.ndim == 1 and ground_truth.shape == predictions.shape

    # Squared error; below it only serves as the shape/dtype template for the weights
    cost = (ground_truth - predictions) ** 2
    weights = np.ones_like(cost) * COST_W_NORMAL

    # Case i): underprediction
    mask_1 = predictions < ground_truth
    weights[mask_1] = COST_W_UNDERPREDICT

    # Case ii): significant overprediction (assigned last, so it wins where both masks match)
    mask_2 = (predictions >= 1.2*ground_truth)
    weights[mask_2] = COST_W_OVERPREDICT

    # Return the per-sample weights; the cost itself is not aggregated here
    return weights

In [None]:
def cost_function3(ground_truth: np.ndarray, predictions: np.ndarray) -> float:
    """
    Mean asymmetrically weighted squared error of a set of predictions.

    :param ground_truth: Ground truth values as a NumPy array
    :param predictions: Predicted values, same shape as ``ground_truth``
    :return: Average of ``weight * (ground_truth - predictions) ** 2``
    """
    squared_error = (ground_truth - predictions) ** 2

    # Start from the normal weight everywhere, then overwrite the special cases.
    # The overprediction rule is written last, so it takes precedence on overlap.
    sample_weights = np.ones_like(squared_error) * COST_W_NORMAL
    sample_weights[predictions < ground_truth] = COST_W_UNDERPREDICT
    sample_weights[predictions >= 1.2 * ground_truth] = COST_W_OVERPREDICT

    return np.mean(squared_error * sample_weights)


In [None]:
def custom_asymmetric_train(y_pred, y_true):
    """
    Custom LightGBM objective: gradient and Hessian of the asymmetric squared error.

    The per-sample loss is ``w * (label - y_pred) ** 2`` with
    ``w = COST_W_UNDERPREDICT`` where ``y_pred < label``,
    ``w = COST_W_OVERPREDICT`` where ``y_pred >= 1.2 * label`` (this rule wins
    where both apply) and ``w = COST_W_NORMAL`` otherwise, i.e. the same
    weighting that ``cost_function3`` evaluates.

    :param y_pred: Current predictions as a 1d NumPy array
    :param y_true: Object exposing ``get_label()`` (the dataset handed over by ``lgb.train``)
    :return: Tuple ``(grad, hess)`` of 1d float arrays, derivatives w.r.t. ``y_pred``
    """
    labels = y_true.get_label()
    residual = (labels - y_pred).astype("float")

    # The overprediction mask must compare against the labels; it previously compared
    # y_pred with 1.2 * y_pred, which only fires for non-positive predictions.
    weights = np.where(y_pred < labels, COST_W_UNDERPREDICT, COST_W_NORMAL)
    weights = np.where(y_pred >= 1.2 * labels, COST_W_OVERPREDICT, weights)

    # d/dpred of w * (label - pred)^2 is -2 * w * (label - pred); the second derivative is 2 * w.
    grad = -2.0 * weights * residual
    hess = 2.0 * weights
    return grad, hess

def custom_asymmetric_valid(y_pred, y_true):
    """
    Custom LightGBM evaluation metric: mean asymmetrically weighted squared error.

    Uses the same weights as ``cost_function3``: COST_W_UNDERPREDICT where
    ``y_pred < label``, COST_W_OVERPREDICT where ``y_pred >= 1.2 * label``
    (this rule wins where both apply) and COST_W_NORMAL otherwise.

    :param y_pred: Predictions as a 1d NumPy array
    :param y_true: Object exposing ``get_label()`` (the dataset handed over by ``lgb.train``)
    :return: Tuple ``(metric_name, metric_value, is_higher_better)``
    """
    labels = y_true.get_label()
    residual = (labels - y_pred).astype("float")

    # The overprediction mask must compare against the labels; it previously compared
    # y_pred with 1.2 * y_pred, which only fires for non-positive predictions.
    weights = np.where(y_pred < labels, COST_W_UNDERPREDICT, COST_W_NORMAL)
    weights = np.where(y_pred >= 1.2 * labels, COST_W_OVERPREDICT, weights)

    loss = weights * residual ** 2
    # False: lower values of this metric are better.
    return "custom_asymmetric_eval", np.mean(loss), False

In [None]:
EXTENDED_EVALUATION = False
EVALUATION_GRID_POINTS = 300  # Number of grid points used in extended evaluation
EVALUATION_GRID_POINTS_3D = 50  # Number of points displayed in 3D during evaluation


# Cost function constants
COST_W_UNDERPREDICT = 25.0
COST_W_NORMAL = 1.0
COST_W_OVERPREDICT = 10.0

In [None]:
yy = train.y.values

In [None]:
train['outcome'] = cost_function(yy, train['mean'].to_numpy())

In [None]:
def classify(x):
    """Map a cost weight to a class id: 1.0 -> 0, 25.0 -> 2, any other value -> 1."""
    if x == 1.0:
        return 0
    return 2 if x == 25.0 else 1

In [None]:
train['outcome'] = train['outcome'].apply(classify)

In [None]:
# 0/1 indicator columns derived from the outcome class (2 marks 'under', 1 marks 'over').
train['under'] = (train['outcome'] == 2).astype(int)
train['over'] = (train['outcome'] == 1).astype(int)

In [None]:
train[train.outcome == 1]

Unnamed: 0,lon,lat,mean,std,y,latbin,lonbin,latenc,lonenc,cats,catenc,outcome,under,over
24,0.13750,0.11750,2.357113,8.654955,1.525908,2,2,16.564155,19.687086,22,2.906833,1,0,1
29,0.16500,0.15000,2.157019,8.515218,1.316842,2,3,16.564155,25.500984,23,3.140791,1,0,1
38,0.01750,0.55125,3.529221,8.510155,2.677605,10,0,27.948769,24.355655,100,6.289975,1,0,1
72,0.14500,0.03750,2.267813,8.309195,1.426685,0,2,12.966999,19.687086,02,2.756900,1,0,1
126,0.14000,0.02625,2.689135,8.432120,1.624092,0,2,12.966999,19.687086,02,2.756900,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14980,0.07000,0.10125,3.942963,8.565783,2.370336,1,1,13.436385,28.382257,11,3.139628,1,0,1
15005,0.24625,0.26000,8.804585,8.796293,6.553803,4,4,30.958405,38.966694,44,11.127302,1,0,1
15022,0.13750,0.92250,2.771095,8.403680,0.805654,17,2,39.097411,19.687086,172,7.068823,1,0,1
15045,0.06500,0.10625,2.422272,8.544442,1.459521,2,1,16.564155,28.382257,21,3.092241,1,0,1


In [None]:
train[train.outcome == 2]

Unnamed: 0,lon,lat,mean,std,y,latbin,lonbin,latenc,lonenc,cats,catenc,outcome,under,over
0,0.85750,0.68625,36.018692,7.518279,36.203168,13,16,39.766453,37.665178,1316,48.322336,2,1,0
4,0.46875,0.11625,10.425234,8.856779,11.136404,2,8,16.564155,38.924089,28,15.470082,2,1,0
5,0.89625,0.72875,29.084921,7.651525,29.275946,13,17,39.766453,29.201009,1317,19.078740,2,1,0
7,0.74500,0.46125,11.827122,8.789547,12.247759,8,14,25.177741,29.739814,814,9.725168,2,1,0
10,0.85125,0.84875,16.906811,8.597893,17.516266,16,16,43.728037,37.665178,1616,13.690483,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15183,0.34000,0.77375,57.507629,4.879779,57.536278,14,6,44.063324,43.140775,146,52.261886,2,1,0
15184,0.39500,0.52375,27.922565,8.720506,28.158460,9,7,31.817124,39.897288,97,22.172720,2,1,0
15185,0.29000,0.48125,35.377037,7.466801,36.928889,9,5,31.817124,43.755993,95,46.749843,2,1,0
15186,0.13500,0.64000,36.303581,8.142138,36.641309,12,2,39.224492,19.687086,122,33.579195,2,1,0


In [None]:
train[train.outcome == 0]

Unnamed: 0,lon,lat,mean,std,y,latbin,lonbin,latenc,lonenc,cats,catenc,outcome,under,over
1,0.41125,0.67500,57.037819,5.435280,55.946348,12,7,39.224492,39.897288,127,50.897888,0,0,0
2,0.86250,0.90625,12.339543,9.084759,12.042066,17,16,39.097411,37.665178,1716,9.830223,0,0,0
3,0.81125,0.81000,49.940948,6.032210,48.804589,15,15,48.348403,35.457035,1515,50.775624,0,0,0
6,0.43875,0.48500,15.752648,8.428197,15.184169,9,8,31.817124,38.924089,98,19.250741,0,0,0
8,0.38875,0.90250,55.758003,5.874866,55.417773,17,7,39.097411,39.897288,177,48.306537,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15170,0.02375,0.95000,6.740692,9.059623,6.674442,18,0,26.252751,24.355655,180,6.248843,0,0,0
15176,0.31875,0.19375,31.988503,8.395994,31.601010,3,6,26.640901,43.140775,36,22.800073,0,0,0
15177,0.98625,0.99000,8.390464,9.344408,7.688593,18,18,26.252751,21.532326,1818,10.353756,0,0,0
15179,0.85125,0.74875,56.876602,5.609599,56.422804,14,16,44.063324,37.665178,1416,52.384893,0,0,0


In [None]:
test

Unnamed: 0,lon,lat
0,0.24250,0.88125
1,0.21000,0.91375
2,0.15750,0.89000
3,0.74000,0.49000
4,0.83250,0.28250
...,...,...
1650,0.23375,0.80125
1651,0.32625,0.87750
1652,0.59875,0.49125
1653,0.68125,0.36500


In [None]:
train.under.sum()

22833

In [None]:
train_features = train.loc[:, ['lat', 'lon', 'mean', 'std', 'catenc']]

In [None]:
train.to_csv('predicted.csv', index = False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train['under'], test_size=0.2, random_state=42)

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM settings for the binary under-/over-prediction classifiers.
params = {
        'objective': 'binary',
        # 'accuracy' is not a LightGBM metric name; 'binary_error' (1 - accuracy) is its equivalent.
        'metric': ['auc', 'binary_error'],
        'boosting': 'gbdt',
        'max_depth': 8,
        'seed': 42,
        'scale_pos_weight': 5,
        'num_leaves': 100,
        'max_bin': 100,
        'learning_rate': 0.01,
        # 'feature_fraction': 0.20,
        'bagging_freq': 5,
        'bagging_fraction': 0.8,
        'n_jobs': -1,
        # 'lambda_l2': 2,
        # 'min_data_in_leaf': 10
        }

In [None]:
train[['under', 'outcome']]

Unnamed: 0,under,outcome
0,1,2
1,0,0
2,0,0
3,0,0
4,1,2
...,...,...
15184,1,2
15185,1,2
15186,1,2
15187,1,2


In [None]:
# Fit the under-prediction classifier on every row of train_features for 200 boosting rounds.
# NOTE(review): this includes the rows that the preceding train_test_split put into X_test,
# so scores computed on X_test further down are in-sample rather than held out.
lgb_train = lgb.Dataset(train_features, train.under)
# lgb_valid = lgb.Dataset(X_test, y_test)
clf = lgb.train(
    params = params,
    train_set = lgb_train,
    num_boost_round = 200,
    # valid_sets = [lgb_train, lgb_valid],
    # callbacks=[lgb.early_stopping(stopping_rounds=100)],
    # verbose_eval = 25,
    )

In [None]:
# Same features, parameters and round count as the under-prediction classifier, but with
# the over-prediction indicator as the label. Note that lgb_train is rebound here.
lgb_train = lgb.Dataset(train_features, train.over)
# lgb_valid = lgb.Dataset(X_test, y_test)
clfover = lgb.train(
    params = params,
    train_set = lgb_train,
    num_boost_round = 200,
    # valid_sets = [lgb_train, lgb_valid],
    # callbacks=[lgb.early_stopping(stopping_rounds=100)],
    # verbose_eval = 25,
    )

In [None]:
# !pip install -U imbalanced-learn

In [None]:
train['lightgbm1'] = clf.predict(train_features)

In [None]:
train['lightgbm2'] = clfover.predict(train_features)

In [None]:
train[train.lightgbm1 > 0.8]

Unnamed: 0,lon,lat,mean,std,y,latbin,lonbin,latenc,lonenc,cats,catenc,outcome,under,over,lightgbm1,lightgbm2,under_predict,over_predict,prediction
0,0.85750,0.68625,36.018692,7.518279,36.203168,13,16,39.766453,37.665178,1316,48.322336,2,1,0,0.809435,0.002628,1,0,35.266864
7,0.74500,0.46125,11.827122,8.789547,12.247759,8,14,25.177741,29.739814,814,9.725168,2,1,0,0.811352,0.005193,1,0,10.948167
8,0.38875,0.90250,55.758003,5.874866,55.417773,17,7,39.097411,39.897288,177,48.306537,0,0,0,0.800997,0.002628,1,0,55.170517
10,0.85125,0.84875,16.906811,8.597893,17.516266,16,16,43.728037,37.665178,1616,13.690483,2,1,0,0.865600,0.004005,1,0,16.047021
11,0.27750,0.47000,18.734116,8.336804,18.925327,8,5,25.177741,43.755993,85,17.645353,2,1,0,0.800208,0.002812,1,0,17.900435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15180,0.28750,0.92625,57.234322,6.591998,57.964154,17,5,39.097411,43.755993,175,52.947271,2,1,0,0.805062,0.002628,1,0,56.575122
15181,0.92250,0.83000,52.607407,6.640575,52.803814,15,17,48.348403,29.201009,1517,53.126737,2,1,0,0.844730,0.002628,1,0,51.943349
15184,0.39500,0.52375,27.922565,8.720506,28.158460,9,7,31.817124,39.897288,97,22.172720,2,1,0,0.839364,0.002628,1,0,27.050515
15186,0.13500,0.64000,36.303581,8.142138,36.641309,12,2,39.224492,19.687086,122,33.579195,2,1,0,0.809804,0.002628,1,0,35.489367


In [None]:
train[train.lightgbm2 > 0.8].tail(10)

Unnamed: 0,lon,lat,mean,std,y,latbin,lonbin,latenc,lonenc,cats,catenc,outcome,under,over,lightgbm1,lightgbm2,under_predict,over_predict,prediction
631,0.175,0.14125,1.828562,8.494775,1.464847,2,3,16.564155,25.500984,23,3.140791,1,0,1,0.735625,0.802487,1,0,0.979084
1425,0.7325,0.77375,1.450851,8.274945,1.1777,14,13,44.063324,32.943546,1413,31.699444,1,0,1,0.223936,0.800924,0,0,1.450851
8157,0.12625,0.125,2.331501,8.77669,1.688547,2,2,16.564155,19.687086,22,2.906833,1,0,1,0.635654,0.80459,0,0,2.331501
8400,0.12875,0.1375,1.633375,8.766681,-0.077774,2,2,16.564155,19.687086,22,2.906833,1,0,1,0.681551,0.806354,0,0,1.633375
11420,0.96625,0.0675,2.150627,8.808228,1.739147,1,18,13.436385,21.532326,118,12.772315,1,0,1,0.20214,0.804295,0,0,2.150627
11450,0.04,0.36875,2.329113,8.401885,1.73577,7,0,25.371174,24.355655,70,4.002702,1,0,1,0.34984,0.816206,0,0,2.329113
11505,0.96,0.07,2.222677,8.818819,1.972151,1,18,13.436385,21.532326,118,12.772315,0,0,0,0.179662,0.803053,0,0,2.222677
12275,0.945,0.06,2.036966,8.823899,-0.088993,1,17,13.436385,29.201009,117,25.75983,1,0,1,0.147845,0.808267,0,0,2.036966
12890,0.03875,0.37625,1.404892,8.429545,0.492631,7,0,25.371174,24.355655,70,4.002702,1,0,1,0.355545,0.821703,0,0,1.404892
13654,0.57375,0.91875,25.975048,7.644673,20.051317,17,10,39.097411,23.656125,1710,33.732305,1,0,1,0.608723,0.812202,0,0,25.975048


In [None]:
# Flag rows whose classifier score exceeds 0.7, then move the GP mean by 10% of the
# predictive std: upwards for flagged under-prediction, downwards for flagged over-prediction.
train['under_predict'] = (train['lightgbm1'] > 0.7).astype(int)
train['over_predict'] = (train['lightgbm2'] > 0.7).astype(int)
train['prediction'] = (
    train['mean']
    + train['under_predict']*train['std']*0.1
    - train['over_predict']*train['std']*0.1
)

In [None]:
cost_function3(yy, train['prediction'].to_numpy().flatten())

5.241610487758148

In [None]:
train_features

Unnamed: 0,lat,lon,mean,std,catenc
0,0.68625,0.85750,36.018692,7.518279,48.322336
1,0.67500,0.41125,57.037819,5.435280,50.897888
2,0.90625,0.86250,12.339543,9.084759,9.830223
3,0.81000,0.81125,49.940948,6.032210,50.775624
4,0.11625,0.46875,10.425234,8.856779,15.470082
...,...,...,...,...,...
15184,0.52375,0.39500,27.922565,8.720506,22.172720
15185,0.48125,0.29000,35.377037,7.466801,46.749843
15186,0.64000,0.13500,36.303581,8.142138,33.579195
15187,0.78875,0.07125,16.232861,8.700656,21.887749


In [None]:
# Work on an explicit copy so the column assignments below never write into a slice.
X_test = X_test.copy()
# Score only the columns the booster was trained on. Feeding the whole frame made this
# cell fail on a second run, because X_test then also carries the columns added below.
X_test['lgb'] = clf.predict(X_test[clf.feature_name()])
X_test['under_predict'] = (X_test['lgb'] > 0.7).astype(int)
# NOTE(review): unlike the full-frame prediction cell, no over-prediction correction is
# applied here — confirm that is intended.
X_test['prediction'] = X_test['mean'] + X_test['under_predict']*X_test['std']*0.1

In [None]:
cost_function3(target.loc[y_test.index].to_numpy().flatten(), X_test['prediction'].to_numpy().flatten())

5.42791956269638

In [None]:
target.loc[y_test.index]

Unnamed: 0,pm25
7003,42.223090
9667,43.105505
7520,34.680551
13513,2.014432
5780,17.846950
...,...
14161,21.072224
12133,6.625559
10539,23.101554
3429,55.023050


In [None]:
clf.save_model('clfunder.txt')
clfover.save_model('clfover.txt')

<lightgbm.basic.Booster at 0x7f956847f510>

In [None]:
X_test['under_predict'].mean()

0.9177090190915076

In [None]:
pd.DataFrame(X_test[['lat', 'lon']].to_numpy(), columns = ['lat', 'lon'])

Unnamed: 0,lat,lon
0,0.94000,0.33750
1,0.71125,0.41625
2,0.95625,0.20125
3,0.15625,0.09000
4,0.61125,0.60625
...,...,...
9109,0.54125,0.68250
9110,0.27750,0.59125
9111,0.53125,0.46125
9112,0.48375,0.92000


In [None]:
lgb.__version__

'2.2.3'

In [None]:
params = {
        'objective': 'regression',
        # 'metric': ['auc', 'accuracy'],
        'boosting': 'gbdt',
        'max_depth': 8,
        'seed': 42,
        # 'scale_pos_weight': 50,
        # 'num_leaves': 100,
        # 'max_bin': 100,
        'learning_rate': 0.001,
        # 'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.8,
        'n_jobs': -1,
        # 'lambda_l2': 2,
        # 'min_data_in_leaf': 10
        }

In [None]:
train['dist'] = (train['lat'] - train['lon'])**2

In [None]:
train_features = train.loc[:, ['lat', 'lon', 'mean', 'std', 'catenc', 'lightgbm1', 'lightgbm2', 'dist']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train['y'], test_size=0.05, random_state=42)

In [None]:
# Fit the regressor on the training part of the split with the custom asymmetric objective
# (fobj) and metric (feval); both splits are monitored and early stopping uses a patience
# of 100 rounds.
lgb_train = lgb.Dataset(X_train, y_train)
# lgb_train = lgb.Dataset(train_features, train['y'])
lgb_valid = lgb.Dataset(X_test, y_test)
# NOTE(review): the fobj / verbose_eval keyword arguments match the older lgb.train
# signature (this notebook reports lightgbm 2.2.3); newer releases may reject them —
# TODO confirm before upgrading.
reg = lgb.train(
    params = params,
    train_set = lgb_train,
    num_boost_round = 10000,
    valid_sets = [lgb_train, lgb_valid],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
    verbose_eval = 25,
    fobj = custom_asymmetric_train, 
    feval = custom_asymmetric_valid
    )