In [1]:
## imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, precision_score
# We will use the lightgbm python package. its advantages is that you can insert to the model
# lines with missing values. Another advantage is that categorical features can be handled with
# unique integers/floats and is much more faster than one-hot coding

In [2]:
## Load and observe the data
filepath = r"C:\Projects\Deep Learning Academy\PerimeterX\Nominal2_obf.csv"
data = pd.read_csv(filepath)
print(data.head(5))
print(data.shape)


     ID  product vertical  HOUR Device Location  was clicked gender age
0  2721      695      'J'     0   'D8'        ?        False      m   ?
1  2722      695      'A'     0   'D8'        ?        False      m   ?
2  2723      695      'B'     0   'D8'        ?        False      m   ?
3  2724      695      'C'     0   'D8'        ?        False      m   ?
4  2725      121      'D'     2      ?        ?        False      f  21
(100000, 9)


In [3]:
## lets remove columns that are non-informative
data[data == "?"] = np.nan
print("Percentage of missing values in each column:")
print(pd.isnull(data).sum()/data.shape[0] * 100)


Percentage of missing values in each column:
ID               0.000
product          0.000
vertical         0.001
HOUR             0.000
Device          39.351
Location       100.000
was clicked      0.000
gender           0.000
age              9.751
dtype: float64


In [4]:
# We can see that all values in the location column are missing - location is not informative
# There are also some missing values in columns "vertical", "Device" and "age", but the model will handle it
#  We will remove the columns "ID" and "Location" which are non-informative
data.drop(["ID", "Location"], axis=1, inplace=True)


In [5]:
## lets check if the data is balanced
percentage_clicked = (data["was clicked"].sum())/data.shape[0]
print("The percentage of lines clicked is: " + str(round(percentage_clicked*100,1)) + "%")


The percentage of lines clicked is: 6.5%


In [6]:
# The data is unbalanced
## Convert categorical columns to floats
categorical_cols = ["product", "vertical", "Device", "gender"]
for col in categorical_cols:
    data[col], levels  = pd.factorize(data[col], na_sentinel=-100)
data[data == -100] = np.nan


In [7]:
## Convert the age column (string) to float
data["age"] = data["age"].astype(float)


In [8]:
## Divide to features and target
y = data["was clicked"]
x = data.drop(["was clicked"], axis=1)


In [9]:
## Divide to train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=42) #random_state for reproducing the results between runs
x_train = x_train.values
x_test = x_test.values


In [10]:
# create train data set for the lgb training model
lgb_train = lgb.Dataset(x_train, y_train)


In [11]:
# Specifying the parameters of the model and taking into consideration the unbalanced data
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {"binary_error"},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': 0,
    'scale_pos_weight':0.85 # handles with the unbalanced data
}


In [12]:
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000)


In [13]:
## The following is a function that predicts and evaluates the predictions of the model
def evaluate_prediction(x, ytrue, gbm, threshold=0.5):
    # The function evaluates the predictions of a given model
    # Inputs:
    # x - MxN numpy array of the features
    # ytrue - Mx1 numpy array of the targets
    # gbm - predictive model
    # threshold - The threshold for a prediction probability to be considered as class "1" (clicked)
    #
    # Outputs:
    # x - features
    # ypred - Mx1 numpy array of the predicted probabilities to be clicked 
    # accuracy - 1x1 of the accuracy of the model
    # conf_matrix - 2x2 of the confusion matrix of the model
    # conf_matrix[0,0] - true negatives
    # conf_matrix[1,0] - false negatives
    # conf_matrix[0,1] - false positives
    # conf_matrix[1,1] - true positives
    # precision - 1x1 of the precision of the model
    ypred = gbm.predict(x)

    # binarize the predictions based on a threshold (default of 50%). If the probability to have a click is greater
    #  than the threshold, it will be considered as a prediction of a clicked ad
    ypred_binary = ypred > threshold
    accuracy = (sum(ypred_binary == ytrue)) / ytrue.shape[0]
    conf_matrix = confusion_matrix(ytrue, ypred_binary)
    precision = precision_score(ytrue, ypred_binary, average="binary")

    return ypred, accuracy, conf_matrix, precision

In [14]:
## prediction of the test set
ypred_test, accuracy_test, conf_matrix_test, precision_test = evaluate_prediction(x_test, y_test, gbm)


In [15]:
# We will print some results of the prediction of the test set
# "ypred_test" holds the probability to have a click. We will print The first 10 values
print(ypred_test[:10])
print("The prediction accuracy on the test set is: " + str(round(accuracy_test*100,1)) + str("%"))
print("The confusion matrix on the test set is: ")
print(conf_matrix_test)
# We could adjust the parameter "scale_pos_weight" to get more true positives - it depends on what is
# important for us in the prediction
print("The precision on the test set is: " + str(round(precision_test*100,1)) + str("%"))


[9.19841286e-01 3.55404998e-05 3.68506633e-03 1.57380199e-01
 1.27153877e-04 2.08039020e-02 1.15406829e-03 2.87942422e-03
 4.08753388e-05 1.17074086e-02]
The prediction accuracy on the test set is: 96.7%
The confusion matrix on the test set is: 
[[27795   266]
 [  718  1221]]
The precision on the test set is: 82.1%


In [16]:
## Prediction of all lines of the dataset as requested
ypred_all, accuracy_all, conf_matrix_all, precision_all = evaluate_prediction(x, y, gbm)


In [17]:
## Printing some results of the prediction of all lines in the dataset
print(ypred_all[:10])
print("The prediction accuracy on the whole dataset is: " + str(round(accuracy_all*100,1)) + str("%"))
print("The confusion matrix on the whole data set is: ")
print(conf_matrix_all)
print("The precision on the whole dataset is: " + str(round(precision_all*100,1)) + str("%"))


[1.28521031e-10 3.08080928e-09 7.02731075e-10 2.46250652e-08
 3.06340355e-04 4.23061900e-04 6.15251178e-04 4.33842378e-04
 4.53744867e-04 5.09575348e-03]
The prediction accuracy on the whole dataset is: 97.2%
The confusion matrix on the whole data set is: 
[[92767   710]
 [ 2128  4395]]
The precision on the whole dataset is: 86.1%


In [None]:
## We can also perform a grid search to optimize the parameters of the classifier