In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

# Load in Data for Exploration

In [2]:
df = pd.read_csv("data/train_features.csv")
df.head()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner
0,2,3,2,1,76,1,0
1,1,3,2,0,70,4,3
2,0,3,1,0,70,4,2
3,0,2,2,1,65,1,0
4,2,3,2,0,80,3,1


In [3]:
df_target = pd.read_csv("data/train_target.csv")
df_target.head()

Unnamed: 0,verification_result
0,False
1,False
2,True
3,False
4,True


In [4]:
df_target['verification_result'] = df_target['verification_result'].astype(int)
df_target.head()

Unnamed: 0,verification_result
0,0
1,0
2,1
3,0
4,1


In [5]:
df["target"] = df_target["verification_result"]
df.head()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner,target
0,2,3,2,1,76,1,0,0
1,1,3,2,0,70,4,3,0
2,0,3,1,0,70,4,2,1
3,0,2,2,1,65,1,0,0
4,2,3,2,0,80,3,1,1


# Get basic information about the dataframe

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1430 entries, 0 to 1429
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   bidder1.capacity  1430 non-null   int64
 1   bidder2.capacity  1430 non-null   int64
 2   bidder3.capacity  1430 non-null   int64
 3   bidder4.capacity  1430 non-null   int64
 4   price             1430 non-null   int64
 5   product           1430 non-null   int64
 6   winner            1430 non-null   int64
 7   target            1430 non-null   int32
dtypes: int32(1), int64(7)
memory usage: 83.9 KB


Data types are fine, no NaNs

In [7]:
df.isna().any()

bidder1.capacity    False
bidder2.capacity    False
bidder3.capacity    False
bidder4.capacity    False
price               False
product             False
winner              False
target              False
dtype: bool

No NaNs !

In [8]:
df.describe()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner,target
count,1430.0,1430.0,1430.0,1430.0,1430.0,1430.0,1430.0,1430.0
mean,1.011888,2.1,1.878322,0.585315,71.424476,3.255944,0.459441,0.127972
std,0.816553,0.814795,0.327028,0.49284,8.031487,1.810537,1.069701,0.334176
min,0.0,0.0,1.0,0.0,59.0,1.0,0.0,0.0
25%,0.0,1.0,2.0,0.0,65.0,2.0,0.0,0.0
50%,1.0,2.0,2.0,1.0,70.0,3.0,0.0,0.0
75%,2.0,3.0,2.0,1.0,78.0,5.0,0.0,0.0
max,2.0,3.0,2.0,1.0,90.0,6.0,4.0,1.0


Nothing unusual


In [9]:
df.corr()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner,target
bidder1.capacity,1.0,0.066579,0.461402,-0.052081,0.280293,0.074622,-0.119222,-0.123548
bidder2.capacity,0.066579,1.0,-0.048848,-0.088353,0.051682,-0.104645,-0.020634,-0.031612
bidder3.capacity,0.461402,-0.048848,1.0,-0.065802,0.198454,-0.011187,-0.102136,-0.081533
bidder4.capacity,-0.052081,-0.088353,-0.065802,1.0,0.097716,0.134714,0.121388,0.003771
price,0.280293,0.051682,0.198454,0.097716,1.0,-0.029325,0.22922,0.211017
product,0.074622,-0.104645,-0.011187,0.134714,-0.029325,1.0,0.046916,0.025633
winner,-0.119222,-0.020634,-0.102136,0.121388,0.22922,0.046916,1.0,0.22106
target,-0.123548,-0.031612,-0.081533,0.003771,0.211017,0.025633,0.22106,1.0


Price and Winner might be interesting to look into

# Plots about the features

In [10]:
counts = df['winner'].value_counts()
fig = px.bar(x=counts.index, y=counts.values, labels={'x':'Winner', 'y':'Values'},
             title='Entries per Winner')

# Show plot 
fig.show()

Fairly imbalanced by Winners

In [11]:
counts = df['product'].value_counts()
fig = px.bar(x=counts.index, y=counts.values, labels={'x':'Products', 'y':'Values'},
             title='Entries per Product')

# Show plot 
fig.show()

In [12]:
fig = px.box(df, y="price")
fig.show()

In [13]:
counts = df.groupby(['winner', 'product']).size().reset_index(name='counts')
fig = px.bar(counts, x='winner', y='counts', color='product',
             title='Products per Winner', barmode='stack',
             labels={'winner':'Winner', 'counts':'Values', 'product':'Product'})
fig.show()

Nothing unusual

In [14]:
# Freedman-Diaconis Rule
# unfortunately it creates too many bins

IQR = 78 - 65  # Q3 - Q1

bin_width = 2 * IQR / (len(df["price"]) ** (1/3))

# Determine number of bins
num_bins = int((max(df["price"]) - min(df["price"])) / bin_width)

print(bin_width)
print(num_bins)

2.3077814003417765
13


In [15]:
# compute boundaries for 6 bins
bin_boundaries = np.linspace(min(df["price"]), max(df["price"]), 6)
bin_boundaries

array([59. , 65.2, 71.4, 77.6, 83.8, 90. ])

In [16]:
# apply bins
df["price_bin"] = np.digitize(df["price"], bin_boundaries)
df.head()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner,target,price_bin
0,2,3,2,1,76,1,0,0,3
1,1,3,2,0,70,4,3,0,2
2,0,3,1,0,70,4,2,1,2
3,0,2,2,1,65,1,0,0,1
4,2,3,2,0,80,3,1,1,4


In [17]:
counts = df.groupby(['winner', 'price_bin']).size().reset_index(name='counts')
fig = px.bar(counts, x='winner', y='counts', color='price_bin',
             title='Price Bins per Winner', barmode='stack')
fig.show()

Nothing unusual

In [18]:
counts = df['price_bin'].value_counts()
fig = px.bar(x=counts.index, y=counts.values, labels={'x':'Bins', 'y':'Values'},
             title='Distribution of Price Entries per Bin')

# Show plot 
fig.show()

## Target analysis

In [19]:
counts = df['target'].value_counts()
fig = px.bar(x=counts.index, y=counts.values, labels={'x':'Label', 'y':'Values'},
             title='Distribution of Target Variable')

# Show plot 
fig.show()

Highly imbalanced data !

Could be difficult to train a model !

In [20]:
counts = df.groupby(['winner', 'target']).size().reset_index(name='counts')
fig = px.bar(counts, x='winner', y='counts', color='target',
             title='Target per Winner', barmode='stack')
fig.show()

Winner=1 has a lot of target 1

But nothing unusual

In [21]:
counts = df.groupby(['price_bin', 'target']).size().reset_index(name='counts')
fig = px.bar(counts, x='price_bin', y='counts', color='target',
             title='Target per price bin', barmode='stack')
fig.show()

Nothing unusual

### Capacity per Person with Targets

In [22]:
figures = []
for ind, i in enumerate(df.columns[:4]):
    counts = df.groupby([i, 'target']).size().reset_index(name='counts')
    figures.append(px.bar(counts, x=[i], y='counts', color='target',
             title=f'Target per {i}', barmode='stack'))
    
fig = make_subplots(rows=len(figures), cols=1) 
for i, figure in enumerate(figures):
    for trace in range(len(figure["data"])):
        fig.append_trace(figure["data"][trace], row=i+1, col=1)

fig.show()

Nothing unusual

# Modeling

## Load in Data

In [2]:
df = pd.read_csv("data/train_features.csv")
df_target = pd.read_csv("data/train_target.csv")
df_target['verification_result'] = df_target['verification_result'].astype(int)

## Split Data

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df, df_target, test_size=0.33, random_state=0)

In [4]:
# a static train test split can be just lucky or unlucky
# thats why we can use cross validation
# but since we have highly imbalanced data we have to make sure we have enough data from the minority class in each test split
# so we need stratified k fold cross validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

## Train Baselines

As evaluation method i used the classification report (P,R,F1) combined with the mcc

The classification report of sklearn is a fantastic tool
It returns precision, recall and F1 score for every class
This way, it is easy to detect overfitting to one class

Additionally it returns the accuracy of the model, which can be ignored for this challenge, since accuracy is very misleading/useless for highly imbalanced data

But precision, recall and f1 score do not have information about true negatives, which the mcc (the metric we are evaluated on) accounts for.
So i added the mcc as another metric as well.
If i would use only the mcc i would miss the information about the class specific metrics which i find very important.

So over all i will choose the model with the best combination of F1 per class, the mcc and general knowledge about the model type with its up and downsides
For example I would always choose a random forest over a single decision tree

As a baseline i will use a single decision tree and a knn classifier

In [11]:
clf = DecisionTreeClassifier(max_depth=13, random_state=0)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
matthews_corrcoef(y_test, preds)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       401
           1       0.98      0.92      0.95        71

    accuracy                           0.99       472
   macro avg       0.99      0.96      0.97       472
weighted avg       0.99      0.99      0.98       472



0.9411021581997254

In [12]:
clf.feature_importances_

array([0.05674863, 0.01715012, 0.00698519, 0.0028085 , 0.35636581,
       0.21297983, 0.34696192])

In [6]:
# knn needs scaling
scaler_x = StandardScaler()
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)

In [7]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
preds = neigh.predict(X_test)
print(classification_report(y_test, preds))
print(matthews_corrcoef(y_test, preds))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       401
           1       0.69      0.34      0.45        71

    accuracy                           0.88       472
   macro avg       0.79      0.66      0.69       472
weighted avg       0.86      0.88      0.86       472

0.4237634773274484


  return self._fit(X, y)


Fairly good results for highly imbalanced data

## Train Models

In [8]:
clf = RandomForestClassifier(max_depth=13, random_state=0)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
matthews_corrcoef(y_test, preds)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       401
           1       0.92      0.69      0.79        71

    accuracy                           0.94       472
   macro avg       0.94      0.84      0.88       472
weighted avg       0.94      0.94      0.94       472



  return fit_method(estimator, *args, **kwargs)


0.7701426843409656

In [9]:
clf = GradientBoostingClassifier(max_depth=13, random_state=0, n_estimators=1000)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(classification_report(y_test, preds, ))
matthews_corrcoef(y_test, preds)

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       401
           1       0.97      0.89      0.93        71

    accuracy                           0.98       472
   macro avg       0.97      0.94      0.96       472
weighted avg       0.98      0.98      0.98       472



0.9153383953111449

In [29]:
clf = BalancedRandomForestClassifier(
    sampling_strategy="all", replacement=True, max_depth=13, random_state=0, class_weight="balanced", n_estimators=500)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(classification_report(y_test, preds, ))
matthews_corrcoef(y_test, preds)




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



              precision    recall  f1-score   support

           0       0.99      0.88      0.93       407
           1       0.55      0.94      0.70        65

    accuracy                           0.89       472
   macro avg       0.77      0.91      0.81       472
weighted avg       0.93      0.89      0.90       472



0.6667958403537972

### Try Oversampling

In [30]:
# oversample the minority class
# But only oversample the training data and leave testing as it is to ensure realistic testing results
sm = ADASYN(random_state=42, n_neighbors=3)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [31]:
clf = RandomForestClassifier(max_depth=13, random_state=42, n_estimators=1000)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(classification_report(y_test, preds, ))
print(matthews_corrcoef(y_test, preds))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



              precision    recall  f1-score   support

           0       0.96      0.97      0.96       407
           1       0.78      0.75      0.77        65

    accuracy                           0.94       472
   macro avg       0.87      0.86      0.86       472
weighted avg       0.94      0.94      0.94       472

0.7289897734956902


In [32]:
clf_gb = GradientBoostingClassifier(max_depth=13, random_state=0, n_estimators=1000)
clf_gb.fit(X_train, y_train)
preds = clf_gb.predict(X_test)
print(classification_report(y_test, preds))
matthews_corrcoef(y_test, preds)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



              precision    recall  f1-score   support

           0       0.98      0.99      0.98       407
           1       0.92      0.88      0.90        65

    accuracy                           0.97       472
   macro avg       0.95      0.93      0.94       472
weighted avg       0.97      0.97      0.97       472



0.8820647378908713

interesting to see boosting being that much better than a random forest

In [33]:
clf = DecisionTreeClassifier(max_depth=13, random_state=0)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
matthews_corrcoef(y_test, preds)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       407
           1       0.88      0.88      0.88        65

    accuracy                           0.97       472
   macro avg       0.93      0.93      0.93       472
weighted avg       0.97      0.97      0.97       472



0.8572670572670573

### Feed Forward Neural Network

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        # dropout to avoid overfitting
        self.dropout1 = nn.Dropout(p=0.2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.dropout1(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.dropout1(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.sigmoid(out)
        return out


# class WeightedBCELoss(nn.Module):
#     def __init__(self, pos_weight):
#         super(WeightedBCELoss, self).__init__()
#         self.pos_weight = pos_weight

#     def forward(self, input, target):
#         loss = nn.BCEWithLogitsLoss(pos_weight=self.pos_weight)(input, target)
#         return loss

# Load in Data
df = pd.read_csv("data/train_features.csv")
df_target = pd.read_csv("data/train_target.csv")
df_target['verification_result'] = df_target['verification_result'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(df, df_target, test_size=0.33, random_state=42)

# oversample
sm = ADASYN(random_state=42, n_neighbors=3)
X_train, y_train = sm.fit_resample(X_train, y_train)

#scale features
scaler_x = StandardScaler()
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)

# assign class weights
# not used here because oversampling got me better results
# total_samples = len(y_train)
# num_positives = y_train.sum()
# num_negatives = total_samples - num_positives
# class_weight = torch.tensor([num_positives / total_samples, num_negatives / total_samples])
# print(class_weight)
# criterion = WeightedBCELoss(class_weight)

# define network params
criterion = nn.BCELoss()
input_size = 7  
hidden_size = 64
output_size = 1  

# Instantiate the model
model = SimpleNN(input_size, hidden_size, output_size)

# Define loss function (binary cross entropy)
criterion = nn.BCELoss()

# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert data to torch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
X_val = torch.tensor(X_test, dtype=torch.float32)
y_val = torch.tensor(y_test.values, dtype=torch.float32)

# Training loop
num_epochs = 500
for epoch in range(num_epochs):
    # Training phase
    model.train()
    optimizer.zero_grad()
    train_outputs = model(X_train)
    train_loss = criterion(train_outputs, y_train.view(-1, 1))
    train_loss.backward()
    optimizer.step()

    # Validation phase
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_predictions = (val_outputs > 0.5).float()  # Can be adjusted if test data distribution is known to favor one class
        mcc = matthews_corrcoef(y_val.numpy(), val_predictions.numpy().flatten())
        c =classification_report(y_val.numpy(), val_predictions.numpy().flatten(), zero_division=0)
        val_loss = criterion(val_outputs, y_val.view(-1, 1))

    # Print progress
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss.item():.4f}, Validation Loss: {val_loss.item():.4f}, MCC: {mcc:.4f}')
        print(c)

Epoch [100/500], Training Loss: 0.3610, Validation Loss: 0.4455, MCC: 0.3814
              precision    recall  f1-score   support

         0.0       0.94      0.80      0.86       407
         1.0       0.35      0.69      0.47        65

    accuracy                           0.78       472
   macro avg       0.65      0.75      0.67       472
weighted avg       0.86      0.78      0.81       472

Epoch [200/500], Training Loss: 0.1916, Validation Loss: 0.4494, MCC: 0.3851
              precision    recall  f1-score   support

         0.0       0.93      0.87      0.90       407
         1.0       0.41      0.57      0.48        65

    accuracy                           0.83       472
   macro avg       0.67      0.72      0.69       472
weighted avg       0.86      0.83      0.84       472

Epoch [300/500], Training Loss: 0.1126, Validation Loss: 0.5410, MCC: 0.4586
              precision    recall  f1-score   support

         0.0       0.93      0.90      0.92       407
      

In [35]:
# Adapt the number of epochs according to how much time you have
# since this notebook sould be executed in less than 1 minute, i adjusted the value to 500
# 10k epochs resulted in ~0.66 mcc

# It would be also possible to use a transformer encoder model for the classification, but this would take longer to train/run.

# It would be also possible to use variations of boosting for probably slightly better results, such as xgboost, lightgbm or catboost.

# With more data and better domain knowledge it would be beneficial to do some feature engineering and create additional features.
# But with the current features this is (to my knowledge) not possible.

## Predict test features

The best model was the gradient boosting model with oversampled data

In [36]:
df_test = pd.read_csv("data/test_features.csv")
df_test.head()

Unnamed: 0,bidder1.capacity,bidder2.capacity,bidder3.capacity,bidder4.capacity,price,product,winner
0,0,3,2,1,70,1,0
1,2,1,2,1,71,2,0
2,2,1,2,1,67,6,0
3,0,1,2,1,70,4,3
4,2,1,2,0,66,2,0


In [37]:
df_test.isna().any()

bidder1.capacity    False
bidder2.capacity    False
bidder3.capacity    False
bidder4.capacity    False
price               False
product             False
winner              False
dtype: bool

In [38]:
y_test_pred = pd.Series(data=clf_gb.predict(X=df_test.values), name='verification_result')
y_test_pred.head()


X does not have valid feature names, but GradientBoostingClassifier was fitted with feature names



0    1
1    0
2    0
3    0
4    0
Name: verification_result, dtype: int32

In [39]:
y_test_pred = y_test_pred.astype("boolean")
y_test_pred.head()

0     True
1    False
2    False
3    False
4    False
Name: verification_result, dtype: boolean

In [40]:
y_test_pred.to_csv('Yacin_Boualili_prediction.csv', index=False)