# Effectiveness of Self-Training using Pseudo Labeling

Program is modified from https://datawhatnow.com/pseudo-labeling-semi-supervised-learning/

Example on Kaggle Mercedes_Benz Greener Manufacturing Competition

## Load Data

In [1]:
import pandas as pd
# Load the data
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
print(train.shape, test.shape)

(4209, 378) (4209, 377)


This is a hard problem with relatively small number of samples but a lot of features

In [2]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


## Some feature transformation

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

features = train.columns[2:]
for column_name in features:
    label_encoder = LabelEncoder() 
    
    # Get the column values
    train_column_values = list(train[column_name].values)
    test_column_values = list(test[column_name].values)
    
    # Fit the label encoder
    label_encoder.fit(train_column_values + test_column_values)
    
    # Transform the feature
    train[column_name] = label_encoder.transform(train_column_values)
    test[column_name] = label_encoder.transform(test_column_values)

### Self-training

In [4]:
def create_augmented_train(X, y, model, test, features, target, sample_rate):
    '''
    Create and return the augmented_train set that consists
    of pseudo-labeled and labeled data.
    '''
    num_of_samples = int(len(test) * sample_rate)
    # Train the model and creat the pseudo-labeles
    model.fit(X, y)
    pseudo_labeles = model.predict(test[features])
    # Add the pseudo-labeles to the test set
    augmented_test = test.copy(deep=True)
    augmented_test[target] = pseudo_labeles
    # Take a subset of the test set with pseudo-labeles and append in onto
    # the training set
    sampled_test = augmented_test.sample(n=num_of_samples)
    temp_train = pd.concat([X, y], axis=1)
    augemented_train = pd.concat([sampled_test, temp_train])
    
    # Shuffle the augmented dataset and return it
    return shuffle(augemented_train)

In [5]:
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, RegressorMixin
class PseudoLabeler(BaseEstimator, RegressorMixin):
    
    def __init__(self, model, test, features, target, sample_rate=0.2, seed=42):
        self.sample_rate = sample_rate
        self.seed = seed
        self.model = model
        self.model.seed = seed
        
        self.test = test
        self.features = features
        self.target = target
        
    def get_params(self, deep=True):
        return {
            "sample_rate": self.sample_rate,
            "seed": self.seed,
            "model": self.model,
            "test": self.test,
            "features": self.features,
            "target": self.target
        }
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
        
    def fit(self, X, y):
        if self.sample_rate > 0.0:
            augemented_train = self.__create_augmented_train(X, y)
            self.model.fit(
                augemented_train[self.features],
                augemented_train[self.target]
            )
        else:
            self.model.fit(X, y)
        
        return self
    def __create_augmented_train(self, X, y):
        num_of_samples = int(len(test) * self.sample_rate)
        
        # Train the model and creat the pseudo-labels
        self.model.fit(X, y)
        pseudo_labels = self.model.predict(self.test[self.features])
        
        # Add the pseudo-labels to the test set
        augmented_test = test.copy(deep=True)
        augmented_test[self.target] = pseudo_labels
        
        # Take a subset of the test set with pseudo-labels and append in onto
        # the training set
        sampled_test = augmented_test.sample(n=num_of_samples)
        temp_train = pd.concat([X, y], axis=1)
        augemented_train = pd.concat([sampled_test, temp_train])
        return shuffle(augemented_train)
        
    def predict(self, X):
        return self.model.predict(X)
    
    def get_model_name(self):
        return self.model.__class__.__name__

### Training with XGBoost

In [6]:
import xgboost as xgb
import numpy as np
target = 'y'
# Preprocess the data
X_train, X_test = train[features], test[features]
y_train = train[target]

X_train, X_test2, y_train, y_test2 = train_test_split(X_train, y_train, test_size=0.33, random_state=42)
# Create the PseudoLabeler with XGBRegressor as the base regressor
model = PseudoLabeler(
    xgb.XGBRegressor(nthread=1),
    test,
    features,
    target,
    0.0
)


# Train the model and use it to predict
model.fit(X_train, y_train)
y_pred = model.predict(X_test2)
error = np.sqrt(np.sum((y_pred - y_test2)*(y_pred - y_test2)))/y_pred.shape[0]
print("XGB Error:%0.4f"%error)

XGB Error:0.2796


### Training with Pseudo Labeling

In [7]:
model = PseudoLabeler(
    xgb.XGBRegressor(nthread=1),
    test,
    features,
    target,
    0.2
)


# Train the model and use it to predict
model.fit(X_train, y_train)
y_pred = model.predict(X_test2)
error = np.sqrt(np.sum((y_pred - y_test2)*(y_pred - y_test2)))/y_pred.shape[0]
print("Pseudo Labeling Error:%0.4f"%error)

Pseudo Labeling Error:0.2781


### Using the whole unlabeled set for Pseudo Labeling

In [8]:
model = PseudoLabeler(
    xgb.XGBRegressor(nthread=1),
    test,
    features,
    target,
    1.0
)


# Train the model and use it to predict
model.fit(X_train, y_train)
y_pred = model.predict(X_test2)
error = np.sqrt(np.sum((y_pred - y_test2)*(y_pred - y_test2)))/y_pred.shape[0]
print("Pseudo Labeling Error:%0.4f"%error)

Pseudo Labeling Error:0.2737
