In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the cells visible in this notebook load Iris from scikit-learn
# and do not read anything from Drive -- confirm whether this mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Semi Supervised Algorithms

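Semi-supervised learning trains a model from a small amount of labeled data together with a larger pool of unlabeled data. Self-training, for example, is a semi-supervised technique in which a model is iteratively trained on the labeled data and then used to generate pseudo-labels for the unlabeled data. These pseudo-labels are then added to the training set for further refinement.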

Semi-supervised algorithms are commonly classified into five types:
1. Self-training
2. Co-training
3. Graph-Based Methods
4. Pseudo-Labeling
5. Semi-Supervised Support Vector Machines (S3VM)

Of these, we will implement the first four algorithms.

Resources :

# Self Training

## Importing libraries and loading the data

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Fetch the Iris samples: feature matrix in X, integer class ids in y.
data = load_iris()
X, y = data.data, data.target

# Simulate a partially labelled dataset: draw one uniform number per sample
# from a seeded generator and hide the label wherever the draw is below 0.5.
rng = np.random.RandomState(42)
mask_unlabeled = rng.rand(len(y)) < 0.5
# -1 marks a sample as unlabeled. This writes into `y` in place, so the
# original class ids at the masked positions are overwritten.
np.putmask(y, mask_unlabeled, -1)


## Splitting the data

In [None]:
# `y` has had about half of its entries overwritten with -1 (unlabeled), so a
# test split taken from `y` alone would score predictions against -1
# placeholders instead of real classes. Reload the untouched ground truth and
# split it together with the masked labels (same shuffle, same rows):
#   * y_train keeps the -1 markers that self-training needs,
#   * y_test holds the true classes used for evaluation.
y_true = load_iris().target
X_train, X_test, y_train, y_test_masked, y_train_true, y_test = train_test_split(
    X, y, y_true, test_size=0.2, random_state=42
)


## Training the model

In [None]:
# Random forest that serves as the supervised learner wrapped by self-training.
base_classifier = RandomForestClassifier(random_state=42, n_estimators=50)

# Wrap the forest and fit it on the training split; entries of y_train equal
# to -1 are the samples without a label.
model = SelfTrainingClassifier(base_classifier)
model.fit(X=X_train, y=y_train)


## Result

In [None]:
# Predict the held-out samples and report the fraction that match y_test.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Self-Training Accuracy: {accuracy:.2f}")


Self-Training Accuracy: 0.37


# Co-training

## Importing libraries and loading the data

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Reload Iris so this section starts from a fresh feature matrix and label vector.
data = load_iris()
X, y = data.data, data.target





## Splitting and training the model

In [None]:
# Create two feature views of the same samples.
X1 = X[:, :2]  # View 1: first two features
X2 = X[:, 2:]  # View 2: last two features

# Hide roughly half of the labels. The mask is applied to a copy so that `y`
# keeps the ground truth, which is needed to score the models on the test split.
rng = np.random.RandomState(42)
mask_unlabeled = rng.rand(y.shape[0]) < 0.5
y_masked = np.copy(y)
y_masked[mask_unlabeled] = -1  # -1 indicates unlabeled data

# Split both views, the masked labels and the true labels with one shuffle so
# that all arrays stay row-aligned. y_train carries the -1 markers; y_test
# holds the true classes.
(X1_train, X1_test, X2_train, X2_test,
 y_train, y_test_masked, y_train_true, y_test) = train_test_split(
    X1, X2, y_masked, y, test_size=0.2, random_state=42
)

model1 = RandomForestClassifier(n_estimators=50, random_state=42)  # learns from view 1
model2 = RandomForestClassifier(n_estimators=50, random_state=42)  # learns from view 2

num_iterations = 10         # Maximum number of co-training rounds
confidence_threshold = 0.9  # Minimum class probability required to accept a pseudo-label


def confident_pseudo_labels(model, X_view, threshold):
    """Predict a label for every row of ``X_view`` and flag the confident ones.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    X_view : array of shape (n_samples, n_features) in the view ``model`` was fitted on.
    threshold : float, minimum top-class probability for a prediction to count as confident.

    Returns
    -------
    labels : array of shape (n_samples,) with the most probable class per row.
    is_confident : boolean array of shape (n_samples,), True where the top
        probability is at least ``threshold``.
    """
    proba = model.predict_proba(X_view)
    labels = model.classes_[np.argmax(proba, axis=1)]
    return labels, np.max(proba, axis=1) >= threshold


# Each model has its own label vector. It starts from the real labels and is
# extended only with pseudo-labels proposed by the *other* model.
y_train_model1 = np.copy(y_train)
y_train_model2 = np.copy(y_train)

for _ in range(num_iterations):
    # Fit each model on its own view, using only rows that currently have a
    # label (-1 must never be presented to the classifier as a class).
    labeled1 = y_train_model1 != -1
    labeled2 = y_train_model2 != -1
    model1.fit(X1_train[labeled1], y_train_model1[labeled1])
    model2.fit(X2_train[labeled2], y_train_model2[labeled2])

    # Each model predicts on the view it was trained on.
    labels_from_model1, confident1 = confident_pseudo_labels(
        model1, X1_train, confidence_threshold
    )
    labels_from_model2, confident2 = confident_pseudo_labels(
        model2, X2_train, confidence_threshold
    )

    # A row is handed to the other model only if that model has no label for
    # it yet and the proposing model is confident.
    new_for_model2 = ~labeled2 & confident1
    new_for_model1 = ~labeled1 & confident2
    if not (new_for_model1.any() or new_for_model2.any()):
        break  # Nothing new to learn from; both models are already up to date.

    y_train_model2[new_for_model2] = labels_from_model1[new_for_model2]
    y_train_model1[new_for_model1] = labels_from_model2[new_for_model1]
else:
    # All rounds were used: refit once more so that the pseudo-labels accepted
    # in the final round are reflected in the returned models.
    labeled1 = y_train_model1 != -1
    labeled2 = y_train_model2 != -1
    model1.fit(X1_train[labeled1], y_train_model1[labeled1])
    model2.fit(X2_train[labeled2], y_train_model2[labeled2])



## Evaluating the model

In [None]:

# Score model1 on the test rows of view 1.
y_pred1 = model1.predict(X1_test)
accuracy1 = accuracy_score(y_true=y_test, y_pred=y_pred1)
print(f"Model1 Accuracy: {accuracy1:.2f}")

# Score model2 on the test rows of view 2.
y_pred2 = model2.predict(X2_test)
accuracy2 = accuracy_score(y_true=y_test, y_pred=y_pred2)
print(f"Model2 Accuracy: {accuracy2:.2f}")


Model1 Accuracy: 0.50
Model2 Accuracy: 0.43


# Pseudo-Labeling

## Importing libraries and loading the data

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Reload Iris so this section starts from a fresh feature matrix and label vector.
data = load_iris()
X, y = data.data, data.target


## Creating labelled and unlabelled datasets

In [None]:
# Draw one uniform number per sample from a seeded generator; samples whose
# draw is below 0.5 are treated as unlabeled.
rng = np.random.RandomState(42)
mask_unlabeled = rng.rand(len(y)) < 0.5

# Build a separate label vector in which the selected samples carry -1
# (unlabeled); `y` itself is left unchanged.
y_unlabeled = np.where(mask_unlabeled, -1, y)


## Splitting and training

In [None]:
# Split the dataset into training and test sets.
# The masked labels (y_unlabeled) and the true labels (y) are split together so
# that both use the same rows:
#   * y_train keeps the -1 markers needed to separate labeled from unlabeled rows,
#   * y_test holds the true classes. Taking y_test from y_unlabeled would score
#     predictions against -1 placeholders and understate accuracy.
X_train, X_test, y_train, y_test_masked, y_train_true, y_test = train_test_split(
    X, y_unlabeled, y, test_size=0.3, random_state=42
)




## Generating pseudo-labels

In [None]:
# Classifier used both for the initial fit and for the later retraining.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Partition the training rows once into those with a real label and those marked -1.
has_label = y_train != -1
X_labeled, y_labeled = X_train[has_label], y_train[has_label]
X_unlabeled = X_train[~has_label]

# Fit on the labeled rows only, then let the model label the remaining rows.
model.fit(X_labeled, y_labeled)
pseudo_labels = model.predict(X_unlabeled)

# Stack labeled rows first and pseudo-labeled rows second, in matching order
# for features and targets.
X_train_combined = np.vstack([X_labeled, X_unlabeled])
y_train_combined = np.concatenate([y_labeled, pseudo_labels])

## Retraining the model

In [None]:
# Fit the same classifier again, this time on the labeled rows plus the rows
# that received pseudo-labels.
model.fit(X=X_train_combined, y=y_train_combined)


## Evaluating the model

In [None]:
# Predict the held-out samples with the retrained model.
y_pred = model.predict(X_test)

# Report the fraction of predictions that match y_test.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.38
