In [1]:
# Registry of benchmark datasets, keyed by dataset name. Entries added further
# down are dicts holding the feature matrix under "X" and the labels under "y".
datasets = dict()

from sklearn.datasets import fetch_mldata

# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed in
# later releases (fetch_openml is the documented replacement). Migrating needs
# the OpenML column layout to be confirmed first, so the call is kept as-is.
data = fetch_mldata("abalone")

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# Two-step preprocessing: column 0 (the "Sex" feature) is one-hot encoded into
# a dense array, then every resulting column is rescaled to the 0..1 range.
preprocessing_pipe = make_pipeline(
    OneHotEncoder(sparse=False, categorical_features=[0]),
    MinMaxScaler(feature_range=(0, 1)),
)

#datasets["abalone"] = {
#    "X": preprocessing_pipe.fit_transform(data.data),
#    "y": data.target
#}


In [3]:
# Register the dermatology data: the last column of `data.data` is taken as the
# class label and every preceding column as a feature.
data = fetch_mldata("uci-20070111 dermatology")
datasets["dermatology"] = dict(
    X=data.data[:, :-1],
    y=data.data[:, -1],
)

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
class base_classifiers:
    """Namespace of pre-configured base estimators.

    Every attribute is a single estimator instance created when the class body
    is executed.
    NOTE(review): each attribute is one shared object; whether the wrappers
    that receive it clone it before fitting is not visible here.
    """

    # 3-nearest-neighbours using the Euclidean metric
    # (n_jobs is left at the library default; parallelism is not enabled).
    KNN = KNeighborsClassifier(metric="euclidean", n_neighbors=3)

    # Gaussian naive Bayes with no explicit class priors.
    NB = GaussianNB(priors=None)

    # Support vector classifier with a degree-1 polynomial kernel.
    # Open question kept from the original author: epsilon parameter missing?
    SVM = SVC(kernel='poly', degree=1, C=1.0, tol=0.001)

    # Decision tree split on entropy, with at least two samples per leaf.
    # All remaining tree options are left at the library defaults.
    CART = DecisionTreeClassifier(min_samples_leaf=2, criterion='entropy')

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np

def _training_scoring_iteration(clf, X, y, training_index, test_index, labeling_rate):
    """Train and score ``clf`` once, for a single cross-validation fold.

    The fold's training portion is split into a labeled part (a
    ``labeling_rate`` fraction) and an unlabeled part. The classifier is fitted
    on both parts, with the labels of the unlabeled part replaced by the string
    ``"unlabeled"``. It is then scored on the unlabeled samples against their
    true labels (transductive score) and on the held-out test samples.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``score(X, y)``
    X, y : arrays indexable by integer index arrays (features and labels)
    training_index, test_index : index arrays selecting the fold's two parts
    labeling_rate : passed to ``train_test_split`` as ``test_size``; the size of
        the part that keeps its labels

    Returns
    -------
    tuple
        ``(transductive_score, testing_score)``
    """
    # The test fold is set aside and only used for the final score.
    X_test, y_test = X[test_index], y[test_index]

    # Split the training fold into unlabeled / labeled parts. The "test" side
    # of train_test_split (sized by labeling_rate) is the part that stays labeled.
    X_unlabeled, X_labeled, y_unlabeled, y_labeled = train_test_split(
        X[training_index],
        y[training_index],
        test_size=labeling_rate,
        random_state=42,
    )

    # All labels are compared as strings so they can coexist with the sentinel.
    y_unlabeled_true = y_unlabeled.astype(str)

    X_train = np.concatenate((X_labeled, X_unlabeled))
    # Build the sentinel array with its own dtype instead of np.full_like on
    # the label array: full_like inherits the labels' string width, which
    # silently truncates "unlabeled" when the labels are shorter than 9
    # characters. np.concatenate widens to the larger of the two dtypes.
    y_train = np.concatenate((
        y_labeled.astype(str),
        np.full(y_unlabeled.shape, "unlabeled"),
    ))

    clf.fit(X_train, y_train)

    transductive_score = clf.score(X_unlabeled, y_unlabeled_true)
    testing_score = clf.score(X_test, y_test.astype(str))

    return transductive_score, testing_score
    
def train_and_score(clf, X, y, cv, labeling_rate):
    """Score ``clf`` on every fold produced by ``cv`` and summarise the results.

    One "#" is printed per completed fold, followed by a newline at the end.

    Returns a dict with the mean and standard deviation of the transductive
    and testing scores under the keys ``trans_mean``, ``test_mean``,
    ``trans_std`` and ``test_std``.
    """
    fold_scores = []
    for training_index, test_index in cv.split(X, y):
        fold_scores.append(
            _training_scoring_iteration(clf, X, y, training_index, test_index, labeling_rate)
        )
        # Lightweight progress marker, one per fold.
        print("#", end="")
    print()

    # Separate the (transductive, testing) pairs into two score lists.
    transductive_scores = [pair[0] for pair in fold_scores]
    testing_scores = [pair[1] for pair in fold_scores]

    summary = {}
    summary["trans_mean"] = np.mean(transductive_scores)
    summary["test_mean"] = np.mean(testing_scores)
    summary["trans_std"] = np.std(transductive_scores)
    summary["test_std"] = np.std(testing_scores)
    return summary

In [6]:
from StandardSelfTraining import StandardSelfTraining
from tri_training import TriTraining
from sklearn.model_selection import KFold
import pandas as pd

# All classifiers used for testing
classifiers = [
    #TriTraining("TriTraining (KNN)", base_classifiers.KNN),
    #TriTraining("TriTraining (NB)", base_classifiers.NB),
    #TriTraining("TriTraining (SVM)", base_classifiers.SVM),
    #TriTraining("TriTraining (CART)", base_classifiers.CART),
    StandardSelfTraining("Self-Training (KNN)", base_classifiers.KNN),
    StandardSelfTraining("Self-Training (NB)", base_classifiers.NB),
    #StandardSelfTraining("Self-Training (SVM)", base_classifiers.SVM),
    StandardSelfTraining("Self-Training (CART)", base_classifiers.CART)
]
labeling_rates = [0.10, 0.20, 0.30, 0.40]

# Columns in datasets that are categorical and need to be replaced with a
# one-hot encoding.
# NOTE(review): this list is not referenced by any cell visible here.
categorical_columns = [[], [0]]

# 10-fold cross-validation without shuffling. No random_state is passed:
# without shuffle=True a seed has no effect, and recent scikit-learn versions
# raise a ValueError for that combination. Pass shuffle=True together with a
# seed if shuffled folds are wanted.
cv = KFold(n_splits=10)

# One record per (classifier, dataset, labeling rate); the DataFrame is built
# once at the end instead of being grown row by row.
records = []

for classifier in classifiers:
    print(classifier.name)
    for dataset_name, dataset in datasets.items():
        print("dataset:", dataset_name, "\t")
        for labeling_rate in labeling_rates:
            print("rate:", labeling_rate, end=" ")

            test_info = { "classifier": classifier.name, "dataset":dataset_name, "labeling_rate":labeling_rate}
            scores = train_and_score(classifier, dataset["X"], dataset["y"], cv, labeling_rate)

            records.append({**test_info, **scores})
    print()
    print("--------")

results = pd.DataFrame(records)

Self-Training (KNN)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########

--------
Self-Training (NB)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########

--------
Self-Training (CART)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########

--------


In [7]:
# Display the table of scores collected by the experiment loop
# (one row per classifier / dataset / labeling rate).
results

Unnamed: 0,classifier,dataset,labeling_rate,test_mean,test_std,trans_mean,trans_std
0,Self-Training (KNN),dermatology,0.1,0.49467,0.053112,0.453488,0.036719
1,Self-Training (KNN),dermatology,0.2,0.678003,0.08659,0.673888,0.027315
2,Self-Training (KNN),dermatology,0.3,0.724324,0.072453,0.732635,0.025003
3,Self-Training (KNN),dermatology,0.4,0.765165,0.073864,0.796865,0.032399
4,Self-Training (NB),dermatology,0.1,0.28506,0.081652,0.260744,0.054078
5,Self-Training (NB),dermatology,0.2,0.196396,0.069305,0.194363,0.012955
6,Self-Training (NB),dermatology,0.3,0.196396,0.069305,0.215726,0.0409
7,Self-Training (NB),dermatology,0.4,0.196396,0.069305,0.22393,0.035261
8,Self-Training (CART),dermatology,0.1,0.755781,0.100688,0.788888,0.074608
9,Self-Training (CART),dermatology,0.2,0.88536,0.083147,0.911561,0.02899


In [8]:
# Wide view of the scores: one row per (dataset, classifier) pair and, for each
# score column, one sub-column per labeling rate.
results.pivot_table(
    index=['dataset', 'classifier'],
    columns=['labeling_rate'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,test_mean,test_mean,test_mean,test_mean,test_std,test_std,test_std,test_std,trans_mean,trans_mean,trans_mean,trans_mean,trans_std,trans_std,trans_std,trans_std
Unnamed: 0_level_1,labeling_rate,0.1,0.2,0.3,0.4,0.1,0.2,0.3,0.4,0.1,0.2,0.3,0.4,0.1,0.2,0.3,0.4
dataset,classifier,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
dermatology,Self-Training (CART),0.755781,0.88536,0.888063,0.899099,0.100688,0.083147,0.083173,0.064845,0.788888,0.911561,0.905411,0.933156,0.074608,0.02899,0.026721,0.023002
dermatology,Self-Training (KNN),0.49467,0.678003,0.724324,0.765165,0.053112,0.08659,0.072453,0.073864,0.453488,0.673888,0.732635,0.796865,0.036719,0.027315,0.025003,0.032399
dermatology,Self-Training (NB),0.28506,0.196396,0.196396,0.196396,0.081652,0.069305,0.069305,0.069305,0.260744,0.194363,0.215726,0.22393,0.054078,0.012955,0.0409,0.035261
