In [1]:
datasets = {}

from sklearn.datasets import fetch_mldata
data = fetch_mldata("abalone")

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
preprocessing_pipe = make_pipeline(
    #OneHotEncoder on "Sex" feature
    OneHotEncoder(categorical_features=[0], sparse=False),
    #Scale all from 0 to 1
    MinMaxScaler())

datasets["abalone"] = {
    "X": preprocessing_pipe.fit_transform(data.data),
    "y": data.target
}


In [3]:
data = fetch_mldata("uci-20070111 dermatology")
datasets["dermatology"] = {
    "X": data.data[:,0:-1],
    "y": data.data[:,-1]
}

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
class base_classifiers:
    KNN = KNeighborsClassifier(
        n_neighbors=3,
        metric="euclidean",
        #n_jobs=2  # Parallelize work on CPUs
    )
    NB = GaussianNB(
        priors=None
    )
    SVM = SVC(
        C=1.0,
        kernel='poly',
        degree=1,
        tol=0.001,
        # Epsilon parameter missing?
    )
    CART = DecisionTreeClassifier(
        criterion='entropy',
        # splitter='best',
        # max_depth=None,
        # min_samples_split=2,
        min_samples_leaf=2,
        # min_weight_fraction_leaf=0.0,
        # max_features=None,
        # random_state=None,
        # max_leaf_nodes=None,
        # min_impurity_split=1e-07,
        # class_weight=None,
        # presort=False,
    )

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np

def _training_scoring_iteration(clf, X, y, training_index, test_index, labeling_rate):
    """ One iteration of training and scoring on given data"""
    #Testing set is set aside.. - 1/10th of the data
    X_test, y_test = X[test_index], y[test_index]

    #For generating a testing and transductive set
    split_data = train_test_split(
        X[training_index],
        y[training_index],
        test_size=labeling_rate,
        random_state=42
    )
    (X_unlabeled, X_labeled, y_unlabeled, y_labeled) = split_data

    #Training set - 9/10 of data
    X_train = np.concatenate((X_labeled, X_unlabeled))
    y_train = np.concatenate((
        y_labeled.astype(str),
        np.full_like(y_unlabeled.astype(str), "unlabeled")
    ))
    
    #Train the classifier
    clf.fit(X_train, y_train)
    
    #Score the classifier
    transductive_score = clf.score(X_unlabeled, y_unlabeled.astype(str))
    testing_score = clf.score(X_test, y_test.astype(str))
    
    return transductive_score, testing_score
    
def train_and_score(clf, X, y, cv, labeling_rate):
    """ Collects scores using CV for splitting the data in different folds"""
    transductive_scores = []
    testing_scores = []
    for training_index, test_index in cv.split(X,y):
        transductive_score, testing_score = _training_scoring_iteration(clf, X, y, training_index, test_index, labeling_rate)
        
        transductive_scores.append(transductive_score)
        testing_scores.append(testing_score)
        print("#", end="")
    print()
    return {
        "trans_mean": np.mean(transductive_scores),
        "test_mean": np.mean(testing_scores),
        "trans_std": np.std(transductive_scores),
        "test_std": np.std(testing_scores)
    }

In [6]:
from StandardSelfTraining import StandardSelfTraining
from tri_training import TriTraining
from sklearn.model_selection import KFold
import pandas as pd

# All classifiers used for testing
classifiers = [
    TriTraining("TriTraining (KNN)", base_classifiers.KNN),
    TriTraining("TriTraining (NB)", base_classifiers.NB),
    #TriTraining("TriTraining (SVM)", base_classifiers.SVM),
    TriTraining("TriTraining (CART)", base_classifiers.CART),
    StandardSelfTraining("Self-Training (KNN)", base_classifiers.KNN),
    StandardSelfTraining("Self-Training (NB)", base_classifiers.NB),
    #StandardSelfTraining("Self-Training (SVM)", base_classifiers.SVM),
    StandardSelfTraining("Self-Training (CART)", base_classifiers.CART)
]
labeling_rates = [0.10, 0.20, 0.30, 0.40]

#Columns in datasets that are categorical and need o be replaced with hot-one
categorical_columns = [[], [0]]
results = None

for classifier in classifiers:
    print(classifier.name)
    for dataset_name, dataset in datasets.items():
        print("dataset:", dataset_name, "\t")
        for labeling_rate in labeling_rates:
            print("rate:", labeling_rate, end=" ")

            test_info = { "classifier": classifier.name, "dataset":dataset_name, "labeling_rate":labeling_rate}
            cv = KFold(n_splits=10, random_state=42)
            scores = train_and_score(classifier, dataset["X"], dataset["y"], cv, labeling_rate)

            if results is None:
                results = pd.DataFrame([{**test_info, **scores}])
            else:
                results.loc[len(results.index)] = {**test_info, **scores}
    print()
    print("--------")

TriTraining (KNN)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########
dataset: abalone 	
rate: 0.1 ####

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#####
rate: 0.2 

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##
rate: 0.3 #

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#########
rate: 0.4 ###

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


###

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


###

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

--------
TriTraining (NB)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ###

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#######
rate: 0.3 #

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#####

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


####
rate: 0.4 ########

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##
dataset: abalone 	
rate: 0.1 ##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


####
rate: 0.2 

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##
rate: 0.3 #

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#
rate: 0.4 

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


###

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#

--------
TriTraining (CART)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########
dataset: abalone 	
rate: 0.1 ##########
rate: 0.2 ####

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


######
rate: 0.3 #

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


#######

  check = self.filled(0).__eq__(other)
  score = y_true == y_pred


##
rate: 0.4 ##########

--------
Self-Training (KNN)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########
dataset: abalone 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########

--------
Self-Training (NB)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########
dataset: abalone 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########

--------
Self-Training (CART)
dataset: dermatology 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########
dataset: abalone 	
rate: 0.1 ##########
rate: 0.2 ##########
rate: 0.3 ##########
rate: 0.4 ##########

--------


In [7]:
results

Unnamed: 0,classifier,dataset,labeling_rate,test_mean,test_std,trans_mean,trans_std
0,TriTraining (KNN),dermatology,0.1,0.480781,0.052778,0.453492,0.045858
1,TriTraining (KNN),dermatology,0.2,0.647748,0.070364,0.66478,0.020741
2,TriTraining (KNN),dermatology,0.3,0.737913,0.08006,0.740439,0.019958
3,TriTraining (KNN),dermatology,0.4,0.784234,0.08052,0.79939,0.025973
4,TriTraining (KNN),abalone,0.1,0.175739,0.084579,0.189192,0.063644
5,TriTraining (KNN),abalone,0.2,0.114168,0.10332,0.144376,0.09471
6,TriTraining (KNN),abalone,0.3,0.194894,0.057701,0.182568,0.061844
7,TriTraining (KNN),abalone,0.4,0.176231,0.079328,0.166234,0.083732
8,TriTraining (NB),dermatology,0.1,0.364264,0.188874,0.363607,0.233967
9,TriTraining (NB),dermatology,0.2,0.171847,0.099674,0.169632,0.067516


In [8]:
pd.pivot_table(results, values=None, index=['dataset', 'classifier'], columns=['labeling_rate'])

Unnamed: 0_level_0,Unnamed: 1_level_0,test_mean,test_mean,test_mean,test_mean,test_std,test_std,test_std,test_std,trans_mean,trans_mean,trans_mean,trans_mean,trans_std,trans_std,trans_std,trans_std
Unnamed: 0_level_1,labeling_rate,0.1,0.2,0.3,0.4,0.1,0.2,0.3,0.4,0.1,0.2,0.3,0.4,0.1,0.2,0.3,0.4
dataset,classifier,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
abalone,Self-Training (CART),0.182905,0.197023,0.190086,0.199186,0.054273,0.055371,0.043549,0.051497,0.199156,0.20294,0.200358,0.197537,0.009199,0.012675,0.01808,0.014328
abalone,Self-Training (KNN),0.215711,0.21068,0.209482,0.202054,0.063134,0.063786,0.056126,0.058924,0.21748,0.216107,0.213241,0.205958,0.009506,0.008401,0.009116,0.010749
abalone,Self-Training (NB),0.08043,0.085939,0.070136,0.067502,0.027688,0.034251,0.027342,0.027958,0.076111,0.0826,0.076694,0.067753,0.017747,0.01156,0.016192,0.01352
abalone,TriTraining (CART),0.183867,0.197975,0.170664,0.192958,0.049214,0.057287,0.070739,0.049187,0.201253,0.178398,0.173144,0.199486,0.007512,0.060256,0.058505,0.011102
abalone,TriTraining (KNN),0.175739,0.114168,0.194894,0.176231,0.084579,0.10332,0.057701,0.079328,0.189192,0.144376,0.182568,0.166234,0.063644,0.09471,0.061844,0.083732
abalone,TriTraining (NB),0.093609,0.05674,0.063414,0.078755,0.03611,0.061495,0.064562,0.065267,0.064698,0.055394,0.051646,0.062252,0.04324,0.046118,0.051901,0.051021
dermatology,Self-Training (CART),0.788889,0.882658,0.882658,0.91524,0.123208,0.094648,0.074047,0.064454,0.805061,0.90094,0.906704,0.933151,0.068278,0.032633,0.028222,0.021025
dermatology,Self-Training (KNN),0.49467,0.678003,0.724324,0.765165,0.053112,0.08659,0.072453,0.073864,0.453488,0.673888,0.732635,0.796865,0.036719,0.027315,0.025003,0.032399
dermatology,Self-Training (NB),0.28506,0.196396,0.196396,0.196396,0.081652,0.069305,0.069305,0.069305,0.260744,0.194363,0.215726,0.22393,0.054078,0.012955,0.0409,0.035261
dermatology,TriTraining (CART),0.767042,0.860811,0.901727,0.896171,0.119685,0.071722,0.07247,0.078643,0.794259,0.895949,0.915816,0.926552,0.068619,0.029256,0.02721,0.0215
