In [1]:
import pandas as pd
from StandardSelfTraining import StandardSelfTraining

In [2]:
# Root directory that holds the "SSC_<rate>labeled" dataset folders. Keep the
# trailing slash: the rest of the path is appended by plain string formatting.
path_to_datasets = "../Datasets/"

# Names of the datasets evaluated in this notebook
dataset_names = [
    "bupa",
    "abalone",
]

# Labeling rates evaluated for every dataset: 10, 20, 30 and 40
labeling_rates = list(range(10, 50, 10))

def load_dataset(path):
    """Load one dataset file into a DataFrame.

    The file is read without a header row, so columns get integer labels.
    Everything from an "@" character to the end of a line is treated as a
    comment, and lines that consist only of a comment are skipped
    (presumably this is what drops the "@..." metadata header of the .dat
    files -- confirm against the dataset format).

    Fields are separated by a comma surrounded by any amount of whitespace,
    so "1, 2", "1,2" and "1 , 2" all yield the same two fields. A literal
    ", " separator would silently merge fields whenever the space is missing.

    Parameters
    ----------
    path : str or file-like
        Location of the file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per data line, one column per field.
    """
    # A multi-character separator is interpreted as a regular expression,
    # which requires the (slower) python parsing engine.
    return pd.read_csv(path, header=None, sep=r"\s*,\s*", engine="python", comment="@")

def load_datasets(dataset_name, labeling_rate=10, partition="10-1"):
    """Load the three splits of one dataset: training, transitive and testing.

    Files are looked up at
    ``<path_to_datasets>SSC_<labeling_rate>labeled/<dataset_name>/<dataset_name>-<partition><split>.dat``
    where ``<split>`` is "tra", "trs" or "tst".

    Parameters
    ----------
    dataset_name : str
        Name of the dataset; used both as folder name and as file name prefix.
    labeling_rate : int, optional
        Selects the ``SSC_<labeling_rate>labeled`` folder. Defaults to 10.
    partition : str, optional
        Partition identifier embedded in the file names. Defaults to "10-1",
        the only partition that used to be reachable (presumably fold 1 of a
        10-fold split -- confirm against the dataset layout).

    Returns
    -------
    dict
        Maps "tra", "trs" and "tst" to the DataFrame returned by
        ``load_dataset`` for the corresponding file.
    """
    partial_path = "{0}SSC_{1}labeled/{2}/{2}-{3}".format(
        path_to_datasets, labeling_rate, dataset_name, partition)
    return {split: load_dataset(partial_path + split + ".dat")
            for split in ("tra", "trs", "tst")}

In [3]:
def _split_features_labels(frame, categorical, labels_as_str=False):
    """Split a dataset into one-hot encoded features and its label column (the last column)."""
    features = pd.get_dummies(frame.iloc[:, :-1], columns=list(categorical))
    labels = frame.iloc[:, -1]
    if labels_as_str:
        labels = labels.astype(str)
    return features, labels


def train_and_score(clf, dataframes,categorical=[]):
    """
    Train a classifier on the training split and score it on the other two.

    Parameters
    ----------
    clf : object
        Classifier exposing ``fit(X, y)`` and ``score(X, y)``.
    dataframes : dict
        Maps "tra" (training), "trs" (transitive) and "tst" (testing) to
        DataFrames whose last column holds the label and whose remaining
        columns hold the features.
    categorical : list, optional
        Labels of the feature columns to one-hot encode. The default list is
        only read, never modified.

    Returns
    -------
    tuple
        ``(transitive_score, testing_score)`` as returned by ``clf.score``.

    Notes
    -----
    Each split is one-hot encoded on its own, so a category that is missing
    from a split (or appears only there) would produce feature columns that
    differ from the ones the classifier was fitted on. The scored splits are
    therefore aligned to the training columns: dummy columns absent from a
    split are filled with 0, and columns unknown to the training split are
    dropped.
    """
    # Training labels are passed through exactly as loaded; only the labels of
    # the scored splits are converted to strings.
    # NOTE(review): presumably the training labels are already strings -- confirm.
    Xtra, ytra = _split_features_labels(dataframes["tra"], categorical)
    clf.fit(Xtra, ytra)

    scores = []
    for split in ("trs", "tst"):
        X, y = _split_features_labels(dataframes[split], categorical, labels_as_str=True)
        # Same columns, in the same order, as seen during fit
        X = X.reindex(columns=Xtra.columns, fill_value=0)
        scores.append(clf.score(X, y))
    return tuple(scores)

In [4]:
# All classifiers used for testing
classifiers = [
    StandardSelfTraining.KNN(),
    StandardSelfTraining.SMO(),
    StandardSelfTraining.CART()
]

# Columns in each dataset that are categorical and need to be replaced with a
# one-hot encoding. Entries are matched by position with dataset_names.
categorical_columns = [[], [0]]
# zip() stops at the shorter list, which would silently skip datasets
assert len(categorical_columns) == len(dataset_names), \
    "categorical_columns needs exactly one entry per dataset"

result_columns = ['classifier', 'dataset', 'labeling_rate', "transitive_accuracy", "testing_accuracy"]

# Collect plain rows and build the DataFrame once at the end. Appending to an
# empty frame row by row re-allocates on every insert and turns the integer
# labeling rates into floats (10 -> 10.0).
rows = []
for classifier in classifiers:
    # NOTE(review): the same classifier instance is re-fitted for every dataset
    # and labeling rate; this assumes fit() starts from scratch -- confirm in
    # StandardSelfTraining.
    print(classifier)
    print("--------")
    for dataset_name, categorical in zip(dataset_names, categorical_columns):
        print("dataset:", dataset_name)
        for labeling_rate in labeling_rates:
            # One progress mark per labeling rate
            print("#", end="")
            dataframes = load_datasets(dataset_name, labeling_rate)
            transitive_score, testing_score = train_and_score(classifier, dataframes, categorical=categorical)
            rows.append((classifier.name, dataset_name, labeling_rate, transitive_score, testing_score))
        print()
    print()

# One row per (classifier, dataset, labeling rate) combination
results = pd.DataFrame(rows, columns=result_columns)

Classifier: Self-Training (KNN)
Parameters: {'n_neighbors': 3, 'leaf_size': 30, 'metric_params': None, 'n_jobs': 2, 'algorithm': 'auto', 'p': 2, 'metric': 'euclidean', 'weights': 'uniform'}
--------
dataset: bupa
####
dataset: abalone
####

Classifier: Self-Training (SVM)
Parameters: {'verbose': False, 'cache_size': 200, 'shrinking': True, 'max_iter': -1, 'probability': False, 'decision_function_shape': None, 'random_state': None, 'degree': 1, 'tol': 0.001, 'gamma': 'auto', 'C': 1.0, 'kernel': 'poly', 'coef0': 0.0, 'class_weight': None}
--------
dataset: bupa
####
dataset: abalone
####

Classifier: Self-Training (CART)
Parameters: {'max_features': None, 'min_samples_leaf': 2, 'presort': False, 'min_impurity_split': 1e-07, 'random_state': None, 'criterion': 'entropy', 'max_depth': None, 'splitter': 'best', 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'max_leaf_nodes': None, 'class_weight': None}
--------
dataset: bupa
####
dataset: abalone
####



Results reported in paper:
     transitive | testing
     
KNN:
bupa:    0.5471   0.5314
abalone: 0.2223   0.1725

SMO:
bupa:    0.6089   0.6330
abalone: 0.2174   0.2168


In [5]:
# Preview the first five result rows
results.head(n=5)

Unnamed: 0,classifier,dataset,labeling_rate,transitive_accuracy,testing_accuracy
0,Self-Training (KNN),bupa,10.0,0.616129,0.6
1,Self-Training (KNN),bupa,20.0,0.667742,0.514286
2,Self-Training (KNN),bupa,30.0,0.670968,0.542857
3,Self-Training (KNN),bupa,40.0,0.7,0.514286
4,Self-Training (KNN),abalone,10.0,0.222044,0.169856


In [6]:
# One row per (classifier, dataset) pair and one column per labeling rate for
# each accuracy measure; cells use pivot_table's default aggregation.
results.pivot_table(index=['classifier', 'dataset'], columns=['labeling_rate'])

Unnamed: 0_level_0,Unnamed: 1_level_0,transitive_accuracy,transitive_accuracy,transitive_accuracy,transitive_accuracy,testing_accuracy,testing_accuracy,testing_accuracy,testing_accuracy
Unnamed: 0_level_1,labeling_rate,10.0,20.0,30.0,40.0,10.0,20.0,30.0,40.0
classifier,dataset,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Self-Training (CART),abalone,0.233253,0.303895,0.321248,0.296109,0.169856,0.222488,0.191388,0.191388
Self-Training (CART),bupa,0.651613,0.670968,0.687097,0.73871,0.685714,0.6,0.571429,0.685714
Self-Training (KNN),abalone,0.222044,0.243597,0.278859,0.309701,0.169856,0.177033,0.191388,0.215311
Self-Training (KNN),bupa,0.616129,0.667742,0.670968,0.7,0.6,0.514286,0.542857,0.514286
Self-Training (SVM),abalone,0.191353,0.191569,0.194615,0.195096,0.188995,0.188995,0.184211,0.184211
Self-Training (SVM),bupa,0.683871,0.696774,0.690323,0.664516,0.6,0.542857,0.514286,0.6
