In [1]:
import pandas as pd
from sklearn import neighbors
from sklearn import svm

In [2]:
#Load Datasets
def load_datasets(dataset_name):
    path_to_datasets="../Datasets/"
    labeling_rate = 10
    partial_path="{0}SSC_{1}labeled/{2}/{2}-{1}-1".format(path_to_datasets,labeling_rate,dataset_name)
    paths={
        "tra":partial_path+"tra.dat",
        "trs":partial_path+"trs.dat",
        "tst":partial_path+"tst.dat"
    }
    dataframes = {
        "tra":pd.read_csv(paths["tra"], header=None, sep=", ", engine="python", comment="@"),
        "trs":pd.read_csv(paths["trs"], header=None, sep=", ", engine="python", comment="@"),
        "tst":pd.read_csv(paths["tst"], header=None, sep=", ", engine="python", comment="@")
    }
    return dataframes

In [3]:
class StandardSelfTraining:
    @staticmethod
    def NN():
        """
        Create Standard Self-Training classifier with NN base classifier"""
        base_clf = neighbors.KNeighborsClassifier(
            n_neighbors=3,
            metric="euclidean",
            n_jobs=2 #Parallelize work on CPUs
        )
        return StandardSelfTraining("KNN", base_clf)
    
    def SMO():
        """
        Create Standard Self-Training classifier with SVM base classifier
        the SVM classifier has been trained using SMO algorithm
        """
        base_clf = svm.SVC(
            C=1.0,
            kernel='poly',
            degree=1,
            tol=0.001,
            #Epsilon parameter missing?
        )
        return StandardSelfTraining("SMO", base_clf)
        
        
    
    def __init__(self, name, base_classifier, max_iterations=40):
        self.name = name
        self.base_classifier = base_classifier
        self.max_iterations = max_iterations
        
    def __str__(self):
        return "Classifier: " +self.name + "\nParameters: " + str(self.base_classifier.get_params());
    
    def fit(self, X, y):
        stable = False
        iteration = 0;
        #Iterate until the result is stable or max_iterations is reached
        while not(stable) and (iteration < self.max_iterations):
            new_y = self._fit_iteration(X, y)
            #Check if the result has stabilised
            stable = (y == new_y).all()
            y = new_y
            iteration += 1
            
    def _fit_iteration(self, X, y):
        clf = self.base_classifier
        #Fit a classifier on already labeled data
        labeled = y != "unlabeled"
        clf.fit(X[labeled], y[labeled])
        #Predict on all the training data
        return clf.predict(X)
    
    def predict(self, X):
        return self.base_classifier.predict(X)
    
    def score(self, X, y):
        return self.base_classifier.score(X, y)

def train_and_score(clf, dataframes,categorical=[]):
    """
    Given a classifier and a datasets
    Trains the classifier on training dataset
    and scores the classifier on transitive and testing datasets
    """
    training = dataframes["tra"]
    
    Xtra = training.iloc[:,:-1]
    ytra = training.iloc[:, -1]
    Xtra = pd.get_dummies(Xtra, columns = categorical )
    clf.fit(Xtra, ytra)
    transitive = dataframes["trs"]
    Xtrs = transitive.iloc[:,:-1]
    ytrs = transitive.iloc[:, -1].astype(str)
    Xtrs = pd.get_dummies(Xtrs, columns = categorical )
    print("Transitive score:", clf.score( Xtrs, ytrs))
    testing = dataframes["tst"]
    Xtst = testing.iloc[:,:-1]
    ytst = testing.iloc[:, -1].astype(str)
    Xtst = pd.get_dummies(Xtst, columns = categorical )
    print("Testing score:", clf.score(Xtst, ytst))

In [4]:
# All classifiers used for testing
classifiers = [
    StandardSelfTraining.NN(),
    StandardSelfTraining.SMO()
]
# All datasets used for testing
dataset_names = ["bupa", "abalone"]
#Columns in datasets that are categorical and need o be replaced with hot-one
categorical_columns = [[], [0]]
for classifier in classifiers:
    print("#########################")
    print(classifier)
    print("--------")
    print("Results:")
    for dataset_name, categorical in zip(dataset_names, categorical_columns):
        print("dataset:", dataset_name)
        dataframes = load_datasets(dataset_name)          
        train_and_score(classifier, dataframes, categorical=categorical)
    print()


#########################
Classifier: KNN
Parameters: {'leaf_size': 30, 'metric': 'euclidean', 'algorithm': 'auto', 'p': 2, 'n_neighbors': 3, 'metric_params': None, 'weights': 'uniform', 'n_jobs': 2}
--------
Results:
dataset: bupa
Transitive score: 0.616129032258
Testing score: 0.6
dataset: abalone
Transitive score: 0.222044302108
Testing score: 0.16985645933

#########################
Classifier: SMO
Parameters: {'gamma': 'auto', 'class_weight': None, 'kernel': 'poly', 'verbose': False, 'degree': 1, 'probability': False, 'tol': 0.001, 'cache_size': 200, 'decision_function_shape': None, 'C': 1.0, 'random_state': None, 'shrinking': True, 'max_iter': -1, 'coef0': 0.0}
--------
Results:
dataset: bupa
Transitive score: 0.683870967742
Testing score: 0.6
dataset: abalone
Transitive score: 0.191353082466
Testing score: 0.188995215311



Results reported in paper:
     transitive | testing
     
KNN:
bupa:    0.5471   0.5314
abalone: 0.2223   0.1725

SMO:
bupa:    0.6089   0.6330
abalone: 0.2174   0.2168
