In [46]:
import pandas as pd
from sklearn import neighbors
from sklearn import svm

In [2]:
#Load Datasets
def load_datasets(dataset_name):
    path_to_datasets="../Datasets/"
    labeling_rate = 10
    partial_path="{0}SSC_{1}labeled/{2}/{2}-{1}-1".format(path_to_datasets,labeling_rate,dataset_name)
    paths={
        "tra":partial_path+"tra.dat",
        "trs":partial_path+"trs.dat",
        "tst":partial_path+"tst.dat"
    }
    dataframes = {
        "tra":pd.read_csv(paths["tra"], header=None, sep=", ", engine="python", comment="@"),
        "trs":pd.read_csv(paths["trs"], header=None, sep=", ", engine="python", comment="@"),
        "tst":pd.read_csv(paths["tst"], header=None, sep=", ", engine="python", comment="@")
    }
    return dataframes

In [43]:
class StandardSelfTraining:
    @staticmethod
    def NN():
        """Create Standard Self-Training classifier with NN base classifier"""
        base_clf = neighbors.KNeighborsClassifier(1, weights='uniform')
        return StandardSelfTraining(base_clf)
    
    def __init__(self, base_classifier, max_iterations=40):
        self.base_classifier = base_classifier
        self.max_iterations = max_iterations
    
    def fit(self, X, y):
        stable = False
        iteration = 0;
        print("Training: ",end="")
        #Iterate until the result is stable or max_iterations is reached
        while not(stable) and (iteration < self.max_iterations):
            new_y = self._fit_iteration(X, y)
            #Check if the result has stabilised
            stable = (y == new_y).all()
            y = new_y
            iteration += 1
            
    def _fit_iteration(self, X, y):
        clf = self.base_classifier
        #Fit a classifier on already labeled data
        labeled = y != "unlabeled"
        clf.fit(X[labeled], y[labeled])
        #Predict on all the training data
        return clf.predict(X)
    
    def predict(self, X):
        return self.base_classifier.predict(X)
    
    def score(self, X, y):
        return self.base_classifier.score(X, y)

def train_and_score(clf, dataframes,categorical=[]):
    training = dataframes["tra"]
    
    
    Xtra = training.iloc[:,:-1]
    ytra = training.iloc[:, -1]
    Xtra = pd.get_dummies(Xtra, columns = categorical )
    clf.fit(Xtra, ytra)
    transitive = dataframes["trs"]
    Xtrs = transitive.iloc[:,:-1]
    ytrs = transitive.iloc[:, -1].astype(str)
    Xtrs = pd.get_dummies(Xtrs, columns = categorical )
    print("Training score:", clf.score( Xtrs, ytrs))
    testing = dataframes["tst"]
    Xtst = testing.iloc[:,:-1]
    ytst = testing.iloc[:, -1].astype(str)
    Xtst = pd.get_dummies(Xtst, columns = categorical )
    print("Testing score:", clf.score(Xtst, ytst))

In [47]:
classifiers = [
    StandardSelfTraining.NN(),
    StandardSelfTraining(svm.SVC())
]
dataset_names = ["bupa", "abalone"]
categorical_columns = [[], [0]]
for classifier in classifiers:
    print("## Classifier:", classifier)
    for dataset_name, categorical in zip(dataset_names, categorical_columns):
        print("dataset:", dataset_name)
        dataframes = load_datasets(dataset_name)          
        train_and_score(classifier, dataframes, categorical=categorical)


## Classifier: <__main__.StandardSelfTraining object at 0x7f0c64a807b8>
dataset: bupa
Training: Training score: 0.635483870968
Testing score: 0.514285714286
dataset: abalone
Training: Training score: 0.273285294903
Testing score: 0.193779904306
## Classifier: <__main__.StandardSelfTraining object at 0x7f0c64a80828>
dataset: bupa
Training: Training score: 0.61935483871
Testing score: 0.571428571429
dataset: abalone
Training: Training score: 0.199893247932
Testing score: 0.198564593301
