In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
class base_classifiers:
    KNN = KNeighborsClassifier(
        n_neighbors=3,
        metric="euclidean",
        #n_jobs=2  # Parallelize work on CPUs
    )
    NB = GaussianNB(
        priors=None
    )
    SVM = SVC(
        C=1.0,
        kernel='poly',
        degree=1,
        tol=0.001,
        # Epsilon parameter missing?
    )
    CART = DecisionTreeClassifier(
        criterion='entropy',
        # splitter='best',
        # max_depth=None,
        # min_samples_split=2,
        min_samples_leaf=2,
        # min_weight_fraction_leaf=0.0,
        # max_features=None,
        # random_state=None,
        # max_leaf_nodes=None,
        # min_impurity_split=1e-07,
        # class_weight=None,
        # presort=False,
    )

In [2]:
%matplotlib inline
import pandas as pd
from StandardSelfTraining import StandardSelfTraining
from tri_training import TriTraining

#Set the random seed
import random
import numpy as np
random.seed(123)
np.random.seed(123)

#Load classification dataset
from sklearn.datasets import fetch_mldata
data = fetch_mldata("abalone")

def to_dataframe(data):
    return pd.DataFrame(np.c_[data.data, data.target])

#Display the first few lines of data
to_dataframe(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15.0
1,1.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7.0
2,2.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9.0
3,1.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10.0
4,3.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7.0


In [3]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
data.data[:,1:-1] = MinMaxScaler().fit_transform(data.data[:,1:-1])
to_dataframe(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.15,15.0
1,1.0,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.07,7.0
2,2.0,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.21,9.0
3,1.0,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.155,10.0
4,3.0,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.055,7.0


In [4]:
#Encode Sex feature using OneHotEncoder
data.data = OneHotEncoder(categorical_features=[0], sparse=False).fit_transform(data.data)
to_dataframe(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.0,0.0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.15,15.0
1,1.0,0.0,0.0,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.07,7.0
2,0.0,1.0,0.0,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.21,9.0
3,1.0,0.0,0.0,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.155,10.0
4,0.0,0.0,1.0,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.055,7.0


In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
preprocessing_pipe = make_pipeline(
    #OneHotEncoder on "Sex" feature
    OneHotEncoder(categorical_features=[0], sparse=False),
    #Scale all from 0 to 1
    MinMaxScaler())
X = preprocessing_pipe.fit_transform(data.data)
y = data.target

clf = DummyClassifier()
scores = cross_val_score(clf, X, y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#clf = sklearn.linear_model.LogisticRegression()
#scores = cross_val_score(clf, X, y, cv=10)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.10 (+/- 0.03)




In [6]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
kf.get_n_splits(X)

10

In [13]:
from sklearn.model_selection import train_test_split

def _training_scoring_iteration(clf, X, y, training_index, test_index):
    """ One iteration of training and scoring on given data"""
    #Testing set is set aside.. - 1/10th of the data
    X_test, y_test = X[test_index], y[test_index]

    #For generating a testing and transductive set
    split_data = train_test_split(
        X[training_index],
        y[training_index],
        test_size=0.10,
        random_state=42
    )
    (X_unlabeled, X_labeled, y_unlabeled, y_labeled) = split_data

    #Training set - 9/10 of data
    X_train = np.concatenate((X_labeled, X_unlabeled))
    y_train = np.concatenate((
        y_labeled.astype(str),
        np.full_like(y_unlabeled.astype(str), "unlabeled")
    ))
    
    #Train the classifier
    clf.fit(X_train, y_train)
    
    #Score the classifier
    training_score = clf.score(X_train, y[training_index].astype(str))
    transductive_score = clf.score(X_unlabeled, y_unlabeled.astype(str))
    testing_score = clf.score(X_test, y_test.astype(str))
    
    return training_score, transductive_score, testing_score
    
def train_and_score(clf, X, y, cv):
    """ Collects scores using CV for splitting the data in different folds"""
    training_scores = []
    transductive_scores = []
    testing_scores = []
    for training_index, test_index in cv.split(X, y):
        training_score, transductive_score, testing_score = _training_scoring_iteration(clf, X, y, training_index, test_index)
        
        training_scores.append(training_score)
        transductive_scores.append(transductive_score)
        testing_scores.append(testing_score)
        print("#", end="")
    print()
    return {
        "training": training_scores,
        "transductive": transductive_scores,
        "testing": testing_scores
    }

#clf = StandardSelfTraining("Self-Training (SVM)", base_classifiers.SVM)

clf = TriTraining("TriTraining (SVM)", base_classifiers.SVM)

scores = train_and_score(clf, X, y, cv=kf)

inc 295
same 376
error 0.7845744680851063
inc 278
same 247
error 1.125506072874494
inc 278
same 247
error 1.125506072874494
#inc 269
same 269
error 1.0
inc 273
same 268
error 1.0186567164179106
inc 290
same 362
error 0.8011049723756906
#inc 259
same 111
error 2.3333333333333335
inc 280
same 259
error 1.0810810810810811
inc 259
same 113
error 2.2920353982300883
#inc 281
same 262
error 1.0725190839694656
inc 257
same 115
error 2.234782608695652
inc 277
same 229
error 1.2096069868995634
#inc 255
same 121
error 2.1074380165289255
inc 281
same 260
error 1.0807692307692307
inc 274
same 237
error 1.1561181434599157
#inc 293
same 312
error 0.9391025641025641
inc 274
same 212
error 1.2924528301886793
inc 283
same 256
error 1.10546875
#inc 248
same 40
error 6.2
inc 241
same 0
inc 301
same 336
error 0.8958333333333334
#inc 253
same 147
error 1.7210884353741496
inc 253
same 147
error 1.7210884353741496
inc 294
same 304
error 0.9671052631578947
#inc 299
same 355
error 0.8422535211267606
inc 299
sam

In [8]:
scores_df = pd.DataFrame(scores)
print(scores_df)
print(scores_df.describe())

    testing  training  transductive
0  0.126794  0.161479      0.193911
1  0.078947  0.168928      0.199231
2  0.222488  0.159351      0.183860
3  0.251196  0.152168      0.180609
4  0.215311  0.135408      0.195684
5  0.114833  0.158021      0.204848
6  0.275120  0.133546      0.188886
7  0.179856  0.156117      0.196217
8  0.179856  0.130851      0.179669
9  0.213429  0.142287      0.205969
         testing   training  transductive
count  10.000000  10.000000     10.000000
mean    0.185783   0.149816      0.192889
std     0.062575   0.013306      0.009413
min     0.078947   0.130851      0.179669
25%     0.140060   0.137128      0.185117
50%     0.196643   0.154143      0.194798
75%     0.220694   0.159018      0.198478
max     0.275120   0.168928      0.205969
