In [7]:
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [128]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

In [4]:
dataset.head(2)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa


In [129]:
dataset.shape

(150, 5)

In [130]:
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = \
model_selection.train_test_split(X, 
                                 Y, 
                                 test_size=validation_size, 
                                 random_state=seed)

In [103]:
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))

In [13]:
# evaluate each model in turn
seed = 7
scoring = 'accuracy'
results = []
names = []
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.966667 (0.040825)
LDA: 0.975000 (0.038188)
KNN: 0.983333 (0.033333)
CART: 0.975000 (0.038188)
NB: 0.975000 (0.053359)
SVM: 0.991667 (0.025000)


In [14]:
# Make predictions on validation dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

0.9
[[ 7  0  0]
 [ 0 11  1]
 [ 0  2  9]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         7
Iris-versicolor       0.85      0.92      0.88        12
 Iris-virginica       0.90      0.82      0.86        11

    avg / total       0.90      0.90      0.90        30



# Wrong Label Correction

In [98]:
import random
from sklearn.utils import shuffle

In [143]:
def load_dataset():
    url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
    names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
    dataset = pandas.read_csv(url, names=names)
    return dataset

In [173]:
dataset = load_dataset()

In [119]:
dataset.head(2)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa


In [100]:
labels = list(dataset['class'].unique())
labels

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

In [101]:
dataset = shuffle(dataset)
dataset = dataset.reset_index(drop=True)

In [87]:
labels = list(dataset['class'].unique())
def make_wrong(dataset, num_of_wrongs):
    change_indexes = random.sample(range(0,len(dataset)+1), num_of_wrongs)
    print('change_indexes :\n', change_indexes)
    trues = []
    wrongs = []
    for i in change_indexes:
        true_label = dataset.at[i , 'class']
        trues.append(true_label)
        wrong_label = random.choice([i for i in labels if i != true_label])
        wrongs.append(wrong_label)
        dataset.at[i , 'class'] = wrong_label
    print('trues :\n', trues)
    print('wrongs :\n', wrongs)
    return dataset

In [156]:
def form_models():
    models = {}
    models['LR'] = LogisticRegression(solver='liblinear', multi_class='ovr')
    models['LDA'] = LinearDiscriminantAnalysis()
    models['KNN'] = KNeighborsClassifier()
    models['CART'] = DecisionTreeClassifier()
    models['NB'] = GaussianNB()
    models['SVM'] = SVC(gamma='auto')
    return models
m = form_models()

In [157]:
len(m)

6

In [None]:
num_of_features = 4
array = dataset.values
X = array[:,0:num_of_features]
Y = array[:,num_of_features]

In [149]:
shuffled_dataset = shuffle(dataset)

In [150]:
shuffled_dataset[:10]

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
29,4.7,3.2,1.6,0.2,Iris-setosa
125,7.2,3.2,6.0,1.8,Iris-virginica
55,5.7,2.8,4.5,1.3,Iris-versicolor
66,5.6,3.0,4.5,1.5,Iris-versicolor
131,7.9,3.8,6.4,2.0,Iris-virginica
31,5.4,3.4,1.5,0.4,Iris-setosa
63,6.1,2.9,4.7,1.4,Iris-versicolor
87,6.3,2.3,4.4,1.3,Iris-versicolor
10,5.4,3.7,1.5,0.2,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa


In [151]:
t = shuffled_dataset[:10]

In [153]:
X_test = t[t.columns[0:4]]
X_test

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width
29,4.7,3.2,1.6,0.2
125,7.2,3.2,6.0,1.8
55,5.7,2.8,4.5,1.3
66,5.6,3.0,4.5,1.5
131,7.9,3.8,6.4,2.0
31,5.4,3.4,1.5,0.4
63,6.1,2.9,4.7,1.4
87,6.3,2.3,4.4,1.3
10,5.4,3.7,1.5,0.2
37,4.9,3.1,1.5,0.1


In [155]:
for i, index in enumerate(list(X_test.index)):
    print(i,index)

0 29
1 125
2 55
3 66
4 131
5 31
6 63
7 87
8 10
9 37


In [161]:
from collections import defaultdict, Counter

In [162]:
d = defaultdict(list)
d[1].append('a')
d

defaultdict(list, {1: ['a']})

In [166]:
d[1].append('a')
d[1].append('b')

In [279]:
t = pandas.read_csv('/Users/muratyalcin/Downloads/train.csv')
t.head(2)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
import pandas, random
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from collections import defaultdict, Counter
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from sklearn.utils import shuffle


class correct_labels:
    
    def __init__(self,
                 dataset,
                 label_column_name:str, 
                 num_of_wrongs, 
                 repeats, 
                 split_rate = 4, # 3/4 train, 1/4 predict
                 iris = None,
                 mnist = None):
        self.dataset = dataset
        self.label_column_name = label_column_name
        self.split_rate = split_rate 
        self.num_of_wrongs = num_of_wrongs
        self.repeats = repeats
        self.models = self.form_models()
        #if iris:
        #    assert mnist is None
        #    self.dataset = self.load_iris_dataset()
        #if mnist:
        #    self.dataset = self.load_mnist_dataset()
        self.num_of_features = self.dataset.shape[1]-1
        self.labels = list(self.dataset[label_column_name].unique())
        self.num_of_labels = len(self.labels)
        
    
    def load_iris_dataset(self):
        url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
        names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
        dataset = pandas.read_csv(url, names=names)
        return dataset
    
    def load_mnist_dataset(self):
        t = pandas.read_csv('/Users/muratyalcin/Downloads/train.csv')
        cols = list(t.columns)
        cols = cols[1:] + [cols[0]]
        dataset = t[cols]
        return dataset
    
    def shuffle_dataset(self, dataset):
        shuffled_dataset = shuffle(dataset)
        #shuffled_dataset = shuffled_dataset.reset_index(drop=True)
        return shuffled_dataset
        
    def make_wrong(self, dataset):
        change_indexes = random.sample(range(0,len(dataset)+1),
                                       self.num_of_wrongs)
        trues = []
        wrongs = []
        wrong_dataset = dataset.copy()
        for i in change_indexes:
            true_label = self.dataset.at[i , self.label_column_name]
            trues.append(true_label)
            wrong_label = random.choice([i for i in self.labels if i != true_label])
            wrongs.append(wrong_label)
            wrong_dataset.at[i , self.label_column_name] = wrong_label
        return wrong_dataset, trues, wrongs, change_indexes
    
    def split_dataset(self, dataset):
        split_ = int(len(dataset)/self.split_rate)
        train = dataset[split_:]
        test = dataset[:split_]
        train_ = train.values
        X_train = train_[:,0:self.num_of_features]
        y_train = train_[:,self.num_of_features]
        test_ = test.values
        X_test = test_[:,0:self.num_of_features]
        y_test = test_[:,self.num_of_features]
        return X_train, y_train, X_test, y_test, split_, test
    
    def form_models(self):
        models = {}
        #models['LR'] = LogisticRegression(solver='liblinear', multi_class='ovr')
        #models['LDA'] = LinearDiscriminantAnalysis()
        #models['KNN'] = KNeighborsClassifier()
        models['CART'] = DecisionTreeClassifier()
        models['RF'] = RandomForestClassifier()
        #models['NB'] = GaussianNB()
        #models['SVM'] = SVC(gamma='auto')
        #models['baseline'] = self.baseline_model()
        return models
    
    def baseline_model(self):
        # create model
        model = Sequential()
        model.add(Dense(self.num_of_features, input_dim=self.num_of_features, kernel_initializer='normal', activation='relu'))
        model.add(Dense(self.num_of_labels, kernel_initializer='normal', activation='softmax'))
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
        
    
    def fit_predict(self, model, X_train, y_train, X_test):
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        return predictions
        
    def multi_model_predict(self, X_train, y_train, X_test):
        preds = []
        for model in list(self.models.values()):
            predictions = self.fit_predict(model, X_train, y_train, X_test)
            preds.append(predictions)
        return preds
            
    def handle_tracker(self, tracker, wrong_dataset):
        wrong_data_labels = list(wrong_dataset[self.label_column_name])
        assert len(list(self.dataset[self.label_column_name])) == \
        len(list(wrong_dataset[self.label_column_name]))
        item_preds = defaultdict()
        model_guess = []
        for i in range(len(self.dataset)):
            if tracker[i]:
                item_preds[i] = max(Counter(tracker[i]), key=Counter(tracker[i]).get) 
                model_guess.append(max(Counter(tracker[i]), key=Counter(tracker[i]).get))
            else:
                item_preds[i] = wrong_data_labels[i]
                model_guess.append(wrong_data_labels[i])
        return item_preds, model_guess
   
    def compare(self, model_guess):
        actuals = list(self.dataset[self.label_column_name])
        predicted = model_guess 
        corrects = [i for i, j in enumerate(zip(actuals, model_guess)) if j[0] == j[1]]
        wrongs = [i for i, j in enumerate(zip(actuals, model_guess)) if j[0] != j[1]]
        return corrects, wrongs
 
    def evaluate(self, corrects, change_indexes, wrongs):
        return {
            'data length' : len(self.dataset),
            'split rate' : self.split_rate,
            'repeats' : self.repeats,
            'total wrongs start' : self.num_of_wrongs,
            'number of corrects' : len(corrects),
            'number of wrongs' : len(wrongs),
            'number of wrong indexes'  : len(change_indexes),
            'number of corrected' : len(set(change_indexes) & set(corrects)),
            'number of missed' : len(set(change_indexes) & set(wrongs)), 
            'number of wronged' : len((set(change_indexes) | set(wrongs)) - set(change_indexes))   
        }
        
    def correct_wrong_labels(self):
        tracker = defaultdict(list)
        wrong_dataset, trues, wrongs, change_indexes = self.make_wrong(self.dataset)
        for i in range(self.repeats):
            dataset = self.shuffle_dataset(wrong_dataset)
            
            X_train, y_train, X_test, y_test, split_, test = self.split_dataset(wrong_dataset)
            preds = self.multi_model_predict(X_train, y_train, X_test)
            num_models = len(self.models)
            assert len(preds[0]) == split_
            y_indexes = list(test.index)
            for x in range(num_models):
                for i, index in enumerate(y_indexes):
                    tracker[index].append(preds[x][i])
        item_preds, model_guess = self.handle_tracker(tracker, wrong_dataset)
        corrects, wrongs = self.compare(model_guess)
        result = self.evaluate(corrects, change_indexes, wrongs)
        print('result : ', result) 
        return result

In [None]:
def load_mnist_dataset():
    t = pandas.read_csv('/Users/muratyalcin/Downloads/train.csv')
    cols = list(t.columns)
    cols = cols[1:] + [cols[0]]
    dataset = t[cols]
    return dataset
num_of_wrongs = [100, 500, 1000]
repeats = [1000, 10000]
split_rate = [5, 10, 20, 40, 50, 70, 100, 200]
results = []
print('loading dataset...')
dataset = load_mnist_dataset()
print('experiment started...')
for i in num_of_wrongs:
    for j in repeats:
        for k in split_rate:
            cl = correct_labels(dataset = dataset,
                                label_column_name = 'label', 
                                num_of_wrongs = i, 
                                repeats = j, 
                                split_rate = k,
                               mnist = True)
            print('\ncombination : \n', (i, j, k) , '\n')
            result = cl.correct_wrong_labels()
            results.append(result)

loading dataset...
experiment started...

combination : 
 (100, 1000, 40) 

result :  {'data length': 42000, 'split rate': 40, 'repeats': 1000, 'total wrongs start': 100, 'number of corrects': 41831, 'number of wrongs': 169, 'number of wrong indexes': 100, 'number of corrected': 3, 'number of missed': 97, 'number of wronged': 72}

combination : 
 (100, 1000, 50) 

result :  {'data length': 42000, 'split rate': 50, 'repeats': 1000, 'total wrongs start': 100, 'number of corrects': 41840, 'number of wrongs': 160, 'number of wrong indexes': 100, 'number of corrected': 1, 'number of missed': 99, 'number of wronged': 61}

combination : 
 (100, 1000, 70) 

result :  {'data length': 42000, 'split rate': 70, 'repeats': 1000, 'total wrongs start': 100, 'number of corrects': 41859, 'number of wrongs': 141, 'number of wrong indexes': 100, 'number of corrected': 0, 'number of missed': 100, 'number of wronged': 41}

combination : 
 (100, 1000, 100) 



In [3]:
pandas.DataFrame(results)

Unnamed: 0,data length,number of corrected,number of corrects,number of missed,number of wrong indexes,number of wronged,number of wrongs,repeats,split rate,total wrongs start
0,42000,3,40805,2,5,1193,1195,10,3,5
1,42000,3,40858,12,15,1130,1142,10,3,15


In [263]:
cl = correct_labels('class', num_of_wrongs = 5, repeats = 10000, split_rate = 5)

In [264]:
cl.correct_wrong_labels()

result :  {'data length': 150, 'number of corrects': 148, 'number of wrongs': 2, 'number of wrong indexes': 5, 'number of corrected': 3, 'number of missed': 2, 'number of wronged': 0}


In [8]:
def load():
    t = pandas.read_csv('/Users/muratyalcin/Downloads/train.csv')
    cols = list(t.columns)
    cols = cols[1:] + [cols[0]]
    dataset = t[cols]
    return dataset

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
t.shape[1]

785

In [11]:
cols = list(t.columns)
cols = cols[1:] + [cols[0]]
df = t[cols]
df.head(2)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#from keras.datasets import mnist
#(X_train, y_train), (X_test, y_test) = mnist.load_data()