# Handgeschriebene Ziffern

Der beiliegenden Beschreibung der Daten habe ich entnommen, dass es sich um handgeschriebene Ziffern handelt, die mit 16x16 Bildpunkten abgespeichert sind. Der Beschreibung kann ich weiter entnehmen:

*"The data are in two gzipped files, and each line consists of the digit
id (0-9) followed by the 256 grayscale values."*

Es handelt sich also um eine Textdatei, in der in jeder Zeile der Wert der geschriebenen Ziffer sowie die 16x16 = 256 Bildpunkte stehen.

---

In [None]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

In [None]:
# Stand-alone Gaussian mixture model with 10 components and one full
# covariance matrix per component.
# NOTE(review): `gmm` is not referenced by any other cell visible in this
# notebook section — confirm whether it is still needed.
gmm = GaussianMixture(n_components=10, covariance_type='full')

---

In [None]:
def getNumber(data, digit):
    """Return the pixel columns of all rows whose label equals ``digit``.

    Column ``0`` of ``data`` holds the digit label; every remaining column is
    returned unchanged for the matching rows (original index and column
    labels are kept).
    """
    # boolean mask over the label column, then drop that column by position
    is_requested_digit = data[0] == digit
    return data[is_requested_digit].iloc[:, 1:]


In [None]:
def getNumberImage(data, digit, aggregation):
    """Return one aggregated 16x16 image for ``digit``.

    All rows of ``data`` belonging to ``digit`` are fetched via ``getNumber``
    and reduced column-wise: with the median when ``aggregation`` is
    ``'median'``, with the mean for any other value. The resulting 256 values
    are reshaped into a (16, 16) array.
    """
    digit_pixels = getNumber(data, digit)

    # any aggregation other than 'median' falls back to the mean
    if aggregation == 'median':
        pixel_profile = digit_pixels.median()
    else:
        pixel_profile = digit_pixels.mean()

    # one value per pixel column -> 16x16 image
    return pd.DataFrame(pixel_profile).values.reshape(16, 16)

In [None]:
class Experiment:
    """Train several clustering configurations and score them against known digits.

    An experiment stores named input data sets and named configurations. A
    configuration couples an algorithm object (anything offering ``fit(X)``
    and ``predict(X)``) with the name of the input data it is trained on.
    Predictions are written into ``results`` as one column per configuration
    name, next to the ``targets`` column.
    """

    # Number of digit classes; cluster ids are expected in the same 0..9 range.
    N_CLASSES = 10

    def __init__(self, targets):
        """Create an experiment whose ``results`` frame starts with ``targets``."""
        self.results = pd.DataFrame(targets, columns=["targets"])
        self.input_data = {}
        self.names = []
        self.configs = []

    @staticmethod
    def _is_fitted(conf):
        """Return True once ``conf['fit']`` no longer holds the ``-1`` placeholder."""
        fit_state = conf['fit']
        return not (isinstance(fit_state, int) and fit_state == -1)

    def get_names(self):
        """Return the list of registered configuration names."""
        return self.names

    def get_results(self):
        """Return the results frame (targets plus one column per prediction)."""
        return self.results

    def get_config_map(self):
        """Print name, input data name and feature count of every configuration."""
        for conf in self.configs:
            n_features = self.input_data[conf['input_data_name']]['data'].shape[1]
            print("CONFIG:", conf['name'], conf['input_data_name'], "\tNum features:", n_features)

    def add_input_data(self, name, data, description=""):
        """Register ``data`` under ``name`` (an existing entry is overwritten)."""
        self.input_data[name] = {'description': description, 'data': data}

    def add_config(self, algo, name, input_data_name):
        """Register ``algo`` under ``name``, to be trained on ``input_data_name``.

        Nothing is registered (an error line is printed instead) when the
        input data name is unknown or when ``name`` is already in use; a
        duplicate name would make two configurations share one results column.
        """
        if input_data_name not in self.input_data:
            print("ERROR: feature_set_name not found")
            return
        if name in self.names:
            print("ERROR: config name already exists (", name, ")")
            return

        self.names.append(name)
        # 'fit' stays -1 until the algorithm has been trained
        self.configs.append({'name': name, 'algo': algo, 'input_data_name': input_data_name, 'fit': -1})

    def _fit(self, conf):
        """Train one configuration on its registered input data and time it."""
        input_data = self.input_data[conf['input_data_name']]['data']

        # keep the cursor on this line so the timing is appended to it
        print("Fitting ", conf['name'], "... ", end="", flush=True)

        start = time.perf_counter()
        conf['fit'] = conf['algo'].fit(input_data)
        elapsed = time.perf_counter() - start

        print("done in ", elapsed, "s.")

    def _predict(self, conf, data):
        """Store the prediction of one configuration for ``data`` in ``results``.

        The call is skipped with an error line when ``data`` does not have as
        many columns as the input data the configuration was registered with.
        """
        expected_features = self.input_data[conf['input_data_name']]['data'].shape[1]
        if data.shape[1] != expected_features:
            print("ERROR: Number of input features does not match (", conf['name'], ")")
            return

        print("Predicting data with", conf['name'], "... ", end="", flush=True)

        start = time.perf_counter()
        self.results[conf['name']] = conf['algo'].predict(data)
        elapsed = time.perf_counter() - start

        print("done in ", elapsed, "s.")

    def fit(self):
        """Train only the configurations that have not been trained yet."""
        for conf in self.configs:
            if not self._is_fitted(conf):
                self._fit(conf)

    def fit_all(self, data=None):
        """Train every configuration on its registered input data.

        ``data`` is accepted for backward compatibility only and is ignored:
        each configuration is always trained on the data set it was
        registered with.
        """
        for conf in self.configs:
            self._fit(conf)

    def refit(self, data=None):
        """Retrain the configurations that have been trained before.

        ``data`` is accepted for backward compatibility only and is ignored.
        """
        for conf in self.configs:
            if self._is_fitted(conf):
                self._fit(conf)

    def predict(self, name, data):
        """Predict ``data`` with the trained configuration called ``name``."""
        for conf in self.configs:
            if self._is_fitted(conf) and conf['name'] == name:
                self._predict(conf, data)

    def predict_all(self, data):
        """Predict ``data`` with every trained configuration."""
        for conf in self.configs:
            if self._is_fitted(conf):
                self._predict(conf, data)

    def get_count_df(self, name):
        """Return a digit x cluster count table for the results column ``name``.

        Row ``d`` / column ``C_c`` holds how many samples with target ``d``
        were put into cluster ``c``.
        """
        n = self.N_CLASSES
        count_matrix = np.zeros((n, n))
        for digit in range(n):
            of_digit = self.results[self.results['targets'] == digit]
            cluster_counts = of_digit.groupby(name).count()['targets']
            for cluster, count in cluster_counts.items():
                count_matrix[digit][cluster] = count

        col_names = ["C_" + str(i) for i in range(n)]
        return pd.DataFrame(count_matrix, columns=col_names)

    def get_norm_df(self, name):
        """Return the count table with every row divided by its row sum."""
        count_df = self.get_count_df(name)
        return count_df.divide(count_df.sum(axis=1), axis=0)

    def get_all_error_rates(self):
        """Return ``{name: error rate}`` for all trained configurations with predictions."""
        error_rates = {}
        for conf in self.configs:
            if self._is_fitted(conf) and conf['name'] in self.results.columns:
                error_rates[conf['name']] = self.get_error_rate(conf['name'])
        return error_rates

    def get_error_rate(self, name):
        """Return the classification error of the results column ``name``.

        Every digit is matched with one cluster so that the matching is a
        one-to-one mapping: each digit first picks its most frequent cluster;
        digits that picked the same cluster are then redistributed over the
        contested and the still unused clusters, and every single swap between
        two of those digits is tried to increase the summed relative
        frequency. The search only tries single swaps, so the result is a
        heuristic, not a guaranteed optimum.

        Side effect: the digit assigned to each sample's cluster is stored in
        the results column ``name + '_TARGET'``.
        """
        weights = self.get_norm_df(name).values
        n = weights.shape[0]
        rows = np.arange(n)

        # start with the most frequent cluster per digit, which may contain duplicates
        assignment = weights.argmax(axis=1)
        unique_ids, first_idx, counts = np.unique(assignment, return_index=True, return_counts=True)

        # cluster ids that no digit picked at all; membership has to be tested
        # against the ids themselves, not against how many distinct ids exist
        taken = set(unique_ids.tolist())
        free_clusters = [cluster for cluster in range(n) if cluster not in taken]

        # clusters picked by several digits go back into the pool, and all
        # digits that picked them have to be reassigned
        contested_digits = []
        for cluster, first, count in zip(unique_ids, first_idx, counts):
            if count == 1:
                continue
            free_clusters.append(int(cluster))
            contested_digits.extend(d for d in range(first, n) if assignment[d] == cluster)

        # both lists have the same length, so this yields a permutation
        for digit, cluster in zip(contested_digits, free_clusters):
            assignment[digit] = cluster

        # improve the summed frequency by trying every swap of two contested digits
        best_weight = weights[rows, assignment].sum()
        best_perm = assignment.copy()
        for i in range(len(contested_digits)):
            for j in range(i + 1, len(contested_digits)):
                first_digit, second_digit = contested_digits[i], contested_digits[j]
                perm = assignment.copy()
                perm[first_digit] = assignment[second_digit]
                perm[second_digit] = assignment[first_digit]

                weight = weights[rows, perm].sum()
                if weight > best_weight:
                    best_weight = weight
                    best_perm = perm

        # invert digit -> cluster into cluster -> digit
        cluster_to_digit = np.zeros(n, dtype=int)
        cluster_to_digit[best_perm] = rows

        self.results[name + '_TARGET'] = cluster_to_digit[self.results[name].to_numpy()]
        hits = (self.results['targets'] == self.results[name + '_TARGET']).sum()
        return 1.0 - (1.0 * hits) / self.results.shape[0]
            

---

In [None]:
# header=None: the first line of the file is not interpreted as a header
#              (i.e. as a row holding the column names).
# sep=" ":     the space character is used as the separator, so two values
#              separated by a space are read as two separate values.
data = pd.read_csv("../data/zip.train", header=None, sep=" ") 

In [None]:
# Drop every column that has fewer than 2 non-NaN values: `thresh` is the
# minimum number of non-missing entries a column needs in order to be kept.
cleaned_data = data.dropna(axis=1, thresh=2)
# Show (rows, columns) of the cleaned frame.
cleaned_data.shape

In [None]:
# Summary statistics for every column of the cleaned data.
cleaned_data.describe()

Aus der Beschreibung der Daten wissen wir, dass es sich um die Ziffern und die dazugehörigen Bilder der Größe 16x16 Pixel handelt. Also in jeder Zeile steht in der 0-ten Spalte die Ziffer und in den folgenden 256 Spalten die einzelnen Pixel der Bilder. Also visualisieren wir diese nun einmal.

In [None]:
# Plot the mean image of each digit 0-9 on a grid of 2 rows x 5 columns.
plt.figure(1, figsize=(20, 10))
for i in range(0,10):
    # 16x16 array averaged over all samples of digit i
    image = getNumberImage(cleaned_data,i,'mean')

    # Call signature: subplot(nrows, ncols, index, **kwargs)
    # (the subplot index starts at 1, hence 1 + i)
    plt.subplot(2,5, 1 + i)
    plt.imshow(image, cmap='hot', interpolation='none')

plt.show()

---

## Verschiedene Ansätze, mit K-Means zu clustern

Wir nehmen an, dass jedes Bild mit seinen 256 Bildpunkten ein Punkt im 256-dimensionalen Raum der reellen Zahlen ist. In diesem Raum haben wir, wie im 1-, 2- oder 3-Dimensionalen auch, das euklidische Abstandsmaß.

In [None]:
# Feature matrix: every column except the first one (column 0 is the digit
# label), converted to a NumPy array.
input_data = cleaned_data.iloc[:,1:].values
# Show (samples, features).
input_data.shape

In [None]:
# Set targets: column 0 (the digit labels) becomes the 'targets' column of
# the experiment's results frame.
exp = Experiment(cleaned_data[0].values)

### Add inputs

In [None]:
# Build one row per digit holding the mean of that digit's 256 pixel columns.
# The array is passed as `init=` to one of the KMeans configurations below,
# i.e. it serves as the initial cluster centres there.
init_image = np.zeros((10,256))
for i in range(0,10):
    image = getNumber(cleaned_data, i).mean()
    init_image[i,:] = image
    

In [None]:
# input features are the individual pixels of the image - without transformation
exp.add_input_data('256_pixel', input_data)

In [None]:
# input features are the individual pixels of the image - quantized to
# multiples of 0.5 (scale by 2, round, scale back)
exp.add_input_data('256_pixel_quant', (input_data * 2).round() / 2)

In [None]:
# input features are the individual pixels of the image - quantized to
# multiples of 0.25 (scale by 4, round, scale back)
exp.add_input_data('256_pixel_mult', (input_data * 4).round() / 4)

In [None]:
# Register the configurations to compare. Each call names the algorithm
# instance, the configuration and the input data set it is trained on.
# K-Means with 10 clusters on the raw pixels:
exp.add_config(KMeans(n_clusters=10), 'KMeans_10', '256_pixel')
# K-Means started once (n_init=1) from the per-digit mean images:
exp.add_config(KMeans(n_clusters=10, init=init_image, n_init=1), 'KMeans_10_init', '256_pixel')
# K-Means on the two quantized variants of the pixels:
exp.add_config(KMeans(n_clusters=10), 'KMeans_10_Q', '256_pixel_quant')
exp.add_config(KMeans(n_clusters=10), 'KMeans_10_Q4', '256_pixel_mult')
# K-Means with n_init=20 on the raw pixels:
exp.add_config(KMeans(n_clusters=10, n_init=20), 'KMeans__10__n_init20', '256_pixel')
# Gaussian mixture with 10 components and full covariance matrices:
exp.add_config(GaussianMixture(n_components=10, covariance_type='full'), 'GMM__10__cov_full', '256_pixel')

In [None]:
# List the names of all registered configurations.
exp.get_names()

In [None]:
# Print each configuration with its input data name and number of features.
exp.get_config_map()

In [None]:
# Train every configuration that has not been trained yet.
exp.fit()

In [None]:
# Inspect the `means_` attribute of the algorithm stored at index 5 of the
# config list; with the six configurations registered above in order, this
# is the GaussianMixture ('GMM__10__cov_full').
exp.configs[5]['algo'].means_

In [None]:
# Predict the raw pixel data with every trained configuration. Note that the
# configurations trained on quantized input also receive the unquantized
# `input_data` here.
exp.predict_all(input_data)

In [None]:
# Error rate per trained configuration that has a prediction column.
exp.get_all_error_rates()

In [None]:
# Fetch the results frame (targets plus prediction columns) and show its
# first rows.
results = exp.get_results()
results.head()

-----