In [None]:
#Import useful modules
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import copy
import random
from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical

In [None]:
#Set directory and import the Letters data
#Fix the random seeds first so that the shuffle below (and therefore the train/test split) is the same on every run
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

#Raw string: the Windows path contains backslashes that would otherwise be read as (unrecognised) escape sequences.
#The folder can be overridden with the LETTERS_DATA_DIR environment variable; the default is the original location.
DATA_DIR = os.environ.get('LETTERS_DATA_DIR', r'D:\MSc2\Deep Learning Keras\Task2')
os.chdir(DATA_DIR)
data = pd.read_csv('Letters.txt',header=None)

#Randomly shuffle the rows (seeded) and renumber the index from 0
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [None]:
#data.head() #First column is the class label (one of 26 letters)
#data.shape #(20000, 17)
#data.describe() #x = 0-15
#data.info() #y = object, x = int64
#data.columns.values #column names

In [None]:
##Section 1: Data cleaning and Exploratory Data Analysis
#Rename the columns: the first column is the class label, the other 16 are the features
data.columns = ["Class"] + ["Feature" + str(num1) for num1 in range(1,17)]

#Create features and labels
y = data['Class'].astype("category") #pandas Series (dtype category)
x = data.iloc[:,1:] #features

#Map each distinct label to an integer code.
#The codes follow the category order (sorted labels), not the order of first appearance in the
#shuffled data, so the mapping is identical on every run and does not hard-code the number of classes.
d = {label: code for code, label in enumerate(y.cat.categories)}
y = y.map(d, na_action='ignore') #labels replaced by their integer codes

In [None]:
#Class label distribution
#Count how many rows carry each label (first column of data), ordered by label
label_dist = data.iloc[:, 0].value_counts()
label_dist = label_dist.sort_index()

#Bar chart of the counts (author's note: the labels are balanced)
label_dist.plot(kind='bar')

In [None]:
#Heat map of the pairwise correlations between the feature columns
#(author's note: Features 1-5 have much higher correlations than the other features)
feature_corr = x.corr()
sns.heatmap(feature_corr)

In [None]:
##Section 2. Prepare data for modelling
n = data.shape[0]
k = 0.70 #fraction of rows used for training
q = int(round(n*k)) #index of the first test row

#Convert features to a float32 numpy array
x = np.asarray(x).astype('float32')

#Standardize using statistics of the TRAINING rows only, so that no information
#from the test rows leaks into the preprocessing
train_mean = x[:q,:].mean(axis=0)
train_std = x[:q,:].std(axis=0)
train_std[train_std == 0] = 1.0 #a constant feature would otherwise cause a division by zero
x = (x - train_mean)/train_std #training rows now have mean 0, std 1 per feature
#Alternative scalings that were considered but are not used:
#x = (x - x.mean(axis = 0))/(x.max(axis=0) - x.min(axis=0))
#x = (x - x.min(axis = 0))/(x.max(axis=0) - x.min(axis=0)) #x takes on small values - lies in [0,1]

train_x = x[:q,:]
test_x = x[q:,:]

In [None]:
#One-hot encode the integer class codes in y (result is a numpy array with one column per class)
y = to_categorical(np.asarray(y))

#Same split point q as used for the features
train_y, test_y = y[:q,:], y[q:,:]

In [None]:
#type(train_x), type(train_y), type(test_x), type(test_y) #(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)
#train_x.shape, train_y.shape, test_x.shape, test_y.shape #((14000, 16), (14000, 26), (6000, 16), (6000, 26))

In [None]:
##Section 3. Model Building
#1)Architecture of the model
d = train_x.shape[1] #number of input features


def build_model(input_dim, n_classes=26):
    """Return a freshly initialised, compiled classifier.

    Two hidden Dense layers (64 units, relu) followed by a softmax output layer
    with n_classes units; compiled with rmsprop, categorical cross-entropy and
    the accuracy metric.
    """
    net = models.Sequential()
    net.add(layers.Dense(64, activation='relu', input_shape=(input_dim,)))
    net.add(layers.Dense(64, activation='relu'))
    net.add(layers.Dense(n_classes, activation='softmax'))
    net.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) #Optimizer, Loss function, Metrics
    return net


model = build_model(d, train_y.shape[1])

#2)Train the model
##K-Fold CV
k = 5
num_validation_samples = train_x.shape[0] // k
n = 300
epoch_grid = list(range(1,n,10)) #Hyper-param = # of epochs: 1, 11, 21, ...

#One row per fold, one column per epoch setting
fold_loss = np.zeros((k, len(epoch_grid)))
fold_accuracy = np.zeros((k, len(epoch_grid)))

for fold in range(k): #5 folds
    start = num_validation_samples*fold
    stop = num_validation_samples*(fold+1)

    validation_x = train_x[start:stop,:]
    validation_y = train_y[start:stop]
    #Training data = everything BEFORE the validation fold plus everything AFTER it
    training_x = np.concatenate((train_x[:start,:], train_x[stop:,:]), axis=0)
    training_y = np.concatenate((train_y[:start], train_y[stop:]), axis=0)

    #Fresh weights for every fold, otherwise the model has already been trained on this fold's validation data
    model = build_model(d, train_y.shape[1])

    #Train once per fold and evaluate at every epoch setting along the way,
    #instead of retraining from scratch for each setting
    trained_epochs = 0
    for col, i in enumerate(epoch_grid):
        model.fit(training_x, training_y, epochs=i-trained_epochs, batch_size=512, verbose=0)
        trained_epochs = i
        #evaluate returns [loss, accuracy]; call it once and unpack
        fold_loss[fold, col], fold_accuracy[fold, col] = model.evaluate(validation_x, validation_y, verbose=0)

#Average over the folds: one value per epoch setting
loss = fold_loss.mean(axis=0).tolist()
accuracy = fold_accuracy.mean(axis=0).tolist()

In [None]:
#3)Tuning - Plot loss and accuracy
loss_values = np.asarray(loss)
accuracy_values = np.asarray(accuracy)

#The CV results hold one value per epoch setting 1, 11, 21, ... (steps of 10),
#so plot against the real epoch counts rather than the list position
epochs = range(1, 10*len(loss_values)+1, 10)
plt.plot(epochs,loss_values, 'bo', label='Validation loss') #label is needed for plt.legend()
plt.title('Loss - CV (k=5)', fontweight = 'bold',fontsize = 16)
plt.xlabel('Epochs',fontsize = 14)
plt.ylabel('Loss',fontsize = 14)
plt.legend()
plt.show()

epochs = range(1, 10*len(accuracy_values)+1, 10)
plt.plot(epochs,accuracy_values, 'go', label='Validation accuracy')
plt.title('Accuracy - CV (k=5)', fontweight = 'bold',fontsize = 16)
plt.xlabel('Epochs',fontsize = 14)
plt.ylabel('Accuracy',fontsize = 14)
plt.legend()
plt.show()

In [None]:
#4)Finalize - rebuild the network from scratch and train it on the whole training set
d = train_x.shape[1] #number of input features
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(d,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(26, activation='softmax'), #26 classes
])
#Loss function, Optimizer, Metrics
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

#150 epochs, mini-batches of 512, no progress output
model.fit(train_x, train_y, batch_size=512, epochs=150, verbose=0)

In [None]:
#5)Evaluate the final model on the held-out test set; returns [loss, accuracy]
#Result recorded by the author on one run: [0.18193856616318227, 0.9406666666666667]
model.evaluate(x=test_x, y=test_y)

In [None]:
#6)Closer look at the test set performance
predictions = model.predict(test_x, verbose=0) #soft (softmax) probabilities: one row per test sample, one column per class

#Passing the 2-D array straight to plt.hist draws a separate, unlabeled series for every class column.
#Summarise each sample by the probability of its predicted (most likely) class instead.
confidence = predictions.max(axis=1)
plt.hist(confidence, bins=20)
plt.title('Predicted-class probability on the test set')
plt.xlabel('Probability of the predicted class')
plt.ylabel('Number of test samples')
plt.show()

In [None]:
#Random baseline model(multiclass classification)
#Raw test labels are kept under their own name so that the one-hot test_y used by
#model.evaluate is not overwritten (which would break re-running the evaluation cell)
test_labels = np.asarray(data.iloc[q:,0])

#A "random guess" is a shuffled copy of the true labels
test_y_copy = np.random.permutation(test_labels)
hits_array = test_labels == test_y_copy

#Fraction of positions where the shuffled label happens to equal the true label
float(hits_array.mean())

#a random guess tends to get an accuracy of 0.041166666666666664 (value recorded by the author on one run)