In [None]:
# Optional dependency installation: uncomment the lines below to install the
# packages this notebook imports.
#%pip install pandas
#%pip install scipy
#%pip install matplotlib
#%pip install seaborn
#%pip install scikit-learn
#%pip install umap-learn


In [None]:
import pandas as pd
#from google.colab import auth
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.metrics import confusion_matrix

#auth.authenticate_user()

# Import Dataset

In [67]:
# Every file is read with header=None, so its first row is treated as data
# rather than as column names; the label files get their column named "label".
# On Google Colab the same four files were previously read from
# "/content/DL_assignment/<name> - Copy.csv".
label_columns = ["label"]

train_in = pd.read_csv("data/train_in.csv", header=None)
test_in = pd.read_csv("data/test_in.csv", header=None)

train_out = pd.read_csv("data/train_out.csv", header=None, names=label_columns)
test_out = pd.read_csv("data/test_out.csv", header=None, names=label_columns)

In [68]:
# Preview the first rows of the training inputs.
train_in.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.631,0.862,-0.167,...,0.304,0.823,1.0,0.482,-0.474,-0.991,-1.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-0.813,-0.671,-0.809,-0.887,-0.671,-0.853,-1.0,...,-0.671,-0.671,-0.033,0.761,0.762,0.126,-0.095,-0.671,-0.828,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.996,...,-1.0,-1.0,-1.0,-0.109,1.0,-0.179,-1.0,-1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-0.273,0.684,0.96,0.45,-0.067,...,-0.318,1.0,0.536,-0.987,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-0.928,-0.204,0.751,0.466,0.234,...,0.466,0.639,1.0,1.0,0.791,0.439,-0.199,-0.883,-1.0,-1.0


In [66]:
# Preview the first rows of the training labels.
train_out.head()

Unnamed: 0,label
0,6
1,5
2,4
3,7
4,3


In [None]:
# (rows, columns) of the training inputs and of the training labels.
train_in.shape, train_out.shape

In [None]:
# Attach the label column to the right of the feature columns for both splits
# (column-wise concatenation; rows are matched on the index).
train = pd.concat((train_in, train_out), axis="columns")
test = pd.concat((test_in, test_out), axis="columns")

#  Task 1: Data dimensionality, distance-based classifiers

## Subtask 1

In [None]:
# Group the training rows by label and average every feature within each group:
# each row of `average_points` is the mean point (cloud center) of one label.
grouped_labels = train.groupby(by="label")
average_points = grouped_labels.mean()

In [None]:
# Display the per-label mean points (one row per label).
average_points

In [None]:
# Pairwise Euclidean distances between the per-label mean points. pdist returns
# the condensed form; squareform expands it into a symmetric square matrix.
dist_matrix = squareform(pdist(average_points, metric="euclidean"))

In [None]:
# Display the square matrix of distances between the per-label mean points.
dist_matrix

In [None]:
# Annotated heatmap of the distances between the per-label mean points.
center_heatmap = sns.heatmap(
    dist_matrix,
    annot=True,
    fmt=".1f",
    cmap="RdYlGn",
    linewidths=0.30,
)
center_heatmap.set(title="Distances between Cloud Centers");

The pairs of numbers whose cloud centers are closest to each other are the following: (3, 5); (5, 6); (4, 9); (7, 9); (8, 9).

## Subtask 2

In [None]:
# Project the training features (everything except "label") onto two principal
# components and keep the labels alongside for colouring the scatter plot.
pca = PCA(n_components=2)
pca_train = pca.fit_transform(train.drop(columns="label"))
pca_train_df = pd.concat(
    [pd.DataFrame(pca_train, columns=["x", "y"]), train["label"]],
    axis="columns",
)
pca_train_df

In [None]:
# Scatter plot of the 2-D PCA embedding, one colour per label.
plt.figure(figsize=(20, 12))
pca_scatter = sns.scatterplot(
    data=pca_train_df,
    x="x",
    y="y",
    hue="label",
    alpha=0.9,
    legend="full",
    palette="tab10",
)
pca_scatter.set(title="PCA");

In [None]:
# Embed the training features (everything except "label") in 2-D with t-SNE
# and keep the labels alongside for colouring the scatter plot.
# t-SNE is stochastic: the seed is fixed so the embedding, and the figure built
# from it, is the same on every Restart & Run All.
tsne = TSNE(n_components=2, random_state=42)
tsne_train = tsne.fit_transform(train.drop(columns="label"))
tsne_train_df = pd.concat(
    [pd.DataFrame(tsne_train, columns=["x", "y"]), train["label"]],
    axis=1,
)
tsne_train_df

In [None]:
# Scatter plot of the 2-D t-SNE embedding, one colour per label.
plt.figure(figsize=(20, 12))
tsne_scatter = sns.scatterplot(
    data=tsne_train_df,
    x="x",
    y="y",
    hue="label",
    alpha=0.9,
    legend="full",
    palette="tab10",
)
tsne_scatter.set(title="TSNE");

In [None]:
# Embed the training features (everything except "label") in 2-D with UMAP
# and keep the labels alongside for colouring the scatter plot.
# UMAP is stochastic: the seed is fixed so the embedding, and the figure built
# from it, is reproducible across runs.
# NOTE: per the umap-learn documentation, fixing the seed disables its parallel
# code paths, so this cell may run somewhat slower than the unseeded version.
umap_ = UMAP(n_components=2, random_state=42)
umap_train = umap_.fit_transform(train.drop(columns="label"))
umap_train_df = pd.concat(
    [pd.DataFrame(umap_train, columns=["x", "y"]), train["label"]],
    axis=1,
)
umap_train_df

In [None]:
# Scatter plot of the 2-D UMAP embedding, one colour per label.
# The title previously read 'TSNE' (copied from the t-SNE cell), which
# mislabelled this figure.
plt.figure(figsize=(20, 12))
sns.scatterplot(
    data=umap_train_df,
    x="x",
    y="y",
    hue="label",
    alpha=0.9,
    legend="full",
    palette="tab10",
).set(title='UMAP');

## Subtask 3

In [None]:
# Nearest-centroid classifier fitted on the raw training features; the label
# frame is flattened to a 1-D array for fitting.
nmc = NearestCentroid()
nmc.fit(train_in, train_out.values.ravel())

# Mean accuracy on each split, reported as a percentage.
nmc_train_pct = nmc.score(train_in, train_out) * 100
nmc_test_pct = nmc.score(test_in, test_out) * 100
print(f"Training Score: {nmc_train_pct} %")
print(f"Test Score: {nmc_test_pct} %")

## Subtask 4

In [None]:
# k-nearest-neighbours classifier with the library's default settings, fitted
# on the raw training features; the label frame is flattened to a 1-D array.
knn = KNeighborsClassifier()
knn.fit(train_in, train_out.values.ravel())

# Mean accuracy on each split, reported as a percentage.
knn_train_pct = knn.score(train_in, train_out) * 100
knn_test_pct = knn.score(test_in, test_out) * 100
print(f"Training Score: {knn_train_pct} %")
print(f"Test Score: {knn_test_pct} %")

In [None]:
# Predict the test inputs with both fitted classifiers.
nmc_test_pred, knn_test_pred = (clf.predict(test_in) for clf in (nmc, knn))

In [None]:
# Confusion matrix of the nearest-mean classifier on the test set, over the
# labels 0..9. In scikit-learn's convention rows are the true labels and
# columns the predicted labels; the heatmap axes are labelled accordingly so
# the figure can be read on its own.
nmc_conf = confusion_matrix(test_out, nmc_test_pred, labels=range(0, 10))
sns.heatmap(nmc_conf, annot=True, fmt=".0f", cmap='RdYlGn', linewidths=0.30).set(
    title='Confusion Matrix for Nearest Mean Classifier',
    xlabel='Predicted label',
    ylabel='True label',
);

In [None]:
# Confusion matrix of the k-nearest-neighbours classifier on the test set, over
# the labels 0..9. In scikit-learn's convention rows are the true labels and
# columns the predicted labels; the heatmap axes are labelled accordingly so
# the figure can be read on its own.
knn_conf = confusion_matrix(test_out, knn_test_pred, labels=range(0, 10))
sns.heatmap(knn_conf, annot=True, fmt=".0f", cmap='RdYlGn', linewidths=0.30).set(
    title='Confusion Matrix for K Neighbors Classifier',
    xlabel='Predicted label',
    ylabel='True label',
);