In [1]:
import numpy as np
import helper
from matplotlib import pyplot as plt
from tensorflow_addons.metrics import HammingLoss
from tensorflow.keras.losses import CosineSimilarity


## Load Preprocessed Data

In [2]:
# Load every preprocessed array from the .npz archive into notebook-level names.
# SECURITY: allow_pickle=True lets NumPy unpickle object arrays, and unpickling
# can execute arbitrary code. Only load archives that you produced yourself.
with np.load('../preprocessing/preprocessed_oh.npz', allow_pickle=True) as data:
    # Indexing the archive reads each array into memory, so the values below
    # stay usable after the file handle is closed at the end of this block.
    text_vec = data['text_word2vec']
    summary_vec = data['summary_word2vec']
    text_existence = data['text_existence']
    text_count = data['text_count']
    summary_existence = data['summary_existence']
    summary_count = data['summary_count']
    labels = data["labels"]
    text_voc_size = data['text_voc_size']
    sum_voc_size = data['summary_voc_size']
# NOTE: `data` is closed from here on; use the extracted names above.

In [3]:
# Train an autoencoder on the summary existence matrix, using the summary
# vocabulary size as the input width and a 2-unit code layer.
input_size = sum_voc_size
code_size = 2

# WARNING: expensive cell. The captured model summary for this run reports
# ~632M trainable parameters and an ETA above one hour for the first epoch.
nonlinear_ae_com, nonlinear_encoder_com = helper.nonlinear_autoencoder_complex(
    input_size, code_size, "mse"
)
# Input and target are the same matrix: the model learns to reconstruct it.
history3 = nonlinear_ae_com.fit(summary_existence, summary_existence, epochs=7)

# Plot the loss recorded in the training history, with labels so the figure
# stands alone when the notebook is skimmed.
plt.plot(history3.history["loss"])
plt.xlabel("epoch")
plt.ylabel("loss (mse)")
plt.title("Nonlinear autoencoder training loss")
plt.show()  # also suppresses the Line2D repr as cell output

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 35560)]           0         
                                                                 
 dense (Dense)               (None, 2)                 71122     
                                                                 
 dense_1 (Dense)             (None, 17780)             53340     
                                                                 
 dense_2 (Dense)             (None, 35560)             632292360 
                                                                 
Total params: 632,416,822
Trainable params: 632,416,822
Non-trainable params: 0
_________________________________________________________________
Epoch 1/7
 1/16 [>.............................] - ETA: 1:25:33 - loss: 0.1812

In [None]:
# Run helper.random_knn on the summary existence matrix with num=10, knn=5
# and the cosine metric.
helper.random_knn(
    summary_existence,
    labels=labels,
    num=10,
    knn=5,
    metric="cosine",
)

In [None]:
# Same random_knn call as for cosine (num=10, knn=5), but with the hamming
# metric.
helper.random_knn(
    summary_existence,
    labels=labels,
    num=10,
    knn=5,
    metric="hamming",
)

## Analysis of Data

In [None]:
# Materialise the existence matrix as an ndarray, then standardise it with the
# scalar mean and standard deviation taken over all entries (not per column).
summary_existence = np.array(summary_existence)
centered_sum_ex = (
    summary_existence - summary_existence.mean()
) / summary_existence.std()

## Experiments with existence of words in a summary

In [None]:

# Dendrogram of the raw existence matrix using the helper's default linkage
# method and distance metric (none is passed explicitly here).
helper.plot_dendrogram(
    summary_existence,
    title="500 samples dendrogram using complete linkage existence of words in a summary",
    titles_list=labels,
)

In [None]:
# PCA plot of the existence matrix; the helper's return value is kept in
# reduced_PCA_OG_matrix.
reduced_PCA_OG_matrix = helper.plot_pca(
    summary_existence,
    titles_list=labels,
    title="500 samples Summary Existence PCA",
)

In [None]:
# t-SNE plot of the existence matrix with the cosine metric.
# No perplexity is passed, so the "perp60" in the title presumably reflects
# the helper's default. TODO confirm.
reduced_TSNE_COS_matrix = helper.plot_tsne(
    summary_existence,
    labels,
    title="500 samples Summary Existence Tsne perp60 cosine",
    metric="cosine",
)

In [None]:
# t-SNE plot of the existence matrix with the hamming metric.
# No perplexity is passed, so the "perp60" in the title presumably reflects
# the helper's default. TODO confirm.
reduced_TSNE_HAM_matrix = helper.plot_tsne(
    summary_existence,
    labels,
    title="500 samples Summary Existence Tsne perp60 Hamming",
    metric="hamming",
)

In [None]:
# Dendrogram of the existence matrix with hierarchy_method="ward" and
# euclidean distances.
helper.plot_dendrogram(
    summary_existence,
    title="500 samples dendrogram using ward's algorithm on existence of words in a summary",
    hierarchy_method="ward",
    dist_metric="euclidean",
    titles_list=labels,
)

In [None]:
# Dendrogram of the existence matrix with the yule distance metric; the
# linkage method is left at the helper's default.
helper.plot_dendrogram(
    summary_existence,
    title="500 samples dendrogram using complete linkage yule metric existence of words in a summary",
    dist_metric="yule",
    titles_list=labels,
)

In [None]:
# Dendrogram of the existence matrix with the hamming distance metric; the
# linkage method is left at the helper's default.
helper.plot_dendrogram(
    summary_existence,
    title="500 samples dendrogram using complete linkage Hamming metric existence of words in a summary",
    dist_metric="hamming",
    titles_list=labels,
)

In [None]:
# Kernel PCA plot of the existence matrix with the cosine kernel; the
# helper's return value is kept in reduced_PCA_COS_matrix.
reduced_PCA_COS_matrix = helper.plot_kpca(
    summary_existence,
    labels,
    kernel="cosine",
    title="500 samples cosine kernel PCA of existence of words in a summary",
)

In [None]:
# Kernel PCA plot of the existence matrix with kernel="hamming".
# The title previously said "cosine kernel" (copied from the cosine cell),
# which mislabelled the figure; it now names the kernel actually passed.
# NOTE(review): confirm helper.plot_kpca supports a "hamming" kernel.
reduced_PCA_HAM_matrix = helper.plot_kpca(
    summary_existence,
    labels,
    kernel="hamming",
    title="500 samples Hamming kernel PCA of existence of words in a summary",
)

## Experiments with Summary counts

In [None]:
# Materialise the count matrix as an ndarray, then standardise it with the
# scalar mean and standard deviation of the count matrix itself.
summary_count = np.array(summary_count)
# Fix: the divisor was summary_existence.std(), which scaled the counts by the
# spread of a different matrix.
centered_sum_count = (summary_count - summary_count.mean()) / summary_count.std()

# Fix: plot the standardised counts computed above. The call previously passed
# centered_sum_ex, so the "count of words" figure showed existence data and
# depended on an earlier cell having been run.
# NOTE(review): the title says "cosine metric" but no dist_metric is passed;
# confirm that the helper's default metric matches.
helper.plot_dendrogram(
    centered_sum_count,
    title="500 samples dendrogram using complete linkage cosine metric count of words in a summary",
    titles_list=labels,
)

In [None]:
# PCA plot of the raw (un-standardised) count matrix; the helper's return
# value is kept in reduced_matrix.
reduced_matrix = helper.plot_pca(
    summary_count,
    titles_list=labels,
    title="500 samples Summary count PCA",
)

In [None]:
# t-SNE plot of the standardised count matrix with the cosine metric.
# Overwrites reduced_matrix from the PCA cell.
reduced_matrix = helper.plot_tsne(
    centered_sum_count,
    labels,
    title="500 samples Summary Count Tsne perp60 cosine",
    metric="cosine",
)

In [None]:
# t-SNE plot of the raw count matrix with the yule metric.
# Overwrites reduced_matrix again.
# NOTE(review): yule is normally a dissimilarity for boolean vectors; confirm
# how the helper treats integer counts.
reduced_matrix = helper.plot_tsne(
    summary_count,
    labels,
    title="500 samples Summary Count Tsne perp60 Yule",
    metric="yule",
)

In [None]:
# Dendrogram of the raw count matrix with hierarchy_method="ward" and
# euclidean distances.
helper.plot_dendrogram(
    summary_count,
    title="500 samples dendrogram using ward's algorithm on count of words in a summary",
    hierarchy_method="ward",
    dist_metric="euclidean",
    titles_list=labels,
)

In [None]:
# Dendrogram of the raw count matrix with the yule distance metric; the
# linkage method is left at the helper's default.
helper.plot_dendrogram(
    summary_count,
    title="500 samples dendrogram using complete linkage yule metric count of words in a summary",
    dist_metric="yule",
    titles_list=labels,
)