In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as k
from sklearn.model_selection import train_test_split
from sklearn.manifold import Isomap
from keras.layers import Dense, Dropout, Input
from keras.models import Sequential
import matplotlib.pyplot as plt
import sys
from scipy.spatial.distance import cdist
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import mean_squared_error

In [None]:
# Load the cleaned n-gram table from the working directory. Later cells read the
# columns 'fileID', 'nN_GRM', 'lang4', 'lang5' and 'Unnamed: 0' from this frame.
matrix = pd.read_csv('data_Ngr_cleaned.csv')

In [None]:
# Preview the first rows of the loaded table.
matrix.head()

In [None]:
# Frequency table: one row per fileID, one column per distinct nN_GRM value;
# each cell counts how often that (fileID, nN_GRM) pair occurs in `matrix`.
freq_table = pd.crosstab(matrix['fileID'], matrix['nN_GRM'])

In [None]:
# Preview the first rows of the frequency table.
freq_table.head()

In [None]:
# Strip the per-n-gram columns, collapse the remaining rows to unique
# combinations, then build the fileID -> lang4 lookup used for grouping.
file_lang = (
    matrix
    .drop(columns=['Unnamed: 0', 'nN_GRM', 'lang5'])
    .drop_duplicates()
)
file_lang_dict = dict(zip(file_lang['fileID'], file_lang['lang4']))

def genWeightsMFA(train, file_lang_dict, k1=5, k2=5):
    """Build the signed neighbour-weight matrix used by the MFA loss.

    For every row i of ``train`` the (at most) ``k1`` closest rows of the same
    group receive weight +1 and the (at most) ``k2`` closest rows of any other
    group receive weight -1; every other entry, including the diagonal, is 0.
    Closeness is the Manhattan (cityblock) distance between feature rows.

    Args:
        train: DataFrame whose index values are keys of ``file_lang_dict`` and
            whose values are the feature vectors.
        file_lang_dict: mapping from index value to group label.
        k1: number of same-group neighbours marked with +1.
        k2: number of different-group neighbours marked with -1.

    Returns:
        A float32 ``tf.constant`` of shape (n, n) with n = ``train.shape[0]``.
        Row i describes the neighbours of sample i; the matrix is not
        necessarily symmetric.
    """
    features = train.values
    labels = np.asarray([file_lang_dict[file_id] for file_id in train.index])
    pairwise = cdist(features, features, metric='cityblock')

    n_samples = train.shape[0]
    weights = np.zeros((n_samples, n_samples), dtype=np.float32)
    positions = np.arange(n_samples)

    for row, row_dists in enumerate(pairwise):
        not_self = positions != row
        same_label = labels == labels[row]

        # Same procedure for both neighbour kinds; only the candidate pool,
        # the neighbour count and the written value differ.
        neighbour_specs = (
            (np.flatnonzero(same_label & not_self), k1, 1),
            (np.flatnonzero(~same_label & not_self), k2, -1),
        )
        for candidates, k, value in neighbour_specs:
            if candidates.size == 0:
                continue
            nearest_first = np.argsort(row_dists[candidates])
            chosen = candidates[nearest_first[:min(k, candidates.size)]]
            weights[row, chosen] = value

    return tf.constant(weights, dtype=tf.float32)

In [None]:
# Fixed seed so the stratified split, and therefore every model trained on it,
# is the same after Restart & Run All.
SPLIT_SEED = 42
train, val = train_test_split(
    freq_table,
    test_size=0.15,
    stratify=np.asarray(freq_table.index.map(file_lang_dict)),
    random_state=SPLIT_SEED,
)

# Copies with an extra last column 'index' holding each row's position inside
# its own split (0 .. len-1). The MFA models and callbacks take these frames.
train_numbered = train.copy()
val_numbered = val.copy()
train_numbered['index'] = range(0, len(train))
val_numbered['index'] = range(0, len(val))

def gen_MFA_loss(data, file_lang_dict, k1=5, k2=5):
  """Create a Keras-compatible MFA loss bound to the rows of ``data``.

  The neighbour-weight matrix is computed once from ``data`` via
  ``genWeightsMFA``. The returned function expects ``y_true`` to carry, as its
  LAST column, the position of each row inside ``data``; that column is used to
  pick the matching rows/columns of the weight matrix, so the loss stays
  correct when Keras shuffles the rows or feeds a batch smaller than ``data``.

  Args:
    data: DataFrame passed to ``genWeightsMFA`` (features only, no position column).
    file_lang_dict: mapping from index value of ``data`` to group label.
    k1: number of same-group neighbours (weight +1).
    k2: number of different-group neighbours (weight -1).

  Returns:
    ``general_MFA_loss(y_true, y_pred)`` returning a scalar tensor.
  """
  weight_matrix = genWeightsMFA(data, file_lang_dict, k1, k2)

  @tf.function
  def general_MFA_loss(y_true, y_pred):
    y_pred = tf.convert_to_tensor(y_pred)

    # Last column of y_true = row position inside `data`; the rest are targets.
    row_ids = tf.cast(y_true[:, -1], tf.int32)
    targets = tf.cast(y_true[:, :-1], y_pred.dtype)  # B x F

    # Re-align the precomputed weights with the actual order of this batch.
    batch_weights = tf.gather(tf.gather(weight_matrix, row_ids, axis=0), row_ids, axis=1)
    batch_weights = tf.cast(batch_weights, y_pred.dtype)  # B x B

    # pairwise_mse[a, b] = mean over features of (y_true[b] - y_pred[a])^2
    diff = targets[tf.newaxis, :, :] - y_pred[:, tf.newaxis, :]  # B x B x F
    pairwise_mse = tf.reduce_mean(tf.square(diff), axis=-1)  # B x B

    batch_size = tf.cast(tf.shape(y_pred)[0], y_pred.dtype)
    loss = tf.reduce_sum(tf.math.multiply(pairwise_mse, batch_weights)) / batch_size
    return loss
  return general_MFA_loss

# Dropout probability shared by every Dropout layer created in Encoder and Decoder.
dropout_rate = 0.25

class ValidationMSECallback(Callback):
  """Log the reconstruction MSE on a held-out set as ``val_mse`` after each epoch.

  ``val_data`` is fed to the model unchanged; its last column is dropped before
  the comparison, so the targets are every column except the last one.
  """

  def __init__(self, val_data):
    super().__init__()
    self.val_data = val_data

  def on_epoch_end(self, epoch, logs=None):
    preds = self.model.predict(self.val_data, verbose=0)
    # Targets = all columns but the last.
    y_true = np.asarray(self.val_data)[:, :-1]
    mse = float(mean_squared_error(y_true, preds))
    # Write into the dict Keras handed over. `logs or {}` would silently
    # replace an empty dict with a throwaway one and lose the metric.
    if logs is not None:
      logs["val_mse"] = mse

class ValidationMFACallback(Callback):
  """Log the MFA loss on a held-out set as ``val_loss`` after each epoch.

  Args:
    val_data: validation features; used to build the MFA loss via ``gen_MFA_loss``.
    val_data_numbered: the same rows with the extra last column expected by
      the model and the loss; cast to float32 once here.
    file_lang_dict: mapping from index value of ``val_data`` to group label.
    k1: number of same-group neighbours passed to ``gen_MFA_loss``.
    k2: number of different-group neighbours passed to ``gen_MFA_loss``.
  """

  def __init__(self, val_data, val_data_numbered, file_lang_dict, k1 = 5, k2 = 5):
    super().__init__()
    self.val_data = val_data
    self.val_data_numbered = tf.cast(val_data_numbered, tf.float32)
    self.loss_fn = gen_MFA_loss(val_data, file_lang_dict, k1, k2)

  def on_epoch_end(self, epoch, logs=None):
    preds = self.model.predict(self.val_data_numbered, verbose=0)
    loss = self.loss_fn(self.val_data_numbered, preds)
    # Write into the dict Keras handed over (`logs or {}` would drop the value
    # when the dict is empty) and store a plain float rather than a tensor so
    # History and ModelCheckpoint see an ordinary number.
    if logs is not None:
      logs["val_loss"] = float(loss)

class Encoder(k.Model):
  """Map an input row to a 2-dimensional tanh code.

  With ``advanced=False`` the encoder is a single Dense(2, tanh) layer.
  With ``advanced=True`` the input first passes through
  Dropout -> Dense(32, relu) -> Dropout -> Dense(8, relu) -> Dropout.
  All Dropout layers use the module-level ``dropout_rate``.
  """

  def __init__(self, advanced = False):
    super().__init__()
    self.advanced = advanced
    if advanced:
      # Hidden stack, created in the order in which call() applies it.
      self.dropout1 = Dropout(dropout_rate)
      self.dense1 = Dense(32, activation='relu')
      self.dropout2 = Dropout(dropout_rate)
      self.dense2 = Dense(8, activation='relu')
      self.dropout3 = Dropout(dropout_rate)
    # Final projection to the 2-D code; present in both variants.
    self.dense3 = Dense(2, activation='tanh')

  def call(self, x):
    if not self.advanced:
      return self.dense3(x)

    hidden = x
    for layer in (self.dropout1, self.dense1, self.dropout2, self.dense2, self.dropout3):
      hidden = layer(hidden)
    return self.dense3(hidden)

class Decoder(k.Model):
  """Map a code back to ``x`` relu outputs.

  ``x`` (constructor argument) is the number of output units.
  With ``advanced=False`` the decoder is a single Dense(x, relu) layer.
  With ``advanced=True`` the code first passes through
  Dense(8, relu) -> Dropout -> Dense(32, relu) -> Dropout.
  All Dropout layers use the module-level ``dropout_rate``.
  """

  def __init__(self, x, advanced = False):
    super().__init__()
    self.advanced = advanced
    if advanced:
      # Hidden stack, created in the order in which call() applies it.
      self.dense1 = Dense(8, activation='relu')
      self.dropout1 = Dropout(dropout_rate)
      self.dense2 = Dense(32, activation='relu')
      self.dropout2 = Dropout(dropout_rate)
    # Output layer; present in both variants.
    self.dense3 = Dense(x, activation='relu')

  def call(self, x):
    if not self.advanced:
      return self.dense3(x)

    hidden = x
    for layer in (self.dense1, self.dropout1, self.dense2, self.dropout2):
      hidden = layer(hidden)
    return self.dense3(hidden)


class Autoencoder_normal(k.Model):
  """Plain autoencoder: the whole input row is encoded and reconstructed.

  The decoder width is taken from the module-level ``train`` frame
  (``train.shape[1]`` output units). ``advanced`` is forwarded to both halves.
  """

  def __init__(self, advanced = False):
    super().__init__()
    self.encoder = Encoder(advanced)
    self.decoder = Decoder(train.shape[1], advanced)

  def call(self, x):
    return self.decoder(self.encoder(x))

class Autoencoder(k.Model):
  """Autoencoder for inputs that carry one extra trailing column.

  The last input column is dropped before encoding, so the encoder only sees
  the remaining columns. The decoder width is taken from the module-level
  ``train`` frame (``train.shape[1]`` output units). ``advanced`` is forwarded
  to both halves.
  """

  def __init__(self, advanced = False):
    super().__init__()
    self.encoder = Encoder(advanced)
    self.decoder = Decoder(train.shape[1], advanced)

  def call(self, x):
    features = x[:, :-1]  # strip the trailing column
    return self.decoder(self.encoder(features))

In [None]:
# Full-batch training: every optimizer step sees the whole training split.
batch_size = len(train)
epochs = 200

autoencoder_normal = Autoencoder_normal()
autoencoder_normal.compile(loss='MSE', optimizer="adam")

# Keep only the weights of the epoch with the lowest validation loss.
SAE_cp_cb = k.callbacks.ModelCheckpoint(
    filepath="SAE.weights.h5",
    monitor='val_loss',
    mode='min',
    save_weights_only = True,
    save_best_only=True)

SAE_hist = autoencoder_normal.fit(train, train, batch_size=batch_size, epochs=epochs, verbose=0, validation_data=(val, val), callbacks=[SAE_cp_cb])
# A subclassed model has no weights until it has seen data, so the summary is
# only meaningful after fit().
autoencoder_normal.summary()
autoencoder_normal.load_weights("SAE.weights.h5")

# Encode every file with the best checkpoint, save and plot the 2-D codes.
encodings = pd.DataFrame(autoencoder_normal.encoder(freq_table))
encodings['fileID'] = freq_table.index
encodings.to_csv('encodings_AE_simple.csv')
plt.scatter(encodings[0], encodings[1])
plt.title('Simple autoencoder (MSE): 2-D encodings')
plt.xlabel('latent dimension 0')
plt.ylabel('latent dimension 1')
plt.show()

In [None]:
autoencoder = Autoencoder()
autoencoder.compile(loss=gen_MFA_loss(train, file_lang_dict), optimizer="adam")

# Validation metrics are computed by callbacks because the MFA loss needs its
# own weight matrix for the validation split.
val_mse_cb = ValidationMSECallback(val_numbered)
val_loss_cb = ValidationMFACallback(val, val_numbered, file_lang_dict)
val_cb = [val_mse_cb, val_loss_cb]

# Keep only the weights of the epoch with the lowest 'val_loss'. This callback
# must run after val_loss_cb, which is what puts 'val_loss' into the logs.
SGAE_cp_cb = k.callbacks.ModelCheckpoint(
    filepath="SGAE.weights.h5",
    monitor='val_loss',
    mode='min',
    save_weights_only = True,
    save_best_only=True)

# The MFA weight matrix is built in the row order of `train`, so keep that
# order (shuffle=False); with one full-size batch shuffling adds nothing anyway.
SGAE_hist = autoencoder.fit(train_numbered, train_numbered, batch_size=batch_size, epochs=epochs, verbose=0, shuffle=False, callbacks=[*val_cb, SGAE_cp_cb])
# A subclassed model has no weights until it has seen data, so the summary is
# only meaningful after fit().
autoencoder.summary()
autoencoder.load_weights("SGAE.weights.h5")

# Encode every file with the best checkpoint, save and plot the 2-D codes.
encodings = pd.DataFrame(autoencoder.encoder(freq_table))
encodings['fileID'] = freq_table.index
encodings.to_csv('encodings_MFA_simple.csv')
plt.scatter(encodings[0], encodings[1])
plt.title('Simple autoencoder (MFA loss): 2-D encodings')
plt.xlabel('latent dimension 0')
plt.ylabel('latent dimension 1')
plt.show()

In [None]:
autoencoder_normal = Autoencoder_normal(advanced=True)
autoencoder_normal.compile(loss='MSE', optimizer="adam")

# Keep only the weights of the epoch with the lowest validation loss.
AAE_cp_cb = k.callbacks.ModelCheckpoint(
    filepath="AAE.weights.h5",
    monitor='val_loss',
    mode='min',
    save_weights_only = True,
    save_best_only=True)

AAE_hist = autoencoder_normal.fit(train, train, batch_size=batch_size, epochs=epochs, verbose=0, validation_data=(val, val), callbacks=[AAE_cp_cb])
# A subclassed model has no weights until it has seen data, so the summary is
# only meaningful after fit().
autoencoder_normal.summary()
autoencoder_normal.load_weights("AAE.weights.h5")

# Encode every file with the best checkpoint, save and plot the 2-D codes.
encodings = pd.DataFrame(autoencoder_normal.encoder(freq_table))
encodings['fileID'] = freq_table.index
encodings.to_csv('encodings_AE_advanced.csv')
plt.scatter(encodings[0], encodings[1])
plt.title('Advanced autoencoder (MSE): 2-D encodings')
plt.xlabel('latent dimension 0')
plt.ylabel('latent dimension 1')
plt.show()

In [None]:
autoencoder = Autoencoder(True)
autoencoder.compile(loss=gen_MFA_loss(train, file_lang_dict,k1=5, k2=5), optimizer="adam")

# Keep only the weights of the epoch with the lowest 'val_loss'. This callback
# must run after the validation callbacks in `val_cb` (defined in the simple
# MFA cell), which are what put 'val_loss' into the logs.
AGAE_cp_cb = k.callbacks.ModelCheckpoint(
    filepath="AGAE.weights.h5",
    monitor='val_loss',
    mode='min',
    save_weights_only = True,
    save_best_only=True)
# The MFA weight matrix is built in the row order of `train`, so keep that
# order (shuffle=False); with one full-size batch shuffling adds nothing anyway.
AGAE_hist = autoencoder.fit(train_numbered, train_numbered, batch_size=batch_size, epochs=epochs, verbose=0, shuffle=False, callbacks=[*val_cb, AGAE_cp_cb])
# A subclassed model has no weights until it has seen data, so the summary is
# only meaningful after fit().
autoencoder.summary()
autoencoder.load_weights("AGAE.weights.h5")

# Encode every file with the best checkpoint, save and plot the 2-D codes.
encodings = pd.DataFrame(autoencoder.encoder(freq_table))
encodings['fileID'] = freq_table.index
encodings.to_csv('encodings_MFA_advanced.csv')
plt.scatter(encodings[0], encodings[1])
plt.title('Advanced autoencoder (MFA loss): 2-D encodings')
plt.xlabel('latent dimension 0')
plt.ylabel('latent dimension 1')
plt.show()