In [None]:
import os
import pickle
import numpy as np
import faiss
from faiss import write_index, IndexFlatL2, IndexIVFFlat


In [None]:
def load_emb(enc_type, dir_name='emb_tweet'):
    """
    Load and concatenate all pickled embedding shards of one encoder type.

    Every ``.pkl`` file in ``dir_name`` whose name starts with the prefix
    selected by ``enc_type`` is unpickled. Each file must unpickle to a
    mapping with the keys 'embeddings' and 'ids'; both are appended, file
    after file, to the returned collections.

    Files are visited in sorted filename order. ``os.listdir`` makes no
    ordering guarantee, and callers pair the rows of separate loads by
    position, so the order has to be deterministic.
    NOTE(review): positional pairing across encoder types additionally
    assumes the shards share the same filename suffixes -- confirm.

    Args:
    - enc_type (str): 'img' selects files prefixed 'enc_'; any other value
      selects files prefixed '<enc_type>_'.
    - dir_name (str): Directory to scan (default is 'emb_tweet').

    Returns:
    - tuple: (numpy.ndarray built from all loaded embeddings, list of all
      loaded ids), both in file order.
    """
    file_pfx = 'enc_' if enc_type == 'img' else f'{enc_type}_'

    embs = []
    ids = []

    for file in sorted(os.listdir(dir_name)):
        if file.endswith('.pkl') and file.startswith(file_pfx):
            # SECURITY: pickle.load can execute arbitrary code; only point
            # this function at shard files from a trusted source.
            with open(os.path.join(dir_name, file), "rb") as f_in:
                data = pickle.load(f_in)
            embs.extend(data['embeddings'])
            ids.extend(data['ids'])

    return np.array(embs), ids


In [None]:
def preprocess(data1, data2, lk):
    """
    Fuse two embedding matrices and convert the likes to an array.

    Args:
    - data1 (numpy.ndarray): First 2-D embedding matrix, one sample per row.
    - data2 (numpy.ndarray): Second 2-D embedding matrix with the same
      number of rows as ``data1``.
    - lk (sequence): Likes values.

    Returns:
    - tuple: (``data1`` and ``data2`` joined column-wise, ``lk`` as a
      numpy.ndarray).
    """
    # Row i of the result is row i of data1 followed by row i of data2.
    fused = np.concatenate([data1, data2], axis=1)
    likes_arr = np.array(lk)
    return fused, likes_arr

In [None]:
def build_index(embeddings, nlist=40):
    """
    Build a trained faiss IVF-Flat index (L2 metric) over ``embeddings``.

    Args:
    - embeddings (numpy.ndarray): 2-D array with one vector per row.
    - nlist (int): Number of inverted lists (coarse k-means cells) used by
      the index (default is 40). faiss needs at least ``nlist`` training
      vectors.

    Returns:
    - IndexIVFFlat: Trained index holding every row of ``embeddings``;
      vectors are added in row order, so row ``i`` gets id ``i``.
    """
    # faiss works on C-contiguous float32 matrices; normalise the input so
    # float64 or non-contiguous arrays are accepted as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]

    quantizer = IndexFlatL2(dim)
    index = IndexIVFFlat(quantizer, dim, nlist)
    index.train(vectors)
    index.add(vectors)
    return index

In [None]:
def calc_error(test, index, train_lk, test_lk, k=15):
    """
    Calculate the Root Mean Squared Error (RMSE) of a k-NN likes predictor.

    For each test embedding the k nearest training embeddings are looked up
    in ``index``. Their likes are trimmed with a percentile fence and
    averaged to give the predicted likes, which are compared with the
    actual likes of that test sample.

    Args:
    - test (numpy.ndarray): 2-D array of test embeddings, one per row.
    - index: Nearest-neighbour index over the training data exposing
      ``search(queries, k) -> (distances, ids)`` (faiss-style), where the
      returned ids are positions into ``train_lk``.
    - train_lk (numpy.ndarray): Array of training likes corresponding to the
      indexed embeddings.
    - test_lk (numpy.ndarray): Array of actual likes for the test embeddings.
    - k (int): Number of nearest neighbors to consider (default is 15).

    Returns:
    - float: Root Mean Squared Error (RMSE) between predicted and actual
      likes; ``nan`` when ``test`` is empty.
    """
    train_lk = np.asarray(train_lk)

    # List to store squared errors
    sq_err = []

    # Iterate over each test embedding
    for i, emb in enumerate(test):
        # Find the k-nearest neighbors in the training data (faiss expects
        # a 2-D float32 query matrix).
        _, I = index.search(np.array([emb], dtype=np.float32), k)

        # faiss pads the result with id -1 when fewer than k neighbors are
        # found; drop those so they do not silently select train_lk[-1].
        neighbor_ids = np.asarray(I[0])
        neighbor_ids = neighbor_ids[neighbor_ids >= 0]

        if neighbor_ids.size == 0:
            # No neighbor at all: fall back to the mean training likes
            # instead of averaging an empty array (which would yield nan).
            pred_likes = np.mean(train_lk)
        else:
            # Extract likes of the nearest neighbors
            nearest = train_lk[neighbor_ids]

            # Fence built from the 20th/80th percentiles (a wider variant
            # of the usual quartile-based IQR rule) for outlier removal
            q_low = np.percentile(nearest, 20)
            q_high = np.percentile(nearest, 80)
            spread = q_high - q_low
            lower = q_low - 1.5 * spread
            upper = q_high + 1.5 * spread

            # Filter out outliers and average what is left
            filt_likes = nearest[(nearest >= lower) & (nearest <= upper)]
            pred_likes = np.mean(filt_likes)

        # Actual likes for the test sample
        act_likes = test_lk[i]

        # Calculate squared error and append to the list
        sq_err.append((pred_likes - act_likes) ** 2)

    if not sq_err:
        return float('nan')

    return float(np.sqrt(np.mean(sq_err)))


In [None]:
# Load the image embeddings, the text embeddings and the likes from their
# pickled shards (the 'likes' values are read from the 'embeddings' key of
# the likes files, as load_emb does for every type).
embeds1, ids1 = load_emb('img')
embeds2, _ = load_emb('txt')
likes, _ = load_emb('likes')
# NOTE(review): the three loads are paired purely by row position below;
# ids1 is never compared with the ids of the other two loads -- confirm the
# shards are aligned.

# Join the image and text embeddings column-wise; likes becomes an array.
multimodal, likes = preprocess(embeds1, embeds2, likes)

# Positional 80/20 split: the first 80% of rows train, the rest test.
# NOTE(review): no shuffling is done here -- confirm the row order is not
# correlated with the likes.
tr_size = int(0.8 * len(multimodal))
train = multimodal[:tr_size]
test = multimodal[tr_size:]

train_likes = likes[:tr_size]
test_likes = likes[tr_size:]

# Index over the training rows only, used for the evaluation below.
index = build_index(train)

# Evaluate the nearest-neighbour likes prediction on the held-out rows.
rmse = calc_error(test, index, train_likes, test_likes)
print('Final RMSE:', rmse)


# Rebuild the index over all rows (train + test) and write it to disk.
fin_index = build_index(multimodal)
write_index(fin_index, 'mm-ind.index')