In [1]:
# Import libraries - Python 3.8
import os
import numpy as np
import gensim
import pandas as pd
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import sklearn.preprocessing as pp
from scipy.sparse import coo_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Concatenate, Input, InputLayer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow_addons.metrics import F1Score
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import one_hot
import tensorflow.keras.backend as K
import tensorflow as tf
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

from sklearn.model_selection import train_test_split

join = os.path.join

print(gensim.__version__)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\monish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\monish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3.8.3


In [2]:
# Import libraries - Python 3.6
import fasttext
import pandas as pd
import numpy as np
import os
join = os.path.join
import multiprocessing

In [3]:
import sys
print(sys.version)

3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]


### Data loading

In [5]:
root_path = "../final"
# root_path = "./../data/tmp"

In [6]:
# Load files

video1_csv = "baseline1_final.csv"
df_video1 = pd.read_csv(join(root_path, video1_csv), sep='\t')

channel_csv = "baseline_final_channels.csv"
df_channel = pd.read_csv(join(root_path, channel_csv), sep='\t')

In [7]:
print(df_video1.shape)
df_video1.head()

(2120, 4)


Unnamed: 0,caption,title,label,video_id
0,it happened outside waco texas a heavily armed...,the shadow of waco retro report the new york...,0,hOW9AjskoOo
1,thanks for coming its nice to see a good turno...,former abortionist dr levatino destroys procho...,0,dIRcw45n9RU
2,tonight i donald john trump do solemnly swear...,trumps road to the white house full film fron...,0,SMwXKl0odq8
3,this week on buzzfeed unsolved we discuss the...,the strange disappearance of db cooper,0,oHSehKtDyoI
4,im mason noise im 22 and im from birmingham wh...,shockingly offensive auditions have simon cowe...,0,N9COy7O7K-U


In [8]:
print(df_channel.shape)
df_channel.head()

(884, 2)


Unnamed: 0,channel_id,video_ids
0,UCqnbDFdCpuN8CMEg0VuEBqA,"hOW9AjskoOo,uJ44spUo8Uk,-O_DMyHdq_M,U_hbIPJuia..."
1,UCfkzsfj7Go1Q_kRFZmJptsw,dIRcw45n9RU
2,UC3ScyryU9Oy9Wse3a8OAmYQ,"SMwXKl0odq8,AW0gsP3EgDI"
3,UCKijjvu6bN1c-ZHVwR7-5WA,"oHSehKtDyoI,lDeFSOUHdH4,cDZweMXXY6Y,p2EUZ-gwe6..."
4,UC6my_lD3kBECBifeq0n2mdg,"N9COy7O7K-U,DHwpwD-ae7I,74fTHh6jB5Q"


In [9]:
# HARDCODED limits per topic for baseline1 dataset
TOPIC_LIMITS = [0, 430, 901, 1214, 1530, 2120]

In [10]:
captions1_list = df_video1["caption"].to_list()
video1_list = df_video1["video_id"].to_list()

In [11]:
# Convert labels to 2-class
df_video1["label"].replace(-1, 0, inplace=True)

### Captions -> word2vec, extract embeddings

In [12]:
# Load word2vec
word2vec_300 = "GoogleNews-vectors-negative300.bin"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(join(root_path, word2vec_300), binary = True)
word2vec_model.vector_size

FileNotFoundError: [Errno 2] No such file or directory: '../final\\GoogleNews-vectors-negative300.bin'

In [None]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)

In [None]:
# Our earlier preprocessing was done when we were dealing only with word vectors
# Here, we need each document to remain a document 
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word.isalpha()] 
    return doc

In [None]:
# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.vocab for word in doc)

In [None]:
# Filter out documents
def filter_docs(corpus, texts, video_ids, condition_on_doc):
    """
    Filter corpus and texts given the function condition_on_doc which takes a doc. The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    ret_texts, ret_videos = [], []
    if texts is not None:
        for (text, doc, video_id) in zip(texts, corpus, video_ids):
            if condition_on_doc(doc):
                ret_texts.append(text)
                ret_videos.append(video_id)

    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, ret_texts, ret_videos)

In [None]:
def captions_to_word2vec(caption_list, video_list, model):
    # Preprocess the corpus
    corpus = [preprocess(title) for title in caption_list]

    # Remove docs that don't include any words in W2V's vocab
    corpus, titles_list, video_list = filter_docs(corpus, caption_list, video_list, lambda doc: has_vector_representation(model, doc))

    # Filter out any empty docs
    corpus, titles_list, video_list = filter_docs(corpus, caption_list, video_list, lambda doc: (len(doc) != 0))
    x = []
    for doc in corpus: # append the vector for each document
        x.append(document_vector(model, doc))

    X = np.array(x) # list to array
    
    return X, video_list

In [None]:
captions1_X, cap_w2v_videos = captions_to_word2vec(captions1_list, video1_list, word2vec_model)
captions1_X.shape

In [None]:
# Save to file
with open(join(root_path, 'baseline1_word2vec_300.npy'), 'wb') as f:
    np.save(f, captions1_X)

In [13]:
# Load from file
assert os.path.isfile(join(root_path, 'baseline1_word2vec_300.npy'))
captions1_X = np.load(join(root_path, 'baseline1_word2vec_300.npy'))

### Captions -> fastText
Source: https://github.com/kostantinos-papadamou/pseudoscience-paper
Use Python 3.6 to run the following cells.

In [74]:
# Reformat data, save to file
with open(join(root_path, 'fasttext_captions.txt'), 'w') as f:
    for item in captions1_list:
        f.write("%s\n" % item)

In [75]:
# Fine tune fasttext model, save to file
fasttext_models_filename = 'fasttext_model_finetuned.bin'
ft_model = fasttext.train_unsupervised(
    input=join(root_path, 'fasttext_captions.txt'),
    pretrainedVectors=join(root_path, 'wiki-news-300d-1M.vec'),
    dim=300,
    minn=2,
    maxn=5,
    verbose=2)
ft_model.save_model(join(root_path, fasttext_models_filename))
del ft_model

ValueError: ../final\wiki-news-300d-1M.vec cannot be opened for loading!

In [14]:
# Get caption features
ft_model = fasttext.load_model(join(root_path, "fasttext_model_finetuned-002.bin"))
ft_features = []
for caption in captions1_list:
    ft_features += [ft_model.get_sentence_vector(text=caption)]
ft_features = np.array(ft_features)
print(ft_features.shape)
del ft_model



(2120, 300)


In [79]:
# # Save to file
# with open(join(root_path, 'baseline1_fasttext_300.npy'), 'wb') as f:
#     np.save(f, ft_features)

In [15]:
# Load from file
assert os.path.isfile(join(root_path, 'baseline1_fasttext_300.npy'))
ft_features = np.load(join(root_path, 'baseline1_fasttext_300.npy'))

### Train deep learning model on fastText, extract embeddings
Source: https://github.com/kostantinos-papadamou/pseudoscience-paper

In [None]:
# Specify GPU environment
gpu_training = True
if gpu_training:
    # Train on GPU
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
else:
    # Train on CPU
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [16]:
# Retrieve labels
ft_labels = df_video1["label"].to_numpy()
ft_labels[ft_labels == -1] = 0  # make two class
ft_labels.shape

(2120,)

In [None]:
# Initialize model
class PsuedoscienceDeepLearningModel(object):
    
    def __init__(self):
        # Initialize hyperparameters
        self.embedding_dim = 300
        self.dropout = 0.5
        self.learning_rate = 1e-3
        self.val_split_size = 0.2
        self.num_epochs = 100
        self.num_folds = 10  # for k-fold cross validation
        self.batch_size = 20
        self.shuffle_train_set = True
        self.oversampling = True
        self.num_classes = 2
        
        self.model = self.build_model()
    
    def build_model(self):
        seq = Sequential()
        seq.add(Dense(units=256, activation='relu', name='fully_connected_1', input_shape=(self.embedding_dim,)))
        seq.add(Dropout(rate=self.dropout, name='dropout_layer_1'))
        seq.add(Dense(units=128, activation='relu', name='fully_connected_2'))
        seq.add(Dropout(rate=self.dropout, name='dropout_layer_2'))
        seq.add(Dense(units=64, activation='relu', name='fully_connected_3'))
        seq.add(Dropout(rate=self.dropout, name='dropout_layer_3'))
        seq.add(Dense(units=32, activation='relu', name='fully_connected_4'))
        seq.add(Dropout(rate=self.dropout, name='dropout_layer_4'))
        seq.add(Dense(units=self.num_classes, activation='softmax', name='classification_layer'))
        seq.compile(loss=BinaryCrossentropy(from_logits=False),
                    optimizer=Adam(lr=self.learning_rate),
                    metrics=[F1Score(num_classes=2)])
        return seq
    
    def summary(self):
        return self.model.summary()
    
    def get_model(self):
        return self.model

model = PsuedoscienceDeepLearningModel()
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 fully_connected_1 (Dense)   (None, 256)               77056     
                                                                 
 dropout_layer_1 (Dropout)   (None, 256)               0         
                                                                 
 fully_connected_2 (Dense)   (None, 128)               32896     
                                                                 
 dropout_layer_2 (Dropout)   (None, 128)               0         
                                                                 
 fully_connected_3 (Dense)   (None, 64)                8256      
                                                                 
 dropout_layer_3 (Dropout)   (None, 64)                0         
                                                                 
 fully_connected_4 (Dense)   (None, 32)              

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Implement early stopping
early_stopping = EarlyStopping(monitor='val_loss',
                               verbose=2,
                               mode='auto',
                               restore_best_weights=True)

In [68]:
# Train model
print('/n---Training the Model with {} videos.'.format(ft_features.shape[0]))

X_train, X_test, y_train, y_test = train_test_split(
    ft_features, ft_labels, test_size=model.val_split_size)

# Oversampling
if model.oversampling:
    smote = SMOTE(sampling_strategy='not majority')
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print('--- [AFTER OVER-SAMPLING] TRAIN: %d' % (y_train.shape[0]))

# Convert labels to 1-hot
y_train = one_hot(y_train, model.num_classes)
y_test = one_hot(y_test, model.num_classes)

# Train the model
model_train_input = [X_train]
model_val_input = [X_test]
m = model.get_model()
m.fit(model_train_input,
      y_train,
      epochs=model.num_epochs,
      batch_size=model.batch_size,
      validation_data=[model_val_input, y_test],
      shuffle=model.shuffle_train_set,
      verbose=1,
      callbacks=[early_stopping])

# Save trained model
m.save(join(root_path, 'nn.hdf5'))

NameError: name 'ft_features' is not defined

In [17]:
# Load best model
best_model = load_model(join(root_path, "nn_75.hdf5"))

In [18]:
# Retrieve layer outputs
layer_name = 'fully_connected_4'
intermediate_model = Model(inputs=best_model.input,
                          outputs=best_model.get_layer(layer_name).output)
intermediate_output = intermediate_model.predict(ft_features)
intermediate_output.shape

(2120, 32)

In [83]:
# Save to file


In [19]:
# Load from file
assert os.path.isfile(join(root_path, 'baseline1_nn_32.npy'))
intermediate_output = np.load(join(root_path, 'baseline1_nn_32.npy'))

### Load embeddings from our approach

In [9]:
X_novel = np.load(join(root_path, 'bi_embedding.npy'))

In [154]:
# # Convert from ragged tensor to normal numpy array
# X_novel = np.zeros((ce.shape[0], ce[0][0].shape[0]))
# for i in range(ce.shape[0]):
#     X_novel[i, :] = ce[i][0]
# X_novel.shape

### Normalize CE

In [10]:
for i in range(X_novel.shape[0]):
    X_novel[i] = X_novel[i] / np.linalg.norm(X_novel[i])

In [11]:
X_novel[:3, :3]

array([[-0.00104776, -0.00723376,  0.01471311],
       [-0.00116783, -0.00725456,  0.01445016],
       [-0.00166335, -0.00807987,  0.01418412]], dtype=float32)

### Compute similarity between channels

In [24]:
def cosine_similarities(mat):
    col_normed_mat = pp.normalize(coo_matrix(mat.T).tocsc(), axis=0)
    return col_normed_mat.T * col_normed_mat

In [25]:
def compute_ground_truth_diff_for_channels(baseline_final, baseline_channels):
    
    num_channels = len(baseline_channels)
    channel_ground_truth_proportion = np.empty(num_channels, dtype='float32')

    for index, row in baseline_channels.iterrows():
        videos = row['video_ids'].split(',')

        channel_videos = baseline_final[baseline_final["video_id"].isin(videos)]
        proportion = np.mean(channel_videos["label"].to_numpy())
        channel_ground_truth_proportion[index] = proportion
    
    return channel_ground_truth_proportion

In [26]:
def compute_ground_truth_diff(p):
    return np.abs(p[:, np.newaxis] - p)

In [27]:
# Compute ground truth proportion differences for channels
misinfo_proportions = compute_ground_truth_diff_for_channels(df_video1, df_channel)
proportion_diffs = compute_ground_truth_diff(misinfo_proportions)
proportion_diffs.shape

(884, 884)

In [28]:
def compute_channel_embeddings(df_video, df_channel, video_embeddings):
    
    num_channels = len(df_channel)
    embedding_size = video_embeddings.shape[1]
    channel_embeddings = np.empty((num_channels, embedding_size), dtype='float32')
    
    for index, row in df_channel.iterrows():
        videos = row['video_ids'].split(',')
        
        video_i = df_video.index[df_video["video_id"].isin(videos)]
        channel_embeddings[index, :] = np.mean(video_embeddings[video_i, :], axis=0)
    
    return channel_embeddings

In [18]:
# Compute channel embeddings for baselines
word2vec_channel_emb = compute_channel_embeddings(df_video1, df_channel, captions1_X)
nn_channel_emb = compute_channel_embeddings(df_video1, df_channel, intermediate_output)
novel_channel_emb = compute_channel_embeddings(df_video1, df_channel, X_novel)
word2vec_channel_emb.shape, nn_channel_emb.shape, novel_channel_emb.shape

((884, 300), (884, 32), (884, 768))

In [None]:
# Compute cosine similarities
word2vec_channel_sim = cosine_similarities(word2vec_channel_emb).toarray()
nn_channel_sim = cosine_similarities(nn_channel_emb).toarray()
novel_channel_sim = cosine_similarities(novel_channel_emb).toarray()
word2vec_channel_sim.shape, nn_channel_sim.shape, novel_channel_sim.shape

((884, 884), (884, 884), (884, 884))

In [None]:
# All cosine similarities must be between 0 and 1

word2vec_channel_sim[word2vec_channel_sim > 1.0] = 1.0
word2vec_channel_sim[word2vec_channel_sim < 0.0] = 0.0
nn_channel_sim[nn_channel_sim > 1.0] = 1.0
nn_channel_sim[nn_channel_sim < 0.0] = 0.0
novel_channel_sim[novel_channel_sim > 1.0] = 1.0
novel_channel_sim[novel_channel_sim < 0.0] = 0.0

print(word2vec_channel_sim.min(), word2vec_channel_sim.max())
print(nn_channel_sim.min(), nn_channel_sim.max())
print(novel_channel_sim.min(), novel_channel_sim.max())

0.09198472 1.0
0.0 1.0
0.94883376 1.0


In [22]:
# Filter channels according to number of videos
# A minimum of 3 is required for fair comparison
filtered_indices = []
filtered_misinfo_indices = []
filtered_nonmisinfo_indices = []
for index, row in df_channel.iterrows():
    videos = row['video_ids'].split(',')
    if len(videos) >= 3:
        filtered_indices += [index]
        
        channel_videos = df_video1[df_video1["video_id"].isin(videos)]
        if (channel_videos['label'] == 1).sum() > 0:
            filtered_misinfo_indices += [index]
        else:
            filtered_nonmisinfo_indices += [index]

len(filtered_indices), len(filtered_misinfo_indices), len(filtered_nonmisinfo_indices)

(162, 37, 125)

In [29]:
# Filter proportion diff and similarity matrices accordingly
pd_all = proportion_diffs[filtered_indices, :][:, filtered_indices]
pd_misinfo = proportion_diffs[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
pd_nonmisinfo = proportion_diffs[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
w2v_sim_all = word2vec_channel_sim[filtered_indices, :][:, filtered_indices]
w2v_sim_misinfo = word2vec_channel_sim[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
w2v_sim_nonmisinfo = word2vec_channel_sim[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
nn_sim_all = nn_channel_sim[filtered_indices, :][:, filtered_indices]
nn_sim_misinfo = nn_channel_sim[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
nn_sim_nonmisinfo = nn_channel_sim[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
novel_sim_all = novel_channel_sim[filtered_indices, :][:, filtered_indices]
novel_sim_misinfo = novel_channel_sim[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
novel_sim_nonmisinfo = novel_channel_sim[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
pd_all.shape, pd_misinfo.shape, pd_nonmisinfo.shape

NameError: name 'word2vec_channel_sim' is not defined

In [43]:
# Extract upper diagonals as flat matrix

ind_all = np.triu_indices(len(filtered_indices), k=1)
ind_misinfo = np.triu_indices(len(filtered_misinfo_indices), k=1)
ind_nonmisinfo = np.triu_indices(len(filtered_nonmisinfo_indices), k=1)
pd_all_ud = pd_all[ind_all]
pd_misinfo_ud = pd_misinfo[ind_misinfo]
pd_nonmisinfo_ud = pd_nonmisinfo[ind_nonmisinfo]

pd_both = proportion_diffs[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]
pd_both_ud = pd_both.flatten()



In [None]:
pd_all_ud = pd_all[ind_all]
pd_misinfo_ud = pd_misinfo[ind_misinfo]
pd_nonmisinfo_ud = pd_nonmisinfo[ind_nonmisinfo]

w2v_sim_all_ud = w2v_sim_all[ind_all]
w2v_sim_misinfo_ud = w2v_sim_misinfo[ind_misinfo]
w2v_sim_nonmisinfo_ud = w2v_sim_nonmisinfo[ind_nonmisinfo]

nn_sim_all_ud = nn_sim_all[ind_all]
nn_sim_misinfo_ud = nn_sim_misinfo[ind_misinfo]
nn_sim_nonmisinfo_ud = nn_sim_nonmisinfo[ind_nonmisinfo]

novel_sim_all_ud = novel_sim_all[ind_all]
novel_sim_misinfo_ud = novel_sim_misinfo[ind_misinfo]
novel_sim_nonmisinfo_ud = novel_sim_nonmisinfo[ind_nonmisinfo]

pd_all_ud.shape, pd_misinfo_ud.shape, pd_nonmisinfo_ud.shape

In [24]:
# Extract cosine similarities of (misinfo, nonmisinfo) pairs
# This needs to be done separately as it won't be a square matrix

pd_both = proportion_diffs[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]
w2v_sim_both = word2vec_channel_sim[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]
nn_sim_both = nn_channel_sim[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]
novel_sim_both = novel_channel_sim[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]

# Naming convention: _ud to be similar to other variables
# No actual upper triangular calculation here
pd_both_ud = pd_both.flatten()
w2v_sim_both_ud = w2v_sim_both.flatten()
nn_sim_both_ud = nn_sim_both.flatten()
novel_sim_both_ud = novel_sim_both.flatten()

pd_both_ud.shape, w2v_sim_both_ud.shape, nn_sim_both_ud.shape, novel_sim_both_ud.shape

((4625,), (4625,), (4625,), (4625,))

In [36]:
def result_analysis_by_range(proportion_diffs_ud, sim_ud):

    ranges = [(0.0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.01)]
    for begin, end in ranges:

        ind = np.argwhere((proportion_diffs_ud >= begin) & (proportion_diffs_ud < end))
        
        if ind.shape[0] > 0:
            #proportion_diffs_i = proportion_diffs_ud[ind]
            sim_i = sim_ud[ind]

            print("For range", begin, "to", end, " - ", sim_i.shape[0], "pairs")
            #print("25th percentile =", np.percentile(sim_i, 25))
            #print("50th percentile =", np.percentile(sim_i, 50))
            #print("75th percentile =", np.percentile(sim_i, 75))
            print("{0},{1},{2}".format(np.percentile(sim_i, 25), np.percentile(sim_i, 50), np.percentile(sim_i, 75)))
        else:
            print("No pairs for range", begin, "to", end)

#### Results: any label to any label

In [26]:
result_analysis_by_range(pd_all_ud, w2v_sim_all_ud)

For range 0.0 to 0.2  -  9133 pairs
0.9259091019630432,0.9569250345230103,0.9747517108917236
For range 0.2 to 0.4  -  1750 pairs
0.9337780624628067,0.9588366746902466,0.973519578576088
For range 0.4 to 0.6  -  271 pairs
0.9456446170806885,0.9669362306594849,0.9810445010662079
For range 0.6 to 0.8  -  1214 pairs
0.9366919100284576,0.9611344337463379,0.9744926393032074
For range 0.8 to 1.01  -  673 pairs
0.9479347467422485,0.9659979343414307,0.9776875376701355


In [27]:
result_analysis_by_range(pd_all_ud, nn_sim_all_ud)

For range 0.0 to 0.2  -  9133 pairs
0.9773192405700684,0.9969475269317627,0.9994640350341797
For range 0.2 to 0.4  -  1750 pairs
0.5526943355798721,0.991633415222168,0.9985706955194473
For range 0.4 to 0.6  -  271 pairs
0.5335564315319061,0.5732368230819702,0.9283021092414856
For range 0.6 to 0.8  -  1214 pairs
0.07796964980661869,0.9157878458499908,0.9894870817661285
For range 0.8 to 1.01  -  673 pairs
0.008336368948221207,0.05354585871100426,0.3391764461994171


In [28]:
result_analysis_by_range(pd_all_ud, novel_sim_all_ud)

For range 0.0 to 0.2  -  9133 pairs
0.9989447593688965,0.9996557831764221,0.9999077916145325
For range 0.2 to 0.4  -  1750 pairs
0.9990879148244858,0.9997088611125946,0.9999207705259323
For range 0.4 to 0.6  -  271 pairs
0.9992295801639557,0.9997743964195251,0.9999341368675232
For range 0.6 to 0.8  -  1214 pairs
0.9990362375974655,0.9996720552444458,0.9999110102653503
For range 0.8 to 1.01  -  673 pairs
0.9991890788078308,0.9997066259384155,0.9999141693115234


#### Results: misinfo to misinfo

In [29]:
result_analysis_by_range(pd_misinfo_ud, w2v_sim_misinfo_ud)

For range 0.0 to 0.2  -  258 pairs
0.9506474882364273,0.9678962230682373,0.9787565022706985
For range 0.2 to 0.4  -  125 pairs
0.9571177363395691,0.9734190702438354,0.9796338677406311
For range 0.4 to 0.6  -  146 pairs
0.9452119767665863,0.9641998410224915,0.9775935262441635
For range 0.6 to 0.8  -  89 pairs
0.9602387547492981,0.971376895904541,0.9801899194717407
For range 0.8 to 1.01  -  48 pairs
0.9673985242843628,0.9770538508892059,0.9858627766370773


In [30]:
result_analysis_by_range(pd_misinfo_ud, nn_sim_misinfo_ud)

For range 0.0 to 0.2  -  258 pairs
0.2811216786503792,0.9841984510421753,0.9979343265295029
For range 0.2 to 0.4  -  125 pairs
0.4025932252407074,0.9171246886253357,0.9972336292266846
For range 0.4 to 0.6  -  146 pairs
0.13459418341517448,0.9181327819824219,0.9896990805864334
For range 0.6 to 0.8  -  89 pairs
0.0458090677857399,0.36094900965690613,0.9177894592285156
For range 0.8 to 1.01  -  48 pairs
0.02807043818756938,0.058897823095321655,0.10355405882000923


In [31]:
result_analysis_by_range(pd_misinfo_ud, novel_sim_misinfo_ud)

For range 0.0 to 0.2  -  258 pairs
0.9988691210746765,0.9996564388275146,0.9999011904001236
For range 0.2 to 0.4  -  125 pairs
0.9991981983184814,0.9997649192810059,0.9999457001686096
For range 0.4 to 0.6  -  146 pairs
0.9989419877529144,0.9997016787528992,0.9999076128005981
For range 0.6 to 0.8  -  89 pairs
0.9989882707595825,0.9996993541717529,0.9998793005943298
For range 0.8 to 1.01  -  48 pairs
0.9991359114646912,0.999737948179245,0.9998822808265686


#### Results: nonmisinfo to nonmisinfo

In [32]:
result_analysis_by_range(pd_nonmisinfo_ud, w2v_sim_nonmisinfo_ud)

For range 0.0 to 0.2  -  7750 pairs
0.9217846244573593,0.9540671706199646,0.9731397479772568
No pairs for range 0.2 to 0.4
No pairs for range 0.4 to 0.6
No pairs for range 0.6 to 0.8
No pairs for range 0.8 to 1.01


In [33]:
result_analysis_by_range(pd_nonmisinfo_ud, nn_sim_nonmisinfo_ud)

For range 0.0 to 0.2  -  7750 pairs
0.976519450545311,0.9969258010387421,0.9995447993278503
No pairs for range 0.2 to 0.4
No pairs for range 0.4 to 0.6
No pairs for range 0.6 to 0.8
No pairs for range 0.8 to 1.01


In [34]:
result_analysis_by_range(pd_nonmisinfo_ud, novel_sim_nonmisinfo_ud)

For range 0.0 to 0.2  -  7750 pairs
0.9990141540765762,0.9996667802333832,0.9999121278524399
No pairs for range 0.2 to 0.4
No pairs for range 0.4 to 0.6
No pairs for range 0.6 to 0.8
No pairs for range 0.8 to 1.01


#### Results: misinfo to nonmisinfo

In [35]:
result_analysis_by_range(pd_both_ud, w2v_sim_both_ud)

For range 0.0 to 0.2  -  1125 pairs
0.9460095763206482,0.9685550332069397,0.9811440110206604
For range 0.2 to 0.4  -  1625 pairs
0.9324469566345215,0.9578205943107605,0.9724388122558594
For range 0.4 to 0.6  -  125 pairs
0.9456844329833984,0.9709511399269104,0.9843785762786865
For range 0.6 to 0.8  -  1125 pairs
0.9349206686019897,0.960141122341156,0.9735148549079895
For range 0.8 to 1.01  -  625 pairs
0.9469743967056274,0.9641695618629456,0.9771322011947632


In [36]:
result_analysis_by_range(pd_both_ud, nn_sim_both_ud)

For range 0.0 to 0.2  -  1125 pairs
0.991191565990448,0.9977689385414124,0.9992472529411316
For range 0.2 to 0.4  -  1625 pairs
0.6346403360366821,0.9918893575668335,0.9986367225646973
For range 0.4 to 0.6  -  125 pairs
0.5360397696495056,0.5589452981948853,0.6129161715507507
For range 0.6 to 0.8  -  1125 pairs
0.08983024209737778,0.9198633432388306,0.9897669553756714
For range 0.8 to 1.01  -  625 pairs
0.008286419324576855,0.052968502044677734,0.3504315912723541


In [37]:
result_analysis_by_range(pd_both_ud, novel_sim_both_ud)

For range 0.0 to 0.2  -  1125 pairs
0.9979409575462341,0.9995355010032654,0.9998689889907837
For range 0.2 to 0.4  -  1625 pairs
0.9990849494934082,0.9997081160545349,0.9999186396598816
For range 0.4 to 0.6  -  125 pairs
0.9994762539863586,0.9998292326927185,0.9999659657478333
For range 0.6 to 0.8  -  1125 pairs
0.9990374445915222,0.9996687173843384,0.999912440776825
For range 0.8 to 1.01  -  625 pairs
0.999190628528595,0.9997054934501648,0.9999157190322876


In [20]:
def chunk_to_chunk(df_video, df_channel, chunk_embeddings):
    num_channels = len(df_channel)
    channel_embeddings = [[] for _ in range(num_channels)]
    
    for index, row in df_channel.iterrows():
        videos = row['video_ids'].split(',')
        video_i = df_video.index[df_video["video_id"].isin(videos)]  
        chunk = chunk_embeddings[video_i]
        for elem in chunk:
            channel_embeddings[index] += elem

    return np.array(channel_embeddings, dtype=object)
from tqdm.notebook import trange

t = 0.75
def chunk_cosine_similarities(mat):
    cos_sim = np.zeros((mat.shape[0], mat.shape[0]))    
    for i in trange(mat.shape[0]):
        for j in range(mat.shape[0]):
            cos_mat = cosine_similarity(mat[i], mat[j])
            cos_mat = cos_mat.flatten()
            cos_mat_len = len(cos_mat)
            m = cos_mat[cos_mat > t]
            cos_mat_max = len(m) 
            cos_mat_min = len(cos_mat[cos_mat <= t])

            # cos_mat_max = (cos_mat_len - cos_mat_max) / (0.5 * (cos_mat_max + cos_mat_len))
            # cos_mat_min = (cos_mat_len - cos_mat_min) / (0.5 * (cos_mat_min + cos_mat_len))
            if cos_mat_max <= cos_mat_min:
                cos_sim[i, j] = 0.0
            else:
                cos_sim[i, j] = m.min()

            # if len(cos_mat) == 0:
            #     cos_sim[i, j] = 0.0
            # else:
            #     cos_sim[i, j] = np.min(cos_mat)
    return cos_sim

chunk_embeddings = np.load(join(root_path, 'bi2_embedding.npy'), allow_pickle=True)
print(chunk_embeddings.shape)
print(len(chunk_embeddings[0][0]))

chunked_channel_emb = chunk_to_chunk(df_video1, df_channel, chunk_embeddings)
from sklearn.metrics.pairwise import cosine_similarity


chunk_cos_sim = chunk_cosine_similarities(chunked_channel_emb)
print(chunk_cos_sim.min(), chunk_cos_sim.max())

(2120,)
768


  0%|          | 0/884 [00:00<?, ?it/s]

0.9239965677261353 1.000000238418579


In [33]:

chunk_sim_all = chunk_cos_sim[filtered_indices, :][:, filtered_indices]
chunk_sim_misinfo = chunk_cos_sim[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
chunk_sim_nonmisinfo = chunk_cos_sim[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
chunk_sim_both = chunk_cos_sim[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]

chunk_sim_all_ud = chunk_sim_all[ind_all]
chunk_sim_all_minsinfo_ud = chunk_sim_misinfo[ind_misinfo]
chunk_sim_all_nonmisinfo_ud = chunk_cos_sim[ind_nonmisinfo]
chunk_sim_both_ud = chunk_sim_both.flatten()

In [37]:
result_analysis_by_range(pd_all_ud, chunk_sim_all_ud)

For range 0.0 to 0.2  -  9133 pairs
0.9770883917808533,0.9831773638725281,0.987862229347229
For range 0.2 to 0.4  -  1750 pairs
0.976630374789238,0.983600378036499,0.9880243092775345
For range 0.4 to 0.6  -  271 pairs
0.9796615242958069,0.9856457710266113,0.989536464214325
For range 0.6 to 0.8  -  1214 pairs
0.9799032807350159,0.9858368933200836,0.9902356266975403
For range 0.8 to 1.01  -  673 pairs
0.9770849943161011,0.9861478805541992,0.9899656772613525


In [44]:
result_analysis_by_range(pd_misinfo_ud, chunk_sim_all_minsinfo_ud)


For range 0.0 to 0.2  -  258 pairs
0.9717417508363724,0.9802733957767487,0.9864931106567383
For range 0.2 to 0.4  -  125 pairs
0.9769965410232544,0.9847503900527954,0.9900976419448853
For range 0.4 to 0.6  -  146 pairs
0.9770472943782806,0.9829241037368774,0.9885358661413193
For range 0.6 to 0.8  -  89 pairs
0.9706945419311523,0.9820359945297241,0.9884343147277832
For range 0.8 to 1.01  -  48 pairs
0.9776507616043091,0.9848083555698395,0.98786860704422


In [47]:
result_analysis_by_range(pd_nonmisinfo_ud, chunk_sim_all_nonmisinfo_ud)

For range 0.0 to 0.2  -  7750 pairs
0.9828147441148758,0.9888578355312347,0.9932411760091782
No pairs for range 0.2 to 0.4
No pairs for range 0.4 to 0.6
No pairs for range 0.6 to 0.8
No pairs for range 0.8 to 1.01


In [46]:
result_analysis_by_range(pd_both_ud, chunk_sim_both_ud)

For range 0.0 to 0.2  -  1125 pairs
0.9719547033309937,0.9777891635894775,0.982799232006073
For range 0.2 to 0.4  -  1625 pairs
0.9765671491622925,0.9835426807403564,0.9878278970718384
For range 0.4 to 0.6  -  125 pairs
0.9843184947967529,0.9873199462890625,0.9904522895812988
For range 0.6 to 0.8  -  1125 pairs
0.9804402589797974,0.9860204458236694,0.990294337272644
For range 0.8 to 1.01  -  625 pairs
0.9769406914710999,0.9863744974136353,0.9900457859039307
