In [56]:
# Import libraries - Python 3.8
import os
import numpy as np
import gensim
import pandas as pd
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import sklearn.preprocessing as pp
from scipy.sparse import coo_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Concatenate, Input, InputLayer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow_addons.metrics import F1Score
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import one_hot
import tensorflow.keras.backend as K
import tensorflow as tf
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

from sklearn.model_selection import train_test_split

join = os.path.join

print(gensim.__version__)

3.8.3


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\monish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\monish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Import libraries - Python 3.6
import fasttext
import pandas as pd
import numpy as np
import os
join = os.path.join
import multiprocessing

In [58]:
import sys
print(sys.version)

3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]


### Data loading

In [59]:
root_path = "../final"

In [60]:
# Load files

video1_csv = "baseline1_final.csv"
df_video1 = pd.read_csv(join(root_path, video1_csv), sep='\t')

channel_csv = "baseline_final_channels.csv"
df_channel = pd.read_csv(join(root_path, channel_csv), sep='\t')

In [61]:
print(df_video1.shape)
df_video1.head()

(2120, 4)


Unnamed: 0,caption,title,label,video_id
0,it happened outside waco texas a heavily armed...,the shadow of waco retro report the new york...,0,hOW9AjskoOo
1,thanks for coming its nice to see a good turno...,former abortionist dr levatino destroys procho...,0,dIRcw45n9RU
2,tonight i donald john trump do solemnly swear...,trumps road to the white house full film fron...,0,SMwXKl0odq8
3,this week on buzzfeed unsolved we discuss the...,the strange disappearance of db cooper,0,oHSehKtDyoI
4,im mason noise im 22 and im from birmingham wh...,shockingly offensive auditions have simon cowe...,0,N9COy7O7K-U


In [62]:
print(df_channel.shape)
df_channel.head()

(884, 2)


Unnamed: 0,channel_id,video_ids
0,UCqnbDFdCpuN8CMEg0VuEBqA,"hOW9AjskoOo,uJ44spUo8Uk,-O_DMyHdq_M,U_hbIPJuia..."
1,UCfkzsfj7Go1Q_kRFZmJptsw,dIRcw45n9RU
2,UC3ScyryU9Oy9Wse3a8OAmYQ,"SMwXKl0odq8,AW0gsP3EgDI"
3,UCKijjvu6bN1c-ZHVwR7-5WA,"oHSehKtDyoI,lDeFSOUHdH4,cDZweMXXY6Y,p2EUZ-gwe6..."
4,UC6my_lD3kBECBifeq0n2mdg,"N9COy7O7K-U,DHwpwD-ae7I,74fTHh6jB5Q"


In [63]:
# HARDCODED limits per topic for baseline1 dataset
TOPIC_LIMITS = [0, 430, 901, 1214, 1530, 2120]

In [64]:
captions1_list = df_video1["caption"].to_list()
video1_list = df_video1["video_id"].to_list()

### Captions -> word2vec, extract embeddings

In [65]:
# Load word2vec
word2vec_300 = "GoogleNews-vectors-negative300.bin"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(join(root_path, word2vec_300), binary = True)
word2vec_model.vector_size

FileNotFoundError: [Errno 2] No such file or directory: '../final\\GoogleNews-vectors-negative300.bin'

In [None]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)

In [None]:
# Our earlier preprocessing was done when we were dealing only with word vectors
# Here, we need each document to remain a document 
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word.isalpha()] 
    return doc

In [None]:
# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.vocab for word in doc)

In [None]:
# Filter out documents
def filter_docs(corpus, texts, video_ids, condition_on_doc):
    """
    Filter corpus and texts given the function condition_on_doc which takes a doc. The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    ret_texts, ret_videos = [], []
    if texts is not None:
        for (text, doc, video_id) in zip(texts, corpus, video_ids):
            if condition_on_doc(doc):
                ret_texts.append(text)
                ret_videos.append(video_id)

    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, ret_texts, ret_videos)

In [None]:
def captions_to_word2vec(caption_list, video_list, model):
    # Preprocess the corpus
    corpus = [preprocess(title) for title in caption_list]

    # Remove docs that don't include any words in W2V's vocab
    corpus, titles_list, video_list = filter_docs(corpus, caption_list, video_list, lambda doc: has_vector_representation(model, doc))

    # Filter out any empty docs
    corpus, titles_list, video_list = filter_docs(corpus, caption_list, video_list, lambda doc: (len(doc) != 0))
    x = []
    for doc in corpus: # append the vector for each document
        x.append(document_vector(model, doc))

    X = np.array(x) # list to array
    
    return X, video_list

In [None]:
captions1_X, cap_w2v_videos = captions_to_word2vec(captions1_list, video1_list, word2vec_model)
captions1_X.shape

In [None]:
# Save to file
with open(join(root_path, 'baseline1_word2vec_300.npy'), 'wb') as f:
    np.save(f, captions1_X)

In [None]:
# Load from file
assert os.path.isfile(join(root_path, 'baseline1_word2vec_300.npy'))
captions1_X = np.load(join(root_path, 'baseline1_word2vec_300.npy'))

### Captions -> fastText
Source: https://github.com/kostantinos-papadamou/pseudoscience-paper
Use Python 3.6 to run the following cells.

In [74]:
# Reformat data, save to file
with open(join(root_path, 'fasttext_captions.txt'), 'w') as f:
    for item in captions1_list:
        f.write("%s\n" % item)

In [75]:
# Fine tune fasttext model, save to file
fasttext_models_filename = 'fasttext_model_finetuned.bin'
ft_model = fasttext.train_unsupervised(
    input=join(root_path, 'fasttext_captions.txt'),
    pretrainedVectors=join(root_path, 'wiki-news-300d-1M.vec'),
    dim=300,
    minn=2,
    maxn=5,
    verbose=2)
ft_model.save_model(join(root_path, fasttext_models_filename))
del ft_model

ValueError: ../final\wiki-news-300d-1M.vec cannot be opened for loading!

In [78]:
# Get caption features
ft_model = fasttext.load_model(join(root_path, "fasttext_model_finetuned-002.bin"))
ft_features = []
for caption in captions1_list:
    ft_features += [ft_model.get_sentence_vector(text=caption)]
ft_features = np.array(ft_features)
print(ft_features.shape)
del ft_model



(2120, 300)


In [79]:
# Save to file
with open(join(root_path, 'baseline1_fasttext_300.npy'), 'wb') as f:
    np.save(f, ft_features)

In [80]:
# Load from file
assert os.path.isfile(join(root_path, 'baseline1_fasttext_300.npy'))
ft_features = np.load(join(root_path, 'baseline1_fasttext_300.npy'))

### Train deep learning model on fastText, extract embeddings
Source: https://github.com/kostantinos-papadamou/pseudoscience-paper

In [None]:
# Specify GPU environment
gpu_training = True
if gpu_training:
    # Train on GPU
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
else:
    # Train on CPU
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [None]:
# Retrieve labels
ft_labels = df_video1["label"].to_numpy()
ft_labels[ft_labels == -1] = 0  # make two class
ft_labels.shape

(2120,)

In [None]:
# Initialize model
class PsuedoscienceDeepLearningModel(object):
    
    def __init__(self):
        # Initialize hyperparameters
        self.embedding_dim = 300
        self.dropout = 0.5
        self.learning_rate = 1e-3
        self.val_split_size = 0.2
        self.num_epochs = 100
        self.num_folds = 10  # for k-fold cross validation
        self.batch_size = 20
        self.shuffle_train_set = True
        self.oversampling = True
        self.num_classes = 2
        
        self.model = self.build_model()
    
    def build_model(self):
        seq = Sequential()
        seq.add(Dense(units=256, activation='relu', name='fully_connected_1', input_shape=(self.embedding_dim,)))
        seq.add(Dropout(rate=self.dropout, name='dropout_layer_1'))
        seq.add(Dense(units=128, activation='relu', name='fully_connected_2'))
        seq.add(Dropout(rate=self.dropout, name='dropout_layer_2'))
        seq.add(Dense(units=64, activation='relu', name='fully_connected_3'))
        seq.add(Dropout(rate=self.dropout, name='dropout_layer_3'))
        seq.add(Dense(units=32, activation='relu', name='fully_connected_4'))
        seq.add(Dropout(rate=self.dropout, name='dropout_layer_4'))
        seq.add(Dense(units=self.num_classes, activation='softmax', name='classification_layer'))
        seq.compile(loss=BinaryCrossentropy(from_logits=False),
                    optimizer=Adam(lr=self.learning_rate),
                    metrics=[F1Score(num_classes=2)])
        return seq
    
    def summary(self):
        return self.model.summary()
    
    def get_model(self):
        return self.model

model = PsuedoscienceDeepLearningModel()
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 fully_connected_1 (Dense)   (None, 256)               77056     
                                                                 
 dropout_layer_1 (Dropout)   (None, 256)               0         
                                                                 
 fully_connected_2 (Dense)   (None, 128)               32896     
                                                                 
 dropout_layer_2 (Dropout)   (None, 128)               0         
                                                                 
 fully_connected_3 (Dense)   (None, 64)                8256      
                                                                 
 dropout_layer_3 (Dropout)   (None, 64)                0         
                                                                 
 fully_connected_4 (Dense)   (None, 32)              

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Implement early stopping
early_stopping = EarlyStopping(monitor='val_loss',
                               verbose=2,
                               mode='auto',
                               restore_best_weights=True)

In [68]:
# Train model
print('/n---Training the Model with {} videos.'.format(ft_features.shape[0]))

X_train, X_test, y_train, y_test = train_test_split(
    ft_features, ft_labels, test_size=model.val_split_size)

# Oversampling
if model.oversampling:
    smote = SMOTE(sampling_strategy='not majority')
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print('--- [AFTER OVER-SAMPLING] TRAIN: %d' % (y_train.shape[0]))

# Convert labels to 1-hot
y_train = one_hot(y_train, model.num_classes)
y_test = one_hot(y_test, model.num_classes)

# Train the model
model_train_input = [X_train]
model_val_input = [X_test]
m = model.get_model()
m.fit(model_train_input,
      y_train,
      epochs=model.num_epochs,
      batch_size=model.batch_size,
      validation_data=[model_val_input, y_test],
      shuffle=model.shuffle_train_set,
      verbose=1,
      callbacks=[early_stopping])

# Save trained model
m.save(join(root_path, 'nn.hdf5'))

NameError: name 'ft_features' is not defined

In [81]:
# Load best model
best_model = load_model(join(root_path, "nn_75.hdf5"))

In [82]:
# Retrieve layer outputs
layer_name = 'fully_connected_4'
intermediate_model = Model(inputs=best_model.input,
                          outputs=best_model.get_layer(layer_name).output)
intermediate_output = intermediate_model.predict(ft_features)
intermediate_output.shape

(2120, 32)

In [83]:
# Save to file


In [84]:
# Load from file
assert os.path.isfile(join(root_path, 'baseline1_nn_32.npy'))
intermediate_output = np.load(join(root_path, 'baseline1_nn_32.npy'))

### Load embeddings from our approach

In [85]:
ce = np.load(join(root_path, 'caption_embedding.npy'), allow_pickle=True)

In [86]:
# Convert from ragged tensor to normal numpy array
X_novel = np.zeros((ce.shape[0], ce[0][0].shape[0]))
for i in range(ce.shape[0]):
    X_novel[i, :] = ce[i][0]
X_novel.shape

(2120, 384)

### Normalize CE

In [87]:
for i in range(ce.shape[0]):
    X_novel[i] = X_novel[i] / np.linalg.norm(X_novel[i])

In [88]:
X_novel[:3, :3]

array([[-0.02485627,  0.02774838, -0.04501574],
       [ 0.04443581,  0.01227851,  0.08846989],
       [-0.10445649,  0.03703442,  0.06822192]])

### Compute similarity between channels

In [89]:
def cosine_similarities(mat):
    col_normed_mat = pp.normalize(coo_matrix(mat.T).tocsc(), axis=0)
    return col_normed_mat.T * col_normed_mat

In [90]:
def compute_ground_truth_diff_for_channels(baseline_final, baseline_channels):
    
    num_channels = len(baseline_channels)
    channel_ground_truth_proportion = np.empty(num_channels, dtype='float32')

    for index, row in baseline_channels.iterrows():
        videos = row['video_ids'].split(',')

        channel_videos = baseline_final[baseline_final["video_id"].isin(videos)]
        proportion = np.mean(channel_videos["label"].to_numpy())
        channel_ground_truth_proportion[index] = proportion
    
    return channel_ground_truth_proportion

In [91]:
def compute_ground_truth_diff(p):
    return np.abs(p[:, np.newaxis] - p)

In [92]:
# Compute ground truth proportion differences for channels
misinfo_proportions = compute_ground_truth_diff_for_channels(df_video1, df_channel)
proportion_diffs = compute_ground_truth_diff(misinfo_proportions)
proportion_diffs.shape

(884, 884)

In [93]:
def compute_channel_embeddings(df_video, df_channel, video_embeddings):
    
    num_channels = len(df_channel)
    embedding_size = video_embeddings.shape[1]
    channel_embeddings = np.empty((num_channels, embedding_size), dtype='float32')
    
    for index, row in df_channel.iterrows():
        videos = row['video_ids'].split(',')
        
        video_i = df_video.index[df_video["video_id"].isin(videos)]
        channel_embeddings[index, :] = np.mean(video_embeddings[video_i, :], axis=0)
    
    return channel_embeddings

In [94]:
# Compute channel embeddings for baselines
word2vec_channel_emb = compute_channel_embeddings(df_video1, df_channel, captions1_X)
nn_channel_emb = compute_channel_embeddings(df_video1, df_channel, intermediate_output)
novel_channel_emb = compute_channel_embeddings(df_video1, df_channel, X_novel)
word2vec_channel_emb.shape, nn_channel_emb.shape, novel_channel_emb.shape

((884, 300), (884, 32), (884, 384))

In [95]:
# Compute cosine similarities
word2vec_channel_sim = cosine_similarities(word2vec_channel_emb).toarray()
nn_channel_sim = cosine_similarities(nn_channel_emb).toarray()
novel_channel_sim = cosine_similarities(novel_channel_emb).toarray()
word2vec_channel_sim.shape, nn_channel_sim.shape, novel_channel_sim.shape

((884, 884), (884, 884), (884, 884))

In [96]:
# All cosine similarities must be between 0 and 1

word2vec_channel_sim[word2vec_channel_sim > 1.0] = 1.0
word2vec_channel_sim[word2vec_channel_sim < 0.0] = 0.0
nn_channel_sim[nn_channel_sim > 1.0] = 1.0
nn_channel_sim[nn_channel_sim < 0.0] = 0.0
novel_channel_sim[novel_channel_sim > 1.0] = 1.0
novel_channel_sim[novel_channel_sim < 0.0] = 0.0

print(word2vec_channel_sim.min(), word2vec_channel_sim.max())
print(nn_channel_sim.min(), nn_channel_sim.max())
print(novel_channel_sim.min(), novel_channel_sim.max())

0.09198472 1.0
0.0 1.0
0.0 1.0


In [97]:
# Filter channels according to number of videos
# A minimum of 3 is required for fair comparison
filtered_indices = []
filtered_misinfo_indices = []
filtered_nonmisinfo_indices = []
for index, row in df_channel.iterrows():
    videos = row['video_ids'].split(',')
    if len(videos) >= 3:
        filtered_indices += [index]
        
        channel_videos = df_video1[df_video1["video_id"].isin(videos)]
        if (channel_videos['label'] == 1).sum() > 0:
            filtered_misinfo_indices += [index]
        else:
            filtered_nonmisinfo_indices += [index]

len(filtered_indices), len(filtered_misinfo_indices), len(filtered_nonmisinfo_indices)

(162, 37, 125)

In [98]:
# Filter proportion diff and similarity matrices accordingly
pd_all = proportion_diffs[filtered_indices, :][:, filtered_indices]
pd_misinfo = proportion_diffs[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
pd_nonmisinfo = proportion_diffs[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
w2v_sim_all = word2vec_channel_sim[filtered_indices, :][:, filtered_indices]
w2v_sim_misinfo = word2vec_channel_sim[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
w2v_sim_nonmisinfo = word2vec_channel_sim[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
nn_sim_all = nn_channel_sim[filtered_indices, :][:, filtered_indices]
nn_sim_misinfo = nn_channel_sim[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
nn_sim_nonmisinfo = nn_channel_sim[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
novel_sim_all = novel_channel_sim[filtered_indices, :][:, filtered_indices]
novel_sim_misinfo = novel_channel_sim[filtered_misinfo_indices, :][:, filtered_misinfo_indices]
novel_sim_nonmisinfo = novel_channel_sim[filtered_nonmisinfo_indices, :][:, filtered_nonmisinfo_indices]
pd_all.shape, pd_misinfo.shape, pd_nonmisinfo.shape

((162, 162), (37, 37), (125, 125))

In [99]:
# Extract upper diagonals as flat matrix

ind_all = np.triu_indices(len(filtered_indices), k=1)
ind_misinfo = np.triu_indices(len(filtered_misinfo_indices), k=1)
ind_nonmisinfo = np.triu_indices(len(filtered_nonmisinfo_indices), k=1)

pd_all_ud = pd_all[ind_all]
pd_misinfo_ud = pd_misinfo[ind_misinfo]
pd_nonmisinfo_ud = pd_nonmisinfo[ind_nonmisinfo]

w2v_sim_all_ud = w2v_sim_all[ind_all]
w2v_sim_misinfo_ud = w2v_sim_misinfo[ind_misinfo]
w2v_sim_nonmisinfo_ud = w2v_sim_nonmisinfo[ind_nonmisinfo]

nn_sim_all_ud = nn_sim_all[ind_all]
nn_sim_misinfo_ud = nn_sim_misinfo[ind_misinfo]
nn_sim_nonmisinfo_ud = nn_sim_nonmisinfo[ind_nonmisinfo]

novel_sim_all_ud = novel_sim_all[ind_all]
novel_sim_misinfo_ud = novel_sim_misinfo[ind_misinfo]
novel_sim_nonmisinfo_ud = novel_sim_nonmisinfo[ind_nonmisinfo]

pd_all_ud.shape, pd_misinfo_ud.shape, pd_nonmisinfo_ud.shape

((13041,), (666,), (7750,))

In [100]:
# Extract cosine similarities of (misinfo, nonmisinfo) pairs
# This needs to be done separately as it won't be a square matrix

pd_both = proportion_diffs[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]
w2v_sim_both = word2vec_channel_sim[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]
nn_sim_both = nn_channel_sim[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]
novel_sim_both = novel_channel_sim[filtered_misinfo_indices, :][:, filtered_nonmisinfo_indices]

# Naming convention: _ud to be similar to other variables
# No actual upper triangular calculation here
pd_both_ud = pd_both.flatten()
w2v_sim_both_ud = w2v_sim_both.flatten()
nn_sim_both_ud = nn_sim_both.flatten()
novel_sim_both_ud = novel_sim_both.flatten()

pd_both_ud.shape, w2v_sim_both_ud.shape, nn_sim_both_ud.shape, novel_sim_both_ud.shape

((4625,), (4625,), (4625,), (4625,))

In [101]:
def result_analysis_by_range(proportion_diffs_ud, sim_ud):

    ranges = [(0.0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.01)]
    for begin, end in ranges:

        ind = np.argwhere((proportion_diffs_ud >= begin) & (proportion_diffs_ud < end))
        
        if ind.shape[0] > 0:
            #proportion_diffs_i = proportion_diffs_ud[ind]
            sim_i = sim_ud[ind]

            print("For range", begin, "to", end, " - ", sim_i.shape[0], "pairs")
            #print("25th percentile =", np.percentile(sim_i, 25))
            #print("50th percentile =", np.percentile(sim_i, 50))
            #print("75th percentile =", np.percentile(sim_i, 75))
            print("{0},{1},{2}".format(np.percentile(sim_i, 25), np.percentile(sim_i, 50), np.percentile(sim_i, 75)))
        else:
            print("No pairs for range", begin, "to", end)

#### Results: any label to any label

In [102]:
result_analysis_by_range(pd_all_ud, w2v_sim_all_ud)

For range 0.0 to 0.2  -  4591 pairs
0.9349738955497742,0.9611316323280334,0.9763515889644623
For range 0.2 to 0.4  -  3367 pairs
0.9307077527046204,0.9571183323860168,0.973841518163681
For range 0.4 to 0.6  -  1536 pairs
0.9287628084421158,0.9563179314136505,0.9732555747032166
For range 0.6 to 0.8  -  1472 pairs
0.9185249358415604,0.9571459591388702,0.973799392580986
For range 0.8 to 1.01  -  1284 pairs
0.914280965924263,0.9543503224849701,0.9731230735778809


In [103]:
result_analysis_by_range(pd_all_ud, nn_sim_all_ud)

For range 0.0 to 0.2  -  4591 pairs
0.9878326952457428,0.998088538646698,0.9996699690818787
For range 0.2 to 0.4  -  3367 pairs
0.9718308746814728,0.9965531826019287,0.9993865191936493
For range 0.4 to 0.6  -  1536 pairs
0.7102301567792892,0.9849448800086975,0.9980695396661758
For range 0.6 to 0.8  -  1472 pairs
0.8472627848386765,0.9765549600124359,0.9961894154548645
For range 0.8 to 1.01  -  1284 pairs
0.35030514746904373,0.9182875454425812,0.9919017404317856


In [104]:
result_analysis_by_range(pd_all_ud, novel_sim_all_ud)

For range 0.0 to 0.2  -  4591 pairs
0.2453436627984047,0.3427238166332245,0.4346577376127243
For range 0.2 to 0.4  -  3367 pairs
0.21647685766220093,0.31172576546669006,0.4109567552804947
For range 0.4 to 0.6  -  1536 pairs
0.24291158095002174,0.32847101986408234,0.4256638437509537
For range 0.6 to 0.8  -  1472 pairs
0.1954108327627182,0.2875102013349533,0.3843056857585907
For range 0.8 to 1.01  -  1284 pairs
0.20787980034947395,0.3040456622838974,0.4053148850798607


#### Results: misinfo to misinfo

In [105]:
result_analysis_by_range(pd_misinfo_ud, w2v_sim_misinfo_ud)

For range 0.0 to 0.2  -  147 pairs
0.952852189540863,0.9694886803627014,0.9803795516490936
For range 0.2 to 0.4  -  155 pairs
0.9491956532001495,0.9687919020652771,0.9776621162891388
For range 0.4 to 0.6  -  122 pairs
0.9475528746843338,0.9666178226470947,0.9770339131355286
For range 0.6 to 0.8  -  107 pairs
0.9530268609523773,0.97138911485672,0.9801283180713654
For range 0.8 to 1.01  -  79 pairs
0.9538569748401642,0.9685057401657104,0.9792028069496155


In [106]:
result_analysis_by_range(pd_misinfo_ud, nn_sim_misinfo_ud)

For range 0.0 to 0.2  -  147 pairs
0.6253405511379242,0.9908631443977356,0.9980990290641785
For range 0.2 to 0.4  -  155 pairs
0.4010315388441086,0.992784857749939,0.998765230178833
For range 0.4 to 0.6  -  122 pairs
0.16523493826389313,0.8675801455974579,0.9913467019796371
For range 0.6 to 0.8  -  107 pairs
0.04899721406400204,0.2615395486354828,0.9801926910877228
For range 0.8 to 1.01  -  79 pairs
0.06611417979001999,0.23036248981952667,0.9115776717662811


In [107]:
result_analysis_by_range(pd_misinfo_ud, novel_sim_misinfo_ud)

For range 0.0 to 0.2  -  147 pairs
0.270979642868042,0.3796599805355072,0.5330677628517151
For range 0.2 to 0.4  -  155 pairs
0.2979286462068558,0.3736207187175751,0.45076586306095123
For range 0.4 to 0.6  -  122 pairs
0.2648647204041481,0.37134717404842377,0.44685080647468567
For range 0.6 to 0.8  -  107 pairs
0.29260146617889404,0.36472806334495544,0.46974870562553406
For range 0.8 to 1.01  -  79 pairs
0.3119906783103943,0.3955981731414795,0.48944850265979767


#### Results: nonmisinfo to nonmisinfo

In [108]:
result_analysis_by_range(pd_nonmisinfo_ud, w2v_sim_nonmisinfo_ud)

For range 0.0 to 0.2  -  3586 pairs
0.9310038834810257,0.958113044500351,0.974911093711853
For range 0.2 to 0.4  -  2244 pairs
0.9262921959161758,0.9543962776660919,0.9727998226881027
For range 0.4 to 0.6  -  833 pairs
0.9156664609909058,0.9499412775039673,0.9699505567550659
For range 0.6 to 0.8  -  616 pairs
0.8791230022907257,0.9391011297702789,0.9709326922893524
For range 0.8 to 1.01  -  471 pairs
0.8901519477367401,0.9309831857681274,0.9677489399909973


In [109]:
result_analysis_by_range(pd_nonmisinfo_ud, nn_sim_nonmisinfo_ud)

For range 0.0 to 0.2  -  3586 pairs
0.9911077618598938,0.9983079731464386,0.9997662007808685
For range 0.2 to 0.4  -  2244 pairs
0.9747819006443024,0.9971319437026978,0.9995608478784561
For range 0.4 to 0.6  -  833 pairs
0.9731502532958984,0.9849562644958496,0.9980689883232117
For range 0.6 to 0.8  -  616 pairs
0.9326130598783493,0.984435111284256,0.9973897933959961
For range 0.8 to 1.01  -  471 pairs
0.8678244948387146,0.9774820804595947,0.9961245954036713


In [110]:
result_analysis_by_range(pd_nonmisinfo_ud, novel_sim_nonmisinfo_ud)

For range 0.0 to 0.2  -  3586 pairs
0.2264680191874504,0.3270697593688965,0.4174145683646202
For range 0.2 to 0.4  -  2244 pairs
0.20777948945760727,0.2977876663208008,0.3978920057415962
For range 0.4 to 0.6  -  833 pairs
0.21923674643039703,0.3127687871456146,0.41201019287109375
For range 0.6 to 0.8  -  616 pairs
0.16034485399723053,0.26836447417736053,0.37859004735946655
For range 0.8 to 1.01  -  471 pairs
0.15319616347551346,0.27509382367134094,0.3735475242137909


#### Results: misinfo to nonmisinfo

In [111]:
result_analysis_by_range(pd_both_ud, w2v_sim_both_ud)

For range 0.0 to 0.2  -  858 pairs
0.9487195760011673,0.9682484269142151,0.9805682450532913
For range 0.2 to 0.4  -  968 pairs
0.9373989254236221,0.9595908224582672,0.9747491180896759
For range 0.4 to 0.6  -  581 pairs
0.9377700090408325,0.9625737071037292,0.9767392873764038
For range 0.6 to 0.8  -  749 pairs
0.9349206686019897,0.9612652063369751,0.9745776653289795
For range 0.8 to 1.01  -  734 pairs
0.9342801719903946,0.9597935974597931,0.9746307581663132


In [112]:
result_analysis_by_range(pd_both_ud, nn_sim_both_ud)

For range 0.0 to 0.2  -  858 pairs
0.9870851635932922,0.9972964525222778,0.999194785952568
For range 0.2 to 0.4  -  968 pairs
0.9642482995986938,0.9960271716117859,0.9990866184234619
For range 0.4 to 0.6  -  581 pairs
0.5341629981994629,0.9900873899459839,0.9986177086830139
For range 0.6 to 0.8  -  749 pairs
0.44558992981910706,0.9713408350944519,0.9961678981781006
For range 0.8 to 1.01  -  734 pairs
0.04710390232503414,0.8822254240512848,0.9894763678312302


In [114]:
result_analysis_by_range(pd_both_ud, novel_sim_both_ud)

For range 0.0 to 0.2  -  858 pairs
0.31990014016628265,0.39864738285541534,0.48806386440992355
For range 0.2 to 0.4  -  968 pairs
0.23172388970851898,0.32924240827560425,0.4329392611980438
For range 0.4 to 0.6  -  581 pairs
0.2579505741596222,0.34843820333480835,0.4331725239753723
For range 0.6 to 0.8  -  749 pairs
0.21119889616966248,0.2875664532184601,0.37809696793556213
For range 0.8 to 1.01  -  734 pairs
0.22195350378751755,0.30931080877780914,0.4137972667813301
