In [145]:
# Import libraries
import os
import numpy as np
import gensim
import pandas as pd
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

import sklearn.preprocessing as pp
from scipy.sparse import coo_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

join = os.path.join

[nltk_data] Error loading punkt: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>


### Data loading

In [2]:
print(gensim.__version__)

3.8.3


In [3]:
root_path = "./../data/tmp"

In [4]:
# Load dataset

video1_csv = "baseline1_final.csv"
comment1_csv = "baseline1_comment_final.csv"

df_video1 = pd.read_csv(join(root_path, video1_csv), sep='\t')
df_comment1 = pd.read_csv(join(root_path, comment1_csv), sep='\t')

video2_csv = "baseline2_final.csv"
comment2_csv = "baseline2_comment_final.csv"

df_video2 = pd.read_csv(join(root_path, video2_csv), sep='\t')
df_comment2 = pd.read_csv(join(root_path, comment2_csv), sep='\t')

In [5]:
df_video1.head()

Unnamed: 0,caption,title,label,video_id
0,it happened outside waco texas a heavily armed...,the shadow of waco retro report the new york...,0,hOW9AjskoOo
1,thanks for coming its nice to see a good turno...,former abortionist dr levatino destroys procho...,0,dIRcw45n9RU
2,tonight i donald john trump do solemnly swear...,trumps road to the white house full film fron...,0,SMwXKl0odq8
3,this week on buzzfeed unsolved we discuss the...,the strange disappearance of db cooper,0,oHSehKtDyoI
4,im mason noise im 22 and im from birmingham wh...,shockingly offensive auditions have simon cowe...,0,N9COy7O7K-U


In [6]:
df_comment1.head()

Unnamed: 0,video_id,comment
0,hOW9AjskoOo,im sry for the kids but the rest they became w...
1,hOW9AjskoOo,this testifies to the power of automatic weapo...
2,hOW9AjskoOo,these days they will just drone strike a build...
3,hOW9AjskoOo,935 crazy how this phrase will likely never be...
4,hOW9AjskoOo,no true christian would burn their kids


In [7]:
print(df_video1.shape)
print(df_comment1.shape)
print(df_video2.shape)
print(df_comment2.shape)

(2120, 4)
(177514, 2)
(80, 4)
(111526, 2)


In [8]:
# Load word2vec
word2vec_300 = "GoogleNews-vectors-negative300.bin"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(join(root_path, word2vec_300), binary = True)

In [9]:
word2vec_model.vector_size

300

In [10]:
# Load glove
glove_100d = "glove.6B.100d.txt"
glove100d_embeddings_dict = {}
f = open(os.path.join(root_path, glove_100d),'r', errors = 'ignore', encoding='utf8')
for line in f:
    values = line.split()
    word = ''.join(values[:-100])
    coefs = np.asarray(values[-100:], dtype='float32')
    glove100d_embeddings_dict[word] = coefs
f.close()

In [11]:
print(len(glove100d_embeddings_dict))

400000


In [187]:
df_video1[df_video1["video_id"] == "7Ptv-Z7goZ8"]

Unnamed: 0,caption,title,label,video_id
899,prove it see its back to spot chemtrails just ...,chemtrails exposed in new zealandthe land of t...,1,7Ptv-Z7goZ8


### Captions -> word2vec

In [12]:
captions1_list = df_video1["caption"].to_list()
video1_list = df_video1["video_id"].to_list()
captions2_list = df_video2["caption"].to_list()
video2_list = df_video2["video_id"].to_list()

In [21]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)

In [14]:
# Our earlier preprocessing was done when we were dealing only with word vectors
# Here, we need each document to remain a document 
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word.isalpha()] 
    return doc

In [15]:
# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.vocab for word in doc)

In [16]:
# Filter out documents
def filter_docs(corpus, texts, video_ids, condition_on_doc):
    """
    Filter corpus and texts given the function condition_on_doc which takes a doc. The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    ret_texts, ret_videos = [], []
    if texts is not None:
        for (text, doc, video_id) in zip(texts, corpus, video_ids):
            if condition_on_doc(doc):
                ret_texts.append(text)
                ret_videos.append(video_id)

    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, ret_texts, ret_videos)

In [17]:
def captions_to_word2vec(caption_list, video_list, model):
    # Preprocess the corpus
    corpus = [preprocess(title) for title in caption_list]

    # Remove docs that don't include any words in W2V's vocab
    corpus, titles_list, video_list = filter_docs(corpus, caption_list, video_list, lambda doc: has_vector_representation(model, doc))

    # Filter out any empty docs
    corpus, titles_list, video_list = filter_docs(corpus, caption_list, video_list, lambda doc: (len(doc) != 0))
    x = []
    for doc in corpus: # append the vector for each document
        x.append(document_vector(model, doc))

    X = np.array(x) # list to array
    
    return X, video_list

In [22]:
captions1_X, cap_w2v_videos = captions_to_word2vec(captions1_list, video1_list, word2vec_model)

0 docs removed
0 docs removed


In [23]:
captions1_X.shape

(2120, 300)

In [24]:
print(len(cap_w2v_videos))
cap_w2v_videos[:5]

2120


['hOW9AjskoOo', 'dIRcw45n9RU', 'SMwXKl0odq8', 'oHSehKtDyoI', 'N9COy7O7K-U']

In [25]:
captions2_X, cap2_w2v_videos = captions_to_word2vec(captions2_list, video2_list, word2vec_model)

0 docs removed
0 docs removed


In [26]:
captions2_X.shape

(80, 300)

In [27]:
print(len(cap2_w2v_videos))
cap2_w2v_videos[:5]

80


['0jmhj-vnl5E', '1cWvGnF6_dw', '3DAI3c9wE0Q', '4G1LTpm1pkw', '4tD0w86MTuA']

### Captions -> glove

In [78]:
# Define captions as numpy array (paper 1 + paper 2, 2120 + 80)
captions1_texts_concat = np.empty([2200, 1], dtype='object')

In [79]:
# Get captions as numpy array 
# length of captions from paper 1 is 2120
prev_video_id = ''
cur_i = 0
captions1_texts_concat[0] = ''
for index, row in df_video1.iterrows():
    if row['video_id'] != prev_video_id and prev_video_id: # new video_id
        # next video_id index
        cur_i += 1
        captions1_texts_concat[cur_i] =  row['caption']
    # else, same video
    # concat caption for that video_id  
    else:
        captions1_texts_concat[cur_i] +=  row['caption']
        prev_video_id = row['video_id']
for index, row in df_video2.iterrows():
    if row['video_id'] != prev_video_id and prev_video_id: # new video_id
        # next video_id index
        cur_i += 1
        captions1_texts_concat[cur_i] =  row['caption']
    # else, same video
    # concat caption for that video_id  
    else:
        captions1_texts_concat[cur_i] +=  row['caption']
        prev_video_id = row['video_id']
#normalized_annotation

In [105]:
cv = CountVectorizer()

X_captions1 = np.zeros((2200, 100))
Y_captions1 = np.zeros((2200,))

df_video = pd.concat([df_video1, df_video2], axis=0)
df_video.reset_index(drop=True, inplace=True)

for i, caption in enumerate(captions1_texts_concat):
    #try:
    if caption[0] and len(caption[0]) > 1: # ['o'] -- skip when cv.fit_transform(caption) fails
        cv_fit_caption=cv.fit_transform(caption)
        caption_features = cv.get_feature_names_out()
        caption_feature_weights = cv_fit_caption.toarray()

        total_caption_weights = sum(caption_feature_weights[0])
        sum_embeddings = None
        for j, weight in enumerate(caption_feature_weights[0]):
            try:
                # check if key in embedding dictionary
                if caption_features[j] in glove100d_embeddings_dict:
                    if j == 0: 
                        sum_embeddings = weight*glove100d_embeddings_dict[caption_features[j]]
                    else:
                        sum_embeddings += weight*glove100d_embeddings_dict[caption_features[j]]
            except:
                pass
                #print(caption_features[j])

        # calculate caption embedding
        if sum_embeddings is not None:
            caption_embedding = np.divide(sum_embeddings, total_caption_weights)

            # append to captions dataset
            X_captions1[i] = caption_embedding
            
            # append label for captions dataset
            Y_captions1[i] = 1 if df_video.iloc[i]['label'] == 1 else 0 # 1 is misinfo, 0 is non-misinfo
            
         

        
#     except:
#         print('Error')
#         cv_fit_caption=cv.fit_transform(caption)
#         caption_features = cv.get_feature_names_out()
#         caption_feature_weights = cv_fit_caption.toarray()
        
#         #print(caption_feature_weights)
#         error += 1
#         if error == 1:
#             break
#         pass
    #print(sum_embeddings)
    # if i == 2:
    #     break

In [112]:
zero_ind = np.where(~np.all(X_captions1 == 0, axis=1))

In [110]:
video_1b = []
cand_videos = cap_w2v_videos + cap2_w2v_videos
for i, v in enumerate(cand_videos):
    if i not in zero_ind[0]:
        video_1b += [v]

In [113]:
X_captions1 = X_captions1[zero_ind]
Y_captions1 = Y_captions1[zero_ind]

In [100]:
X_captions1[2199]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [90]:
Y_captions1[2199]

0.0

In [114]:
print(X_captions1.shape)
print(Y_captions1.shape)
print(len(video_1b))

(2122, 100)
(2122,)
2122


### Titles, comments -> tfidf

In [50]:
title1_list = df_video1["title"].to_list()
video1_list = df_video1["video_id"].to_list()
title2_list = df_video2["title"].to_list()
video2_list = df_video2["video_id"].to_list()

title_list = title1_list + title2_list
video_list = video1_list + video2_list

df_comment = pd.concat([df_comment1, df_comment2], axis=0)

In [66]:
def create_corpus(titles, videos, comments):
    """Simply concatenate titles and comments for each video into one long string"""
    corpus = []
    for (title, video_id) in zip(titles, videos):
        comments_v = comments[comments["video_id"] == video_id]
        corpus_elem = [title] + comments_v["comment"].to_list()
        corpus_elem = [str(x) for x in corpus_elem]
        corpus_elem = ' '.join(corpus_elem)
        corpus += [corpus_elem]
    return corpus

In [67]:
corpus = create_corpus(title_list, video_list, df_comment)

In [68]:
# Run tf-idf
vectorizer = TfidfVectorizer()
tc_X = vectorizer.fit_transform(corpus)

In [69]:
tc_X.shape

(2200, 185568)

### Baseline 1 models

#### Word2vec SVC

In [32]:
def print_results(model_name, y_test, y_pred):
    print(model_name, "results:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 score:", f1_score(y_test, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Cross entropy loss:", log_loss(y_test, y_pred))

In [319]:
X = np.vstack([captions1_X, captions2_X])
y = np.hstack([df_video1["label"].to_numpy(), df_video2["label"].to_numpy()])

X = X[1529:, :]
y = y[1529:]

y[y == -1] = 0  # make two class

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(536, 300) (135, 300) (536,) (135,)


In [320]:
# Use SMOTE to oversample minority class for training
print("Before SMOTE:", Counter(y_train))
over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=1.0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_train, y_train = pipeline.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train))

Before SMOTE: Counter({0: 487, 1: 49})
After SMOTE: Counter({0: 243, 1: 243})


In [321]:
# Train SVC model (word2vec)
clf_1a = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf_1a.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [322]:
y_pred = clf_1a.predict(X_test)
print_results("Baseline 1 word2vec SVC", y_test, y_pred)

Baseline 1 word2vec SVC results:
Accuracy: 0.9259259259259259
F1 score: 0.5833333333333334
Confusion matrix:
[[118   6]
 [  4   7]]
Cross entropy loss: 2.55846341876806


In [45]:
def get_videos_by_result(video_list, y_test, y_pred):
    t, fp, fn = [], [], []
    for (video_id, gt, pred) in zip(video_list, y_test, y_pred):
        if gt == pred:
            t += [video_id]
        elif gt == 0 and pred == 1:
            fp += [video_id]
        else:  # gt == 1 and pred == 0
            fn += [video_id]
    return t, fp, fn

In [46]:
videos = cap_w2v_videos + cap2_w2v_videos
y_pred = clf_1a.predict(X)
t_1a, fp_1a, fn_1a = get_videos_by_result(videos, y, y_pred)
print(len(t_1a), len(fp_1a), len(fn_1a))

2080 105 15


#### Glove SVC

In [115]:
X = X_captions1
y = Y_captions1

y[y == -1] = 0  # make two class

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1697, 100) (425, 100) (1697,) (425,)


In [116]:
# Use SMOTE to oversample minority class for training
print("Before SMOTE:", Counter(y_train))
over = SMOTE(sampling_strategy=0.75)
under = RandomUnderSampler(sampling_strategy=1.0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_train, y_train = pipeline.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train))

Before SMOTE: Counter({0.0: 1478, 1.0: 219})
After SMOTE: Counter({0.0: 1108, 1.0: 1108})


In [117]:
# Train SVC model (word2vec)
clf_1b = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf_1b.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [118]:
y_pred = clf_1b.predict(X_test)
print_results("Baseline 1 glove SVC", y_test, y_pred)

Baseline 1 glove SVC results:
Accuracy: 0.88
F1 score: 0.5785123966942148
Confusion matrix:
[[339  32]
 [ 19  35]]
Cross entropy loss: 4.144713372372262


In [119]:
videos = video_1b
y_pred = clf_1b.predict(X)
t_1b, fp_1b, fn_1b = get_videos_by_result(videos, y, y_pred)
print(len(t_1b), len(fp_1b), len(fn_1b))

1960 133 29


### Baseline 2 models

#### tf-idf SVC

In [363]:
X = tc_X
y = np.hstack([df_video1["label"].to_numpy(), df_video2["label"].to_numpy()])

X = X[1529:, :]
y = y[1529:]

y[y == -1] = 0  # make two class

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(536, 185568) (135, 185568) (536,) (135,)


In [364]:
# Use SMOTE to oversample minority class for training
print("Before SMOTE:", Counter(y_train))
over = SMOTE(sampling_strategy=0.75)
under = RandomUnderSampler(sampling_strategy=1.0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_train, y_train = pipeline.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train))

Before SMOTE: Counter({0: 491, 1: 45})
After SMOTE: Counter({0: 368, 1: 368})


In [365]:
# Train SVC model
clf_2a = SVC(gamma='auto', verbose=1)
clf_2a.fit(X_train, y_train)

[LibSVM]*
optimization finished, #iter = 368


SVC(gamma='auto', verbose=1)

obj = -735.950126, rho = -0.000638
nSV = 736, nBSV = 736
Total nSV = 736


In [366]:
y_pred = clf_2a.predict(X_test)
print_results("Baseline 2 tf-idf SVC", y_test, y_pred)

Baseline 2 tf-idf SVC results:
Accuracy: 0.42962962962962964
F1 score: 0.26666666666666666
Confusion matrix:
[[44 76]
 [ 1 14]]
Cross entropy loss: 19.70034482824309


In [77]:
videos = cap_w2v_videos + cap2_w2v_videos
y_pred = clf_2a.predict(X)
t_2a, fp_2a, fn_2a = get_videos_by_result(videos, y, y_pred)
print(len(t_2a), len(fp_2a), len(fn_2a))

1223 901 76


In [None]:
#### tf-idf RandomForest

In [121]:
X = tc_X
y = np.hstack([df_video1["label"].to_numpy(), df_video2["label"].to_numpy()])

y[y == -1] = 0  # make two class

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1760, 185568) (440, 185568) (1760,) (440,)


In [122]:
# Use SMOTE to oversample minority class for training
print("Before SMOTE:", Counter(y_train))
over = SMOTE(sampling_strategy=0.75)
under = RandomUnderSampler(sampling_strategy=1.0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_train, y_train = pipeline.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train))

Before SMOTE: Counter({0: 1533, 1: 227})
After SMOTE: Counter({0: 1149, 1: 1149})


In [132]:
# Train RandomForest model
clf_2b = RandomForestClassifier(n_estimators=800, n_jobs=-1)
clf_2b.fit(X_train, y_train)

RandomForestClassifier(n_estimators=800, n_jobs=-1)

In [133]:
y_pred = clf_2b.predict(X_test)
print_results("Baseline 2 tf-idf RF", y_test, y_pred)

Baseline 2 tf-idf RF results:
Accuracy: 0.8886363636363637
F1 score: 0.36363636363636365
Confusion matrix:
[[377  10]
 [ 39  14]]
Cross entropy loss: 3.846381907556649


In [135]:
videos = cap_w2v_videos + cap2_w2v_videos
y_pred = clf_2b.predict(X)
t_2b, fp_2b, fn_2b = get_videos_by_result(videos, y, y_pred)
print(len(t_2b), len(fp_2b), len(fn_2b))

2144 17 39


### Save all predictions to file

In [137]:
res_dict = {"video_id": [], "svc_tfidf": [], "svc_w2v": [], "svc_glove": [], "rf_tfidf": [], "3_label": [], "2_label":[]}
y = np.hstack([df_video1["label"].to_numpy(), df_video2["label"].to_numpy()])
z = y.copy()
z[z == -1] = 0  # make two class
videos = cap_w2v_videos + cap2_w2v_videos

for (video, label1, label2) in zip(videos, y, z):
    res_dict["video_id"] += [video]
    
    # svc_tfidf (2a)
    if video in t_2a:
        res_dict["svc_tfidf"] += ["T"]
    elif video in fp_2a:
        res_dict["svc_tfidf"] += ["FP"]
    else:
        res_dict["svc_tfidf"] += ["FN"]
    
    # svc_w2v (1a)
    if video in t_1a:
        res_dict["svc_w2v"] += ["T"]
    elif video in fp_1a:
        res_dict["svc_w2v"] += ["FP"]
    else:
        res_dict["svc_w2v"] += ["FN"]
    
    # svc_glove (note that there were some embedding failures) (1b)
    if video in t_1b:
        res_dict["svc_glove"] += ["T"]
    elif video in fp_1b:
        res_dict["svc_glove"] += ["FP"]
    elif video in fn_1b:
        res_dict["svc_glove"] += ["FN"]
    else:
        res_dict["svc_glove"] += ["N/A"]  # embedding failure
    
    # rf_tfidf (2b)
    if video in t_2b:
        res_dict["rf_tfidf"] += ["T"]
    elif video in fp_2b:
        res_dict["rf_tfidf"] += ["FP"]
    else:
        res_dict["rf_tfidf"] += ["FN"]
    
    res_dict["3_label"] += [label1]
    res_dict["2_label"] += [label2]
        
pd.DataFrame(res_dict).to_csv('baseline_results.csv', index=False)

### Calculating cosine similarity of embeddings

In [173]:
df_embedd = pd.read_csv('_embedd.csv', sep='\t')

In [174]:
embedd_videos = set(df_embedd['video_id'].to_list())

In [151]:
def cosine_similarities(mat):
    col_normed_mat = pp.normalize(coo_matrix(mat.T).tocsc(), axis=0)
    return col_normed_mat.T * col_normed_mat

In [178]:
tf_sim = cosine_similarities(tc_X)

In [160]:
word2vec_sim = cosine_similarities(np.vstack([captions1_X, captions2_X]))

In [154]:
glove_sim = cosine_similarities(X_captions1)

In [153]:
word2vec_sim.shape

(2200, 2200)

In [155]:
glove_sim.shape

(2122, 2122)

In [None]:
tf_sim = tf_sim.toarray()

In [161]:
word2vec_sim = word2vec_sim.toarray()

In [162]:
word2vec_sim[:3, :3]

array([[0.99999976, 0.93542814, 0.962264  ],
       [0.93542814, 0.99999976, 0.9643437 ],
       [0.962264  , 0.9643437 , 1.0000001 ]], dtype=float32)

In [167]:
word2vec_sim[word2vec_sim < 0.0] = 0.0
word2vec_sim[word2vec_sim > 1.0] = 1.0

In [168]:
word2vec_sim.max(), word2vec_sim.min()

(1.0, 0.0)

In [164]:
glove_sim = glove_sim.toarray()

In [165]:
glove_sim[:3, :3]

array([[1.        , 0.98440973, 0.98987664],
       [0.98440973, 1.        , 0.9928833 ],
       [0.98987664, 0.9928833 , 1.        ]])

In [169]:
glove_sim[glove_sim < 0.0] = 0.0
glove_sim[glove_sim > 1.0] = 1.0

In [170]:
glove_sim.max(), glove_sim.min()

(1.0, 0.36732266597963353)

In [176]:
# word2vec - compute precision, recall, F1-score

word2vec_dict = {}
videos = cap_w2v_videos + cap2_w2v_videos
labels = np.hstack([df_video1["label"].to_numpy(), df_video2["label"].to_numpy()])

y_test, y_pred = [], []
for i, video1 in enumerate(videos):
    for j, video2 in enumerate(videos):
        if i <= j:
            continue
        if labels[i] == labels[j]:
            y_test += [1]
            y_pred += [1 if word2vec_sim[i][j] >= 0.5 else 0]
        else:
            y_test += [0]
            y_pred += [0 if word2vec_sim[i][j] < 0.5 else 1]
        
# y_true, y_pred
print_results("word2vec match", y_test, y_pred)

word2vec match results:
Accuracy: 0.5300657323576833
F1 score: 0.6917487378154241
Confusion matrix:
[[   6711 1126448]
 [  10276 1275465]]
Cross entropy loss: 16.23132695177747


In [177]:
# glove - compute precision, recall, F1-score

glove_dict = {}
videos = video_1b
labels = Y_captions1

y_test, y_pred = [], []
for i, video1 in enumerate(videos):
    for j, video2 in enumerate(videos):
        if i <= j:
            continue
        if labels[i] == labels[j]:
            y_test += [1]
            y_pred += [1 if glove_sim[i][j] >= 0.5 else 0]
        else:
            y_test += [0]
            y_pred += [0 if glove_sim[i][j] < 0.5 else 1]
        
# y_true, y_pred
print_results("glove match", y_test, y_pred)

glove match results:
Accuracy: 0.7760143726773377
F1 score: 0.8738266675844977
Confusion matrix:
[[    892  503885]
 [    168 1745436]]
Cross entropy loss: 7.736368536410071
