In [24]:
import pickle
import numpy as np
import csv
import pandas as pd
import re
import emoji
from gensim.parsing.preprocessing import *

In [25]:
def tokenize(string):

    """ Tokenizes a string.

    Adds a space between numbers and letters, removes punctuation, repeated whitespaces, words shorter than 2
    characters, and stop-words. Returns a list of stems and, eventually, emojis.

    @param string: String to tokenize.
    @return: A list of stems and emojis.
    """

    # Based on the Ranks NL (Google) stopwords list, but "how" and "will" are not stripped, and words shorter than 2
    # characters are not checked (since they are stripped):
    stop_words = [
        "about", "an", "are", "as", "at", "be", "by", "com", "for", "from", "in", "is", "it", "of", "on", "or", "that",
        "the", "this", "to", "was", "what", "when", "where", "who", "with", "the", "www"
    ]

    string = strip_short(
        strip_multiple_whitespaces(
            strip_punctuation(
                split_alphanum(string))),
        minsize=2)
    # Parse emojis:
    emojis = [ c for c in string if c in emoji.UNICODE_EMOJI ]
    # Remove every non-word character and stem each word:
    string = stem_text(re.sub(r"[^\w\s,]", "", string))
    # List of stems and emojis:
    tokens = string.split() + emojis
    
    for stop_word in stop_words:
        try:
            tokens.remove(stop_word)
        except:
            pass

    return tokens

In [3]:
# read in our data
clickbait = pd.read_csv('clickbaits.csv',names = ['vid','title','img'],encoding = "ISO-8859-1")
clickbait['label'] = 1
clickbait["video_title_tokenized"] = clickbait["title"].apply(tokenize)
nonclickbait = pd.read_csv('non_clickbait.csv',names = ['vid','title','img'],encoding = "ISO-8859-1")
nonclickbait['label'] = 0
nonclickbait["video_title_tokenized"] = nonclickbait["title"].apply(tokenize)


# concatenate two dfs together
newdf = pd.concat([ clickbait, nonclickbait ]).sample(frac=1).sample(frac=1)

# add the other columns and set as null
newdf["video_views"] = np.nan
newdf["video_likes"] = np.nan
newdf["video_dislikes"] = np.nan
newdf["video_comments"] = np.nan

In [4]:
# Compute the log of the video metadata or replace the missing values with the mean values obtained
# from the train set:
mean_log_video_views = pickle.load(open("mean-log-video-views", "rb"))
mean_log_video_likes = pickle.load(open("mean-log-video-likes", "rb"))
mean_log_video_dislikes = pickle.load(open("mean-log-video-dislikes", "rb"))
mean_log_video_comments = pickle.load(open("mean-log-video-comments", "rb"))
if newdf["video_views"].isnull().any():
    newdf["video_views"].fillna(mean_log_video_views, inplace=True)
if newdf["video_likes"].isnull().any():
    newdf["video_likes"].fillna(mean_log_video_likes, inplace=True)
if newdf["video_dislikes"].isnull().any():
    newdf["video_dislikes"].fillna(mean_log_video_dislikes, inplace=True)
if newdf["video_comments"].isnull().any():
    newdf["video_comments"].fillna(mean_log_video_comments, inplace=True)

In [7]:
def average_embedding(tokens, word2vec, na_vector=None):

    """ Embeds a title with the average representation of its tokens.

    Returns the mean vector representation of the tokens representations. When no token is in the Word2Vec model, it
    can be provided a vector to use instead (for example the mean vector representation of the train set titles).

    @param tokens: List of tokens to embed.
    @param word2vec: Word2Vec model.
    @param na_vector: Vector representation to use when no token is in the Word2Vec model.
    @return: A vector representation for the token list.
    """

    vectors = list()

    for token in tokens:
        if token in word2vec:
            vectors.append(word2vec[token])

    if len(vectors) == 0 and na_vector is not None:
        vectors.append(na_vector)

    return np.mean(np.array(vectors), axis=0)

In [10]:
word2vec = pickle.load(open("word2vec", "rb"))
mean_title_embedding = pickle.load(open("mean-title-embedding", "rb"))

In [14]:
# For the test set use the mean title embedding computed on the train set:
titles_embeddings = newdf["video_title_tokenized"].apply(average_embedding, word2vec=word2vec, na_vector=mean_title_embedding)
new_test_set = pd.concat(
    [
        newdf[["video_views", "video_likes", "video_dislikes", "video_comments"]],
        titles_embeddings.apply(pd.Series)
    ], axis=1)
new_test_set["label"] = newdf['label']
new_test_set[["video_views", "video_likes", "video_dislikes", "video_comments"]] = new_test_set[["video_views", "video_likes", "video_dislikes", "video_comments"]].apply(np.log)




In [15]:
min_max_scaler = pickle.load(open("min-max-scaler", "rb"))

new_test_set = new_test_set.replace(-np.inf, 0)
new_test_labels = new_test_set["label"]
new_test_set = new_test_set.drop(columns=["label"])
new_test_set = pd.DataFrame(min_max_scaler.transform(new_test_set), columns=new_test_set.columns)
# Import the SVM model:
svm = pickle.load(open("svm", "rb"))

In [17]:
print(new_test_set.shape)

(1699, 29)


In [18]:
predictions = svm.predict(new_test_set)

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

print("Performance on the test set (%d samples):" % len(new_test_set))
print("\tAccuracy Score:", accuracy_score(new_test_labels, predictions))
print("\tArea under ROC curve:", roc_auc_score(new_test_labels, predictions))

Performance on the test set (1699 samples):
	Accuracy Score: 0.42377869334902885
	Area under ROC curve: 0.5012143186987976
