# Text classification of clickbait headlines
## Word embeddings: word2vec

Word embeddings are representations of each word's meaning, which are derived by examining the contexts in which a word is used across a large text corpus. The meanings are represented as n-dimensional vectors, which in this case will be derived from the hidden layer of a word2vec model. These embeddings can be compared to each other in an n-dimensional space, with words that have similar meanings in the training corpus ending up close together, while those with dissimilar meanings end up far apart.

## Load in dependencies and data

In [1]:
import pandas as pd
import numpy as np

from sklearn.manifold import TSNE
import plotly.express as px
from support_functions import train_text_classification_model, generate_predictions

In [2]:
# The files are tab-separated (despite the .csv extension) and their first row holds the column names.
clickbait_train = pd.read_csv("data/clickbait_train.csv", sep="\t", header=0)
clickbait_val = pd.read_csv("data/clickbait_val.csv", sep="\t", header=0)

## Prepare data for word2vec training

In order to get the data ready for word2vec training, we need to do a small amount of preparation.

Firstly, we do some light string cleaning, including converting all characters to lowercase, removing all numbers and punctuation, and removing additional whitespace. This is because word2vec models, like bag-of-words models, are based on word tokens, so we want to normalise the text as much as possible before creating the embeddings.

In [3]:
def apply_light_string_cleaning(dataset: pd.Series) -> pd.Series:
    """Normalise raw text so that it can be split into plain word tokens.

    Each string is lowercased, every character that is not an ASCII letter
    (digits, punctuation, accented characters, whitespace other than spaces)
    is replaced by a space, runs of whitespace are collapsed to one space and
    leading/trailing whitespace is removed.

    Args:
        dataset: Series of strings to clean.

    Returns:
        A Series of the same length containing the cleaned strings.
    """
    return (
        dataset
        .str.lower()
        # Raw strings keep the regex backslashes from being read as (invalid)
        # Python string escapes.
        .str.replace(r"[^a-zA-Z]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

In [4]:
# Add a normalised "text_clean" column next to the raw "text" column in both splits.
clickbait_train["text_clean"] = apply_light_string_cleaning(clickbait_train["text"])
clickbait_val["text_clean"] = apply_light_string_cleaning(clickbait_val["text"])

Finally, we split each sentence into a list of words.

In [6]:
# Convert sentences into list of lists (one token list per headline) for training.
# The cleaned text only contains single spaces between words, so the default
# whitespace split is enough and avoids the invalid "\s" string escape.
clickbait_w2v_training = clickbait_train["text_clean"].str.split().to_list()

# Remove nans (missing headlines are not returned as lists) and headlines
# that have no tokens left after cleaning.
clickbait_w2v_training = [s for s in clickbait_w2v_training if isinstance(s, list) and s]

In [23]:
# Inspect the token list produced for the first training headline.
clickbait_w2v_training[0]

['new',
 'insulin',
 'resistance',
 'discovery',
 'may',
 'help',
 'diabetes',
 'sufferers']

## Train w2v model to get word embeddings

In [7]:
from gensim.models import Word2Vec

# Train 100-dimensional word embeddings on the tokenised training headlines.
# NOTE(review): per the gensim documentation, sg=1 should select the skip-gram
# architecture and min_count=2 should drop words seen fewer than twice —
# confirm against the installed gensim version.
w2v_model = Word2Vec(sentences=clickbait_w2v_training,
                     vector_size=100,
                     window=5,
                     min_count=2,
                     workers=4,
                     sg = 1)

In [22]:
# Show the embedding vector learned for the word "best".
print(w2v_model.wv["best"])

[ 0.14907946  0.28824726 -0.21418294 -0.01751759  0.04796686 -0.50966877
  0.15561077  0.6808643  -0.1881679  -0.47073466  0.17010072 -0.21327229
  0.41245914  0.2570131   0.321909   -0.12087936  0.3379991  -0.10301717
 -0.09720543 -0.73928154 -0.07547107  0.10915086  0.3289941  -0.3191192
 -0.2608391   0.02476047 -0.16094607 -0.0202     -0.16117613  0.0449634
  0.13695855 -0.35205188  0.07177465 -0.11207826 -0.20879701  0.6474015
  0.20773141  0.19352014 -0.52104014 -0.44643104 -0.12138324  0.03723307
  0.09146038  0.20123108  0.27091563 -0.40002406 -0.44672093 -0.28696516
  0.02032497  0.00097848  0.34457195 -0.06766028 -0.00278115 -0.2781989
  0.10550953 -0.07829891  0.31129166 -0.10078651 -0.15524553 -0.14389506
 -0.04391776 -0.20158549  0.42454836  0.2731408  -0.28869006  0.37500978
 -0.16989341  0.29143006 -0.7356896   0.22930235 -0.08687814  0.18691404
  0.4316549   0.10328962  0.49351418  0.18695037  0.12108275  0.3096438
 -0.3770024  -0.06272292 -0.2633025  -0.09259783 -0.5530

In [8]:
# List the vocabulary terms the model scores as most similar to "best".
w2v_model.wv.most_similar("best")

[('worst', 0.9616432785987854),
 ('greatest', 0.9568080902099609),
 ('funniest', 0.9464026689529419),
 ('cast', 0.9411523938179016),
 ('costume', 0.9407869577407837),
 ('friend', 0.9345501065254211),
 ('advice', 0.9340210556983948),
 ('cutest', 0.9334638714790344),
 ('absolute', 0.9320166707038879),
 ('horoscope', 0.928727924823761)]

## Extract vectors and average them across the documents

In [9]:
def extract_document_vectors(model: Word2Vec, text: str, len_vectors: int):
    """Look up the word vector of every in-vocabulary token of one document.

    Args:
        model: Trained model; only ``model.wv`` is used (membership test on
            ``key_to_index`` and lookup by word).
        text: Document text, split on whitespace. A non-string value (for
            example a NaN from a missing headline) is treated as having no
            tokens.
        len_vectors: Expected length of each word vector.

    Returns:
        A float array with one row per matched token, in document order. Its
        shape is ``(0, len_vectors)`` when no token is in the vocabulary.

    Raises:
        ValueError: If the model's vectors do not have length ``len_vectors``.
    """
    if not isinstance(text, str):
        return np.empty((0, len_vectors), float)

    vocabulary = model.wv.key_to_index
    # Collect in a list and convert once; growing an array with np.append
    # copies the whole array for every token.
    matched = [model.wv[word] for word in text.split() if word in vocabulary]
    if not matched:
        return np.empty((0, len_vectors), float)

    vectors = np.asarray(matched, dtype=float)
    if vectors.shape[1] != len_vectors:
        raise ValueError(
            f"word vectors have length {vectors.shape[1]}, expected {len_vectors}"
        )
    return vectors


def calculate_w2v_dataset(model: Word2Vec, dataset: pd.DataFrame, len_vectors: int):
    """Represent each document as the mean of its in-vocabulary word vectors.

    Documents with no in-vocabulary word (or with missing text) are skipped,
    so the result can have fewer rows than ``dataset``; the returned labels
    are filtered the same way and stay aligned with the returned vectors.

    Args:
        model: Trained word2vec model passed on to ``extract_document_vectors``.
        dataset: DataFrame with a "text_clean" column and a "label" column.
        len_vectors: Length of the model's word vectors.

    Returns:
        A tuple ``(document_vectors, labels)``: a float array of shape
        ``(n_kept_documents, len_vectors)`` and an array of the matching labels.
    """
    mean_vectors = []
    matched_labels = []
    for text, label in zip(dataset["text_clean"], dataset["label"]):
        # Missing headlines arrive as NaN floats, which cannot be tokenised.
        if not isinstance(text, str):
            continue
        v = extract_document_vectors(model, text, len_vectors)
        if v.shape[0] > 0:
            mean_vectors.append(v.mean(axis=0))
            matched_labels.append(label)

    # Stack once at the end; np.append inside the loop would copy the whole
    # matrix for every document.
    if mean_vectors:
        document_vectors = np.vstack(mean_vectors)
    else:
        document_vectors = np.empty((0, len_vectors), float)
    return document_vectors, np.array(matched_labels)

In [10]:
# Build averaged document vectors for both splits; 100 matches the vector_size used to train w2v_model.
document_vectors_train, final_labels_train = calculate_w2v_dataset(w2v_model, clickbait_train, 100)
document_vectors_val, final_labels_val = calculate_w2v_dataset(w2v_model, clickbait_val, 100)

In [25]:
# Show the averaged vector of the first training headline that was kept.
print(document_vectors_train[0])

[ 0.00507699  0.2007062   0.01186012  0.20193866  0.00412909 -0.22657562
  0.11774148  0.3044009  -0.12022151 -0.12942862 -0.00888197 -0.30975881
 -0.06769178  0.08562083 -0.04232452 -0.16963971 -0.00952893 -0.17474628
 -0.01128205 -0.34019006  0.11256461  0.13747228  0.14553923 -0.11516011
 -0.1090769   0.00089034 -0.11464819 -0.02761603 -0.21950178 -0.02587586
  0.10625197 -0.02292068  0.09964792 -0.16341038 -0.0607111   0.18224826
  0.11604677  0.04209852 -0.11569612 -0.21428558  0.01259418 -0.23099467
 -0.16032995  0.08548508  0.06918482 -0.06889528 -0.22557475 -0.02799946
  0.1021311   0.2564885   0.014845   -0.06245856  0.09569031 -0.04462699
 -0.0408895  -0.06987935  0.12536642 -0.10255382 -0.25519856  0.01822063
 -0.01372942  0.0101575   0.0038287  -0.03953905 -0.1699226   0.21139023
  0.05537213  0.23632612 -0.16418181  0.11364143 -0.02436646  0.15505715
  0.21826863 -0.19872557  0.15993538  0.12863836  0.10840615  0.05997875
 -0.1317348   0.00940794 -0.22510099 -0.12142222 -0

## Visualise groupings of headlines

In [47]:
# Create TSNE chart to project 100 dimensional vectors onto 2 dimensional space
# NOTE(review): init='random' is used and no random_state is passed, so the
# layout presumably differs between runs — confirm if reproducibility matters.
document_vectors_val_tsne = TSNE(n_components=2,
                                 learning_rate='auto',
                                 init='random',
                                 perplexity=3).fit_transform(document_vectors_val)

In [48]:
# Combine the 2-D t-SNE coordinates with each headline's label and raw text for plotting.
# NOTE(review): .assign(text=...) presumably aligns on the index, and
# calculate_w2v_dataset skips headlines without any in-vocabulary word, so if a
# validation row was skipped the text would no longer line up with the
# coordinates — confirm that no validation rows are dropped.
document_vectors_plotting = (
    pd.DataFrame(document_vectors_val_tsne, columns=["dimension_1", "dimension_2"])
    .assign(labels = final_labels_val)
    .assign(text = clickbait_val["text"])
)

In [66]:
# Scatter the validation headlines in the projected space, coloured by label.
fig = px.scatter(
    document_vectors_plotting,
    x = "dimension_1",
    y = "dimension_2",
    color = "labels",
    title = "Vector space of documents in validation set",
    custom_data=["labels", "text"]
)
# Replace the default hover box with the label and headline carried in custom_data.
fig.update_traces(
    hovertemplate = "<br>".join([
        "Category: %{customdata[0]}",
        "Headline: %{customdata[1]}"
    ])
)
fig.show()

## Train clickbait classifier

In [11]:
 # Train the classifier on the averaged document vectors, validating on the validation split.
 # NOTE(review): the three positional numbers are presumably the input vector
 # length (100), the number of epochs (the log below shows 20) and the batch
 # size (32) — confirm against support_functions.train_text_classification_model.
 # NOTE(review): this cell starts with a stray leading space, which would be an
 # IndentationError if the code were run as a plain .py file.
 w2v_classification_model = train_text_classification_model(
    document_vectors_train,
    final_labels_train,
    document_vectors_val,
    final_labels_val,
    100,
    20,
    32
)

2022-08-11 14:26:08.936434: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [50]:
# Store the model's validation predictions next to the original headlines.
# The table printed below the cell appears to be a confusion matrix emitted by generate_predictions.
# NOTE(review): the predictions have one entry per row of document_vectors_val,
# which can be shorter than clickbait_val if calculate_w2v_dataset skipped any
# headline — confirm the lengths match before relying on this column.
clickbait_val["w2v_baseline_pred"] = generate_predictions(w2v_classification_model, document_vectors_val,
                                                          final_labels_val)

col_0   0.0   1.0
row_0            
0      3022   182
1       123  3073


In [51]:
# First five clickbait headlines (label 1) that the model predicted as non-clickbait.
clickbait_val.loc[(clickbait_val["label"] == 1) & (clickbait_val["w2v_baseline_pred"] == 0), "text"][:5]

6     Phoebe Buffay Is Supposed To Die On October 15...
49    This Body Cam Footage Shows A Vehicle Plow Int...
52    Ariana Grande Flawlessly Shut Down Sexist Comm...
78    Robert Pattinson Has Grown A Humongously Bushy...
83    Photographer Gregory Crewdson Releases Hauntin...
Name: text, dtype: object

In [52]:
# First five non-clickbait headlines (label 0) that the model predicted as clickbait.
clickbait_val.loc[(clickbait_val["label"] == 0) & (clickbait_val["w2v_baseline_pred"] == 1), "text"][:5]

4                               Where Is Oil Going Next?
46     With High-Speed Camera, Glimpsing Worlds Too F...
69             A World of Lingo (Out of This World, Too)
112         Advertisers Change Game Plans for Super Bowl
184              Posted deadlines for Christmas delivery
Name: text, dtype: object

## Visualise terms in correctly and incorrectly classified headlines

In [53]:
# Build a DataFrame of word vectors whose rows follow the model's vocabulary index order,
# keeping the matching terms in `ordered_terms`.
ordered_vocab = sorted(w2v_model.wv.key_to_index.items(), key=lambda entry: entry[1])
ordered_terms, term_indices = zip(*ordered_vocab)
word_vectors = pd.DataFrame(w2v_model.wv.vectors[term_indices, :])

In [60]:
# Preview the vectors of the first five vocabulary terms.
word_vectors[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.092019,0.549868,0.272347,0.388173,-0.110919,-0.587427,0.176126,0.224797,-0.398198,-0.358398,...,-0.024581,-0.138341,0.169414,-0.257062,0.451853,0.224808,-0.365707,-0.130746,-0.196883,0.033093
1,-0.256559,0.1901,0.185909,0.400883,-0.080782,-0.182513,0.040689,0.450951,-0.098182,-0.203557,...,0.256703,0.084139,0.228044,-0.226592,0.132801,0.101782,0.000948,-0.01839,-0.126378,0.116004
2,0.082965,0.218201,-0.490405,-0.024517,0.050831,-0.125781,0.048825,0.494006,-0.3401,-0.295687,...,-0.086211,0.227876,0.339261,-0.112485,0.748536,0.070178,0.070375,0.005012,0.166978,-0.187985
3,-0.119819,0.708272,0.345631,-0.214874,0.076039,-0.566856,0.319319,0.619439,0.082704,-0.343069,...,-0.272353,0.059439,0.397259,-0.451967,0.805482,0.374491,0.42596,-0.415415,0.163022,-0.410705
4,0.27048,0.13142,-0.090329,0.229737,0.11476,0.040019,0.186319,0.360846,-0.26848,-0.071422,...,0.314901,0.165026,0.275682,-0.024953,0.276057,0.0383,0.171796,0.253916,0.023395,-0.047437


In [59]:
# Project the vectors onto 2 dimensions for visualisation
# Rows of the result follow the row order of word_vectors, i.e. the order of ordered_terms.
baseline_model_tsne = TSNE(n_components=2, learning_rate='auto',
                           init='random', perplexity=3).fit_transform(word_vectors)

In [55]:
def create_classification_word_plot_sample(category: str):
    """Build the word-level plotting sample for one classification outcome.

    Reads the notebook-level ``clickbait_val`` (with its "label",
    "w2v_baseline_pred" and "text_clean" columns), ``baseline_model_tsne`` and
    ``ordered_terms``.

    Args:
        category: "correctly_classified" selects the headlines whose label
            equals the prediction; any other value selects the headlines
            whose label differs from the prediction.

    Returns:
        A DataFrame with the columns "dimension_1", "dimension_2", "terms" and
        "terms_group", holding one row per vocabulary term that occurs in the
        selected headlines. "terms_group" is "both", "clickbait" or
        "non-clickbait" depending on which kind of headline uses the term.
    """
    # Select all headlines that are either correctly or incorrectly classified by model.
    if category == "correctly_classified":
        full_sample = clickbait_val[clickbait_val["label"] == clickbait_val["w2v_baseline_pred"]]
    else:
        full_sample = clickbait_val[clickbait_val["label"] != clickbait_val["w2v_baseline_pred"]]

    def _words_with_label(label_value):
        # Distinct tokens used in the selected headlines that carry this label.
        # dropna() discards the NaN entries that explode() yields for missing
        # or empty headlines.
        tokens = (
            full_sample.loc[full_sample["label"] == label_value, "text_clean"]
            .str.split()
            .explode()
        )
        return set(tokens.dropna())

    # Get sets of words used in clickbait headlines, non-clickbait headlines or both
    clickbait_words = _words_with_label(1)
    non_clickbait_words = _words_with_label(0)
    both_words = clickbait_words & non_clickbait_words

    # Create DataFrame containing the word vectors used in either correctly or incorrectly classified headlines.
    # Filtering with isin() avoids interpolating the whole word list into a
    # query string, and copy() makes the result safe to add a column to.
    all_terms = (
        pd.DataFrame(baseline_model_tsne, columns=["dimension_1", "dimension_2"])
        .assign(terms=ordered_terms)
    )
    used = all_terms["terms"].isin(clickbait_words | non_clickbait_words)
    plotting_sample = all_terms.loc[used].copy()

    # Assign label as to whether the word is used only in clickbait or non-clickbait headlines, or both.
    # Conditions are checked in order, so "both" wins over "clickbait".
    plotting_sample["terms_group"] = np.select(
        [
            plotting_sample["terms"].isin(both_words),
            plotting_sample["terms"].isin(clickbait_words),
        ],
        ["both", "clickbait"],
        default="non-clickbait",
    )

    return plotting_sample

In [56]:
# Build one word-level sample per outcome; any value other than "correctly_classified" selects the misclassified headlines.
correct_plotting_sample = create_classification_word_plot_sample("correctly_classified")
incorrect_plotting_sample = create_classification_word_plot_sample("incorrectly_classified")

In [57]:
# Scatter the words found in correctly classified headlines, coloured by the kind of headline that uses them.
fig = px.scatter(
    correct_plotting_sample,
    x = "dimension_1",
    y = "dimension_2",
    color = "terms_group",
    title = "Vector space of words from correctly classified headlines",
    custom_data=["terms", "terms_group"]
)
# Replace the default hover box with the word and its group carried in custom_data.
fig.update_traces(
    hovertemplate = "<br>".join([
        "Word: %{customdata[0]}",
        "Terms group: %{customdata[1]}"
    ])
)
fig.show()

In [65]:
# Scatter the words found in incorrectly classified headlines, coloured by the kind of headline that uses them.
fig = px.scatter(
    incorrect_plotting_sample,
    x = "dimension_1",
    y = "dimension_2",
    color = "terms_group",
    title = "Vector space of words from incorrectly classified headlines",
    custom_data=["terms", "terms_group"]
)
# Replace the default hover box with the word and its group carried in custom_data.
fig.update_traces(
    hovertemplate = "<br>".join([
        "Word: %{customdata[0]}",
        "Terms group: %{customdata[1]}"
    ])
)
fig.show()