# Testing words
This notebook provides some skeleton code for loading the training data and getting the predictions from the model for the different keywords.

## Warning: I am still training the models; for now, testing can already be done with the agency model

## Imports and setup

This section of the notebook makes sure you have all the libraries installed that are used by the code.
It also makes sure that they are updated to the newest version.

In [1]:
%pip install transformers -Uqq
%pip install nltk -Uqq
%pip install matplotlib -Uqq
%pip install wordcloud -Uqq

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import random
from collections import defaultdict
from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
from nltk import word_tokenize
from nltk.corpus import stopwords
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    TextClassificationPipeline,
)
from wordcloud import WordCloud



## Loading data

In [2]:
# Fixed seed so the shuffle below yields the same order on every run.
random.seed(42)

# JSON text is UTF-8 by specification; naming the encoding keeps loading
# independent of the platform's default locale.
with open("data/data.json", encoding="utf-8") as file:
    dataset = [entry["text"] for entry in json.load(file)["data"]]

random.shuffle(dataset)
dataset[0:2]

['SoftBank-owned Arm to launch new AI chip for small devices',
 "What's your favorite scary movie? AI gets a 'ghoulish assignment' to reimagine classic horror film posters - from movies such as Scream and Child's Play - and the results that are BLOODIER and more terrifying than the originals"]

## Loading model for some keyword

In [3]:
# Classifier labels; one "<label>-vs-rest" model is loaded for each entry.
# Commented-out entries are left out of every later step.
labels = [
    "agency",
    # "suggestiveImagery",
    "comparisonWithHumanIntelligence",
    "comparisonWithHumanSkills",
    "hyperbole",
    "uncriticalHistoryComparison",
    "unjustifiedClaimsAboutFuture",
    "falseClaimsAboutProgress",
    "incorrectClaimsAboutStudyReport",
    "deepSoundingTermsForBanalities",
    "treatingSpokespeopleAsNeutral",
    "repeatingPRTerms",
    "noDiscussionOfLimitations",
    "deEmphasizingLimitations",
    "limitationsAddressedBySkeptics",
    "downplayingHumanLabour",
    "performanceNumbersWithoutCaveats",
    # "inscrutability",
]

# Map every label to the sequence classifier loaded from its own repository.
models = {
    label: BertForSequenceClassification.from_pretrained(
        f"xt0r3/aihype_{label}-vs-rest"
    )
    for label in labels
}

## Adding input processing

In [4]:
# Cased BERT tokenizer; the same instance is handed to every pipeline below.
# NOTE(review): assumes all aihype_* models use the bert-base-cased
# vocabulary — confirm against the model repositories.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [5]:
# One text-classification pipeline per label, all built on the shared tokenizer.
# top_k=None is passed so the pipeline is not limited to its single best class.
pipes = {}
for name in labels:
    pipes[name] = TextClassificationPipeline(
        model=models[name],
        tokenizer=tokenizer,
        top_k=None,
    )

## Define prediction function

In [37]:
def get_result(preds):
    """Decide whether a classifier output counts as a positive prediction.

    Args:
        preds: iterable of dicts, each carrying a "label" and a "score" entry.

    Returns:
        The outcome of ``score >= 0.5`` for the first entry whose label is
        "LABEL_1", or ``False`` when no entry carries that label.
    """
    positive_checks = (
        entry["score"] >= 0.5 for entry in preds if entry["label"] == "LABEL_1"
    )
    return next(positive_checks, False)


def predict(text, label):
    """Report whether the classifier for ``label`` flags ``text`` as positive.

    The text is run through the pipeline stored under ``label`` in ``pipes``;
    the first element of the pipeline output is handed to ``get_result``.
    """
    return get_result(pipes[label](text)[0])

## Playing around with the model

In [7]:
# Peek at the first four headlines of the shuffled dataset.
dataset[0:4]

['SoftBank-owned Arm to launch new AI chip for small devices',
 "What's your favorite scary movie? AI gets a 'ghoulish assignment' to reimagine classic horror film posters - from movies such as Scream and Child's Play - and the results that are BLOODIER and more terrifying than the originals",
 'French tax officials use AI to spot 20,000 undeclared pools',
 'Vic-made robot to fight African poachers']

## Tokenization example 
This section shows an example of tokenization so that you can classify the data more easily.

In [8]:
# NLTK tokenizer: splits the headline into whole, human-readable words.
word_tokenize(dataset[0])

['SoftBank-owned',
 'Arm',
 'to',
 'launch',
 'new',
 'AI',
 'chip',
 'for',
 'small',
 'devices']

In [9]:
# With NLTK, a long word comes back as a single token.
word_tokenize("infinitesimal")

['infinitesimal']

In [10]:
# BERT tokenizer: yields the subword pieces the model pays attention to.
# It does not only find words, but also splits some long words into smaller
# subwords; continuation pieces are prefixed with "##".
tokenizer.tokenize(dataset[0])

['Soft',
 '##B',
 '##an',
 '##k',
 '-',
 'owned',
 'Arm',
 'to',
 'launch',
 'new',
 'AI',
 'chip',
 'for',
 'small',
 'devices']

In [11]:
# The BERT tokenizer breaks the same long word into several subword pieces.
tokenizer.tokenize("infinitesimal")

['infinite', '##si', '##mal']

## Getting the word frequencies

In [38]:
# Nested counters addressed as freq[label][token]; unseen tokens start at 0.
freq_nltk = defaultdict(lambda: defaultdict(int))
freq_bert = defaultdict(lambda: defaultdict(int))

In [39]:
def process_headline(headline, label):
    """Count a headline's tokens under ``label`` if its classifier fires.

    When ``predict(headline, label)`` is true, every NLTK token of the
    headline is counted in ``freq_nltk[label]`` and every BERT subword token
    in ``freq_bert[label]``. Otherwise nothing is recorded.

    The headline is tokenized here, from the argument itself, rather than read
    from module-level token lists, so the function works from any cell and can
    never count the tokens of a different headline.

    Args:
        headline: text to classify and tokenize.
        label: key into ``pipes`` and into the two frequency tables.
    """
    if not predict(headline, label):
        return

    for word in word_tokenize(headline):
        freq_nltk[label][word] += 1
    for word in tokenizer.tokenize(headline):
        freq_bert[label][word] += 1

In [40]:
# Reuse a single 4-thread pool for every headline; the context manager shuts
# the worker threads down once the loop finishes or raises.
with ThreadPool(4) as pool:
    for i, headline in enumerate(dataset):
        # Token lists for the current headline, kept as module-level names.
        # pool.map blocks until all labels are done, so they never change
        # while workers are running.
        tokenize_nltk = word_tokenize(headline)
        tokenize_bert = tokenizer.tokenize(headline)

        # Run the headline through every label's classifier in parallel.
        pool.map(partial(process_headline, headline), labels)

        if i % 20 == 0:
            print(f"Progress: {i}/{len(dataset)}")

Progress: 0/238
Progress: 20/238
Progress: 40/238
Progress: 60/238
Progress: 80/238
Progress: 100/238
Progress: 120/238
Progress: 140/238
Progress: 160/238
Progress: 180/238
Progress: 200/238
Progress: 220/238


### Removing stopwords

In [41]:
# NLTK's English stopwords, extended with the project's own keyword list.
STOPWORDS = stopwords.words("english")

with open('data/keywords') as file:
    lines = [line.strip().lower() for line in file]
    print(lines[0:3])
    STOPWORDS.extend(lines)

# Set copy for constant-time membership tests; STOPWORDS itself stays a list.
stopword_set = set(STOPWORDS)


def _without_stopwords(freq):
    """Return a plain dict of ``freq`` minus tokens whose lowercase form is a stopword."""
    return {
        token: count
        for token, count in freq.items()
        if token.lower() not in stopword_set
    }


# Replace each label's table with its filtered copy.
for label in labels:
    freq_nltk[label] = _without_stopwords(freq_nltk[label])
    freq_bert[label] = _without_stopwords(freq_bert[label])

['ai', 'ml', 'nlp']


## Plot results

In [42]:
# Ask the inline backend to render figures as SVG.
# NOTE(review): IPython.display.set_matplotlib_formats is deprecated (the stray
# output under this cell looks like its DeprecationWarning); the replacement
# lives in matplotlib_inline.backend_inline — switch once that import is added.
set_matplotlib_formats("svg")

  set_matplotlib_formats("svg")


In [43]:
def plot_cloud(axs, wordcloud, title, i):
    """Draw one word cloud into the ``i``-th cell of a 2-D grid of axes.

    Cells are numbered row by row, and the number of columns is taken from the
    grid itself, so the function works for any grid shape, not only 4 columns.

    Args:
        axs: 2-D array of matplotlib axes, indexable as ``axs[row, col]`` and
            exposing ``shape`` (as returned by ``plt.subplots`` for a grid
            with several rows and columns).
        wordcloud: image-like object passed to ``imshow``.
        title: text shown above the cell.
        i: zero-based, row-major index of the target cell.
    """
    row, col = divmod(i, axs.shape[1])
    ax = axs[row, col]
    # Show the image without ticks or frame, with a small title above it.
    ax.imshow(wordcloud)
    ax.axis("off")
    ax.set_title(title, fontdict={"fontsize": 8})

In [None]:
# 4x4 grid with one cell per label. No separate plt.figure() call is made:
# plt.subplots already creates the figure that is drawn on and saved.
fig, axs = plt.subplots(4, 4, figsize=(10, 8))
# savefig.dpi is read when the figure is saved; figure.dpi only applies to
# figures created after this point.
plt.rcParams["figure.dpi"] = 600
plt.rcParams["savefig.dpi"] = 600

# Hide every axis up front so cells of skipped labels stay blank instead of
# showing an empty frame with ticks.
for ax in axs.flat:
    ax.axis("off")

for i, label in enumerate(labels):
    # An empty frequency table cannot be turned into a word cloud.
    if not freq_nltk[label]:
        continue
    # Generate wordcloud for NLTK text
    wordcloud = WordCloud(
        width=600,
        height=400,
        random_state=1,
        background_color="black",
        colormap="Set2",
        collocations=False,
    ).generate_from_frequencies(freq_nltk[label])
    # Plot
    plot_cloud(axs, wordcloud, label, i)

# Save through the figure object so the grid is what gets written.
fig.savefig("nltk.svg")
plt.show()

In [None]:
# 4x4 grid with one cell per label. No separate plt.figure() call is made:
# plt.subplots already creates the figure that is drawn on and saved.
fig, axs = plt.subplots(4, 4, figsize=(10, 8))
# savefig.dpi is read when the figure is saved; figure.dpi only applies to
# figures created after this point.
plt.rcParams["figure.dpi"] = 300
plt.rcParams["savefig.dpi"] = 300

# Hide every axis up front so cells of skipped labels stay blank instead of
# showing an empty frame with ticks.
for ax in axs.flat:
    ax.axis("off")

for i, label in enumerate(labels):
    # Check the BERT table itself: it is the one plotted here, and it can be
    # empty even when the NLTK table for the same label is not.
    if not freq_bert[label]:
        continue
    # Generate wordcloud for BERT text
    wordcloud = WordCloud(
        width=600,
        height=400,
        random_state=1,
        background_color="black",
        colormap="Set2",
        collocations=False,
    ).generate_from_frequencies(freq_bert[label])
    # Plot
    plot_cloud(axs, wordcloud, label, i)

# Save through the figure object so the grid is what gets written.
fig.savefig("bert.svg")
plt.show()

In [None]:
# First 30 (token, count) pairs for the first label, in dict order (not sorted
# by count).
list(freq_nltk[labels[0]].items())[0:30]

In [None]:
# First 30 (token, count) pairs for labels[7], which is
# "incorrectClaimsAboutStudyReport" with the current label list.
list(freq_nltk[labels[7]].items())[0:30]