In [9]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import CategoricalNB, MultinomialNB

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# English stop words from NLTK, held as a set.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand (nltk.download("stopwords")) -- confirm for fresh
# environments.
stop_words = set(stopwords.words("english"))

In [10]:
# Load the annotated records and pivot them into column form:
# `dataset` maps each field name to the list of that field's values, in
# record order, and `df` is the DataFrame built from that mapping.
dataset = defaultdict(list)
path = Path("data/data.json")

# JSON interchange is UTF-8; state the encoding explicitly so the load does
# not depend on the platform's default locale encoding.
with open(path, encoding="utf-8") as f:
    d = json.load(f)

# The file handle is no longer needed once the document is parsed, so the
# pivot runs outside the `with` block.
for e in d["data"]:
    for k, v in e.items():
        dataset[k].append(v)

df = pd.DataFrame.from_dict(dataset)

# Show three random rows as a quick sanity check of the loaded frame.
df.sample(3)

Unnamed: 0,text,agency,humanComparison,hyperbole,historyComparison,unjustClaims,deepSounding,sceptics,deEmphasize,performanceNumber,inscrutable
523,The AI that can tell if a sheep is in PAIN: Re...,True,False,False,False,False,False,False,True,False,False
405,Can YOU spot the difference? Damaged artworks ...,False,False,False,False,False,False,False,True,False,False
629,"SoftBank-backed Tokopedia bets on logistics, A...",False,False,False,False,False,False,False,False,False,False


In [11]:
# Every column except the raw "text" column is treated as a label.
labels = [column for column in df.columns if column != "text"]
labels

['agency',
 'humanComparison',
 'hyperbole',
 'historyComparison',
 'unjustClaims',
 'deepSounding',
 'sceptics',
 'deEmphasize',
 'performanceNumber',
 'inscrutable']

In [12]:
# Tokenise each text into a list of word tokens and store the result in
# `dataset["text"]`.
# The source is `df["text"]`, which keeps the raw strings, rather than
# `dataset["text"]` itself: reading from the key being overwritten made the
# cell fail on a second run, because word_tokenize would then receive lists
# instead of strings.
# NOTE(review): the train/test split below is taken from `df`, so the models
# in this notebook are fit on the raw strings, not on these tokens.
dataset["text"] = [word_tokenize(text) for text in df["text"]]

dataset["text"][0:2]

[['A',
  'new',
  'vision',
  'of',
  'artificial',
  'intelligence',
  'for',
  'the',
  'people'],
 ['The', 'gig', 'workers', 'fighting', 'back', 'against', 'the', 'algorithms']]

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split identical across runs.
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Model inputs are the text column of each split.
X_train, X_test = train["text"], test["text"]
print(X_train.shape, X_test.shape, sep="\n")

(571,)
(143,)


In [14]:
# Pipeline: TF-IDF text features feeding a Multinomial Naive Bayes classifier.
# The same pipeline object is refit once per label, so each label is modelled
# and evaluated independently.
NB_pipeline = Pipeline(
    [
        # scikit-learn documents `stop_words` as 'english', a list, or None;
        # convert the collection to a sorted (deterministic) list instead of
        # passing a set.
        ("tfidf", TfidfVectorizer(stop_words=sorted(stop_words))),
        ("clf", OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
    ]
)
for label in labels:
    print("... Processing {}".format(label))
    # Fit on the training texts against this label's column.
    NB_pipeline.fit(X_train, train[label])
    # Predict on the held-out texts and report the metrics.
    prediction = NB_pipeline.predict(X_test)
    # zero_division=0 is passed to every ratio metric (not only precision) so
    # an undefined score is reported as 0 without a warning.
    # NOTE(review): positives look rare for several labels, so accuracy alone
    # can look high while precision/recall/F1 are 0 -- read them together.
    print(f"""Test accuracy is {accuracy_score(test[label], prediction)}
Test precision is {precision_score(test[label], prediction, zero_division=0)}
Test recall is {recall_score(test[label], prediction, zero_division=0)}
Test f1 is {f1_score(test[label], prediction, zero_division=0)}
""")

... Processing agency
Test accuracy is 0.7762237762237763
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing humanComparison
Test accuracy is 0.8671328671328671
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing hyperbole
Test accuracy is 0.9090909090909091
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing historyComparison
Test accuracy is 0.993006993006993
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing unjustClaims
Test accuracy is 0.9370629370629371
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing deepSounding
Test accuracy is 0.958041958041958
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing sceptics
Test accuracy is 0.993006993006993
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing deEmphasize
Test accuracy is 0.8671328671328671
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing performanceNumber
Test acc

In [15]:
# Pipeline: TF-IDF text features feeding a linear support vector classifier.
# The same pipeline object is refit once per label, so each label is modelled
# and evaluated independently.
SVC_pipeline = Pipeline(
    [
        # scikit-learn documents `stop_words` as 'english', a list, or None;
        # convert the collection to a sorted (deterministic) list instead of
        # passing a set.
        ("tfidf", TfidfVectorizer(stop_words=sorted(stop_words))),
        # Fixed seed (same value as the train/test split) so any data
        # shuffling done by the solver is repeatable between runs.
        ("clf", OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=1)),
    ]
)
for label in labels:
    print("... Processing {}".format(label))
    # Fit on the training texts against this label's column.
    SVC_pipeline.fit(X_train, train[label])
    # Predict on the held-out texts and report the metrics.
    prediction = SVC_pipeline.predict(X_test)
    # zero_division=0 is passed to every ratio metric (not only precision) so
    # an undefined score is reported as 0 without a warning.
    print(f"""Test accuracy is {accuracy_score(test[label], prediction)}
Test precision is {precision_score(test[label], prediction, zero_division=0)}
Test recall is {recall_score(test[label], prediction, zero_division=0)}
Test f1 is {f1_score(test[label], prediction, zero_division=0)}
""")

... Processing agency
Test accuracy is 0.7412587412587412
Test precision is 0.35294117647058826
Test recall is 0.1875
Test f1 is 0.24489795918367344

... Processing humanComparison
Test accuracy is 0.8671328671328671
Test precision is 0.5
Test recall is 0.10526315789473684
Test f1 is 0.17391304347826086

... Processing hyperbole
Test accuracy is 0.9020979020979021
Test precision is 0.3333333333333333
Test recall is 0.07692307692307693
Test f1 is 0.125

... Processing historyComparison
Test accuracy is 0.993006993006993
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing unjustClaims
Test accuracy is 0.9440559440559441
Test precision is 1.0
Test recall is 0.1111111111111111
Test f1 is 0.19999999999999998

... Processing deepSounding
Test accuracy is 0.958041958041958
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing sceptics
Test accuracy is 0.993006993006993
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing deEmphasize
Tes

In [16]:
# Pipeline: TF-IDF text features feeding a logistic regression classifier.
# The same pipeline object is refit once per label, so each label is modelled
# and evaluated independently.
LogReg_pipeline = Pipeline(
    [
        # scikit-learn documents `stop_words` as 'english', a list, or None;
        # convert the collection to a sorted (deterministic) list instead of
        # passing a set.
        ("tfidf", TfidfVectorizer(stop_words=sorted(stop_words))),
        # The "sag" solver shuffles the data, so without a seed the fitted
        # model can differ between runs; reuse the seed of the train/test split.
        (
            "clf",
            OneVsRestClassifier(
                LogisticRegression(solver="sag", random_state=42), n_jobs=1
            ),
        ),
    ]
)
for label in labels:
    print("... Processing {}".format(label))
    # Fit on the training texts against this label's column.
    LogReg_pipeline.fit(X_train, train[label])
    # Predict on the held-out texts and report the metrics.
    prediction = LogReg_pipeline.predict(X_test)
    # zero_division=0 is passed to every ratio metric (not only precision) so
    # an undefined score is reported as 0 without a warning.
    print(f"""Test accuracy is {accuracy_score(test[label], prediction)}
Test precision is {precision_score(test[label], prediction, zero_division=0)}
Test recall is {recall_score(test[label], prediction, zero_division=0)}
Test f1 is {f1_score(test[label], prediction, zero_division=0)}
""")

... Processing agency
Test accuracy is 0.7832167832167832
Test precision is 1.0
Test recall is 0.03125
Test f1 is 0.06060606060606061

... Processing humanComparison
Test accuracy is 0.8671328671328671
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing hyperbole
Test accuracy is 0.9090909090909091
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing historyComparison
Test accuracy is 0.993006993006993
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing unjustClaims
Test accuracy is 0.9370629370629371
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing deepSounding
Test accuracy is 0.958041958041958
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing sceptics
Test accuracy is 0.993006993006993
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing deEmphasize
Test accuracy is 0.8671328671328671
Test precision is 0.0
Test recall is 0.0
Test f1 is 0.0

... Processing perfor

