In [1]:
!pip install sklearn -Uqq

In [2]:
import json
import re
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import CategoricalNB, MultinomialNB
from pathlib import Path

# English stop words from NLTK, de-duplicated into a set.
# NOTE(review): needs the NLTK "stopwords" corpus to be available locally;
# no download step is visible in this notebook.
stop_words = {word for word in stopwords.words("english")}
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [3]:
# Load the annotated examples: the JSON file holds a top-level "data" list
# whose entries are dicts (one per example).
dataset = defaultdict(list)
path = Path("data3/data.json")

# JSON is UTF-8 by specification; being explicit keeps the read independent of
# the platform's default locale encoding.
with open(path, encoding="utf-8") as f:
    records = json.load(f)["data"]

# Pivot the records into column lists: {column name: [value per record]}.
for record in records:
    for column, value in record.items():
        dataset[column].append(value)

df = pd.DataFrame.from_dict(dataset)

# Fixed seed so the preview shows the same rows on every run.
df.sample(3, random_state=42)

Unnamed: 0,text,agency,humanComparison,hyperbole,historyComparison,unjustClaims,deepSounding,sceptics,deEmphasize,performanceNumber,inscrutable,objective
82,Agents unjustly claim history deep deemphasize...,True,False,False,True,False,True,False,True,False,False,True
6,Agents unjustly claim history deep deemphasize...,True,False,False,True,False,True,False,True,False,False,True
83,Agents unjustly claim history deep deemphasize...,True,False,False,True,False,True,False,True,False,False,True


In [4]:
# Every column except the raw text is treated as a target label.
labels = [column for column in df.columns if column != "text"]
labels

['agency',
 'humanComparison',
 'hyperbole',
 'historyComparison',
 'unjustClaims',
 'deepSounding',
 'sceptics',
 'deEmphasize',
 'performanceNumber',
 'inscrutable',
 'objective']

In [5]:
# Tokenize every document. The input is read from `df["text"]` (the untouched
# raw strings) instead of from `dataset["text"]` itself, so re-running this cell
# never tries to tokenize lists that were already tokenized.
# NOTE(review): `df` was built before this cell, so `df.text` still holds the
# raw strings; the train/test split below uses `df`, not these token lists.
dataset["text"] = [word_tokenize(text) for text in df["text"]]

dataset["text"][0:2]

[['asdasd'],
 ['Agents',
  'unjustly',
  'claim',
  'history',
  'deep',
  'deemphasize',
  'objectivel']]

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split repeatable.
# NOTE(review): the sampled preview above shows identical rows. If the data
# contains duplicates they can land on both sides of this split and inflate the
# test accuracy (every label scores 1.0 in the recorded output); please confirm.
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

X_train, X_test = train.text, test.text
print(X_train.shape)
print(X_test.shape)

(80,)
(20,)


In [7]:
# Define a pipeline combining a text feature extractor with a multi-label classifier.
# Recent scikit-learn releases (1.2+) validate `stop_words` and only accept
# "english", a list, or None, so the set is converted to a sorted list here.
NB_pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer(stop_words=sorted(stop_words))),
        ("clf", OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
    ]
)
# Each label is handled as its own binary problem. The same pipeline object is
# re-fit on every iteration, so after the loop it holds only the last label's model.
for label in labels:
    print("... Processing {}".format(label))
    # fit TF-IDF + classifier on the raw training text against this label's column
    NB_pipeline.fit(X_train, train[label])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    print("Test accuracy is {}".format(accuracy_score(test[label], prediction)))

... Processing agency
Test accuracy is 1.0
... Processing humanComparison
Test accuracy is 1.0
... Processing hyperbole
Test accuracy is 1.0
... Processing historyComparison
Test accuracy is 1.0
... Processing unjustClaims
Test accuracy is 1.0
... Processing deepSounding
Test accuracy is 1.0
... Processing sceptics
Test accuracy is 1.0
... Processing deEmphasize
Test accuracy is 1.0
... Processing performanceNumber
Test accuracy is 1.0
... Processing inscrutable
Test accuracy is 1.0
... Processing objective
Test accuracy is 1.0




In [8]:
# TF-IDF features fed into a linear SVM, wrapped in one-vs-rest.
# Recent scikit-learn releases (1.2+) validate `stop_words` and only accept
# "english", a list, or None, so the set is converted to a sorted list here.
# `random_state` is fixed (same seed as the train/test split) so repeated runs
# give the same model.
SVC_pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer(stop_words=sorted(stop_words))),
        ("clf", OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=1)),
    ]
)
# Each label is handled as its own binary problem. The same pipeline object is
# re-fit on every iteration, so after the loop it holds only the last label's model.
for label in labels:
    print('... Processing {}'.format(label))
    # fit TF-IDF + classifier on the raw training text against this label's column
    SVC_pipeline.fit(X_train, train[label])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[label], prediction)))


... Processing agency
Test accuracy is 1.0
... Processing humanComparison
Test accuracy is 1.0
... Processing hyperbole
Test accuracy is 1.0
... Processing historyComparison
Test accuracy is 1.0
... Processing unjustClaims
Test accuracy is 1.0
... Processing deepSounding
Test accuracy is 1.0
... Processing sceptics
Test accuracy is 1.0
... Processing deEmphasize
Test accuracy is 1.0
... Processing performanceNumber
Test accuracy is 1.0
... Processing inscrutable
Test accuracy is 1.0
... Processing objective
Test accuracy is 1.0




In [9]:
# TF-IDF features fed into logistic regression, wrapped in one-vs-rest.
# Recent scikit-learn releases (1.2+) validate `stop_words` and only accept
# "english", a list, or None, so the set is converted to a sorted list here.
# The "sag" solver shuffles the data randomly, so `random_state` is fixed (same
# seed as the train/test split) to make the fitted model reproducible.
LogReg_pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer(stop_words=sorted(stop_words))),
        (
            "clf",
            OneVsRestClassifier(
                LogisticRegression(solver="sag", random_state=42), n_jobs=1
            ),
        ),
    ]
)
# Each label is handled as its own binary problem. The same pipeline object is
# re-fit on every iteration, so after the loop it holds only the last label's model.
for label in labels:
    print('... Processing {}'.format(label))
    # fit TF-IDF + classifier on the raw training text against this label's column
    LogReg_pipeline.fit(X_train, train[label])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[label], prediction)))

... Processing agency
Test accuracy is 1.0
... Processing humanComparison
Test accuracy is 1.0
... Processing hyperbole
Test accuracy is 1.0
... Processing historyComparison
Test accuracy is 1.0
... Processing unjustClaims
Test accuracy is 1.0
... Processing deepSounding
Test accuracy is 1.0
... Processing sceptics
Test accuracy is 1.0
... Processing deEmphasize
Test accuracy is 1.0
... Processing performanceNumber
Test accuracy is 1.0
... Processing inscrutable
Test accuracy is 1.0
... Processing objective
Test accuracy is 1.0


