In [2]:
import datasets
import numpy as np
import transformers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

import shap

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
csv_file_path = 'datasets/fakenewsnet.csv'
df = pd.read_csv(csv_file_path)
print(df.shape)
print("COLUMNS", df.columns.tolist())

# Imputing null values
null_imputation_dict = { 
    'id': 'None',
    'title': 'None',
    'text': 'None',
    'label': 'None'
    }
df = df.fillna(value=null_imputation_dict)

(20800, 4)
COLUMNS ['id', 'title', 'text', 'label']


In [4]:
# Testing tokenizer
from transformers import AutoTokenizer

# Load the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Example text
text = "Hello, this is an example text to demonstrate how to use a pretrained tokenizer."

# Use the tokenizer
encoded_input = tokenizer(text)
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])
print(tokens)

['[CLS]', 'hello', ',', 'this', 'is', 'an', 'example', 'text', 'to', 'demonstrate', 'how', 'to', 'use', 'a', 'pre', '##train', '##ed', 'token', '##izer', '.', '[SEP]']


In [5]:
# MODEL = "jy46604790/Fake-News-Bert-Detect"
# classifier = transformers.pipeline("sentiment-analysis")
short_data = [v[:500] for v in df["text"][:20]]
MODEL = "jy46604790/Fake-News-Bert-Detect"
classifier = transformers.pipeline("text-classification", model=MODEL, tokenizer=MODEL)
text = df['text'][0][:2000]

print(len(text))
print("SHORT DATA", type(short_data), short_data)

classifier(short_data)

2000
SHORT DATA <class 'list'> ['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) \nWith apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It t', 'Ever get the feeling your life circles the roundabout rather than heads in a straight line toward the intended destination? [Hillary Clinton remains the big woman on campus in leafy, liberal Wellesley, Massachusetts. Everywhere else votes her most likely to don her inauguration dress for the remainder of her days the way Miss Havisham forever wore that wedding dress.  Speaking of Great Expectations, Hillary Rodham overflowed with them 48 years ago when she f

[{'label': 'LABEL_0', 'score': 0.9991253018379211},
 {'label': 'LABEL_0', 'score': 0.9993244409561157},
 {'label': 'LABEL_0', 'score': 0.9992601275444031},
 {'label': 'LABEL_0', 'score': 0.9995051622390747},
 {'label': 'LABEL_0', 'score': 0.997266411781311},
 {'label': 'LABEL_0', 'score': 0.9982476234436035},
 {'label': 'LABEL_0', 'score': 0.9995917677879333},
 {'label': 'LABEL_1', 'score': 0.9999107122421265},
 {'label': 'LABEL_0', 'score': 0.9976542592048645},
 {'label': 'LABEL_1', 'score': 0.996364951133728},
 {'label': 'LABEL_0', 'score': 0.998518168926239},
 {'label': 'LABEL_0', 'score': 0.9990730285644531},
 {'label': 'LABEL_0', 'score': 0.999119222164154},
 {'label': 'LABEL_0', 'score': 0.9985765218734741},
 {'label': 'LABEL_0', 'score': 0.9994326233863831},
 {'label': 'LABEL_0', 'score': 0.9991536140441895},
 {'label': 'LABEL_1', 'score': 0.9998937845230103},
 {'label': 'LABEL_0', 'score': 0.999484658241272},
 {'label': 'LABEL_0', 'score': 0.997330904006958},
 {'label': 'LABEL_

In [8]:
print(shap_values.data)
shap.plots.text(shap_values[:, :, "LABEL_0"])
result = classifier(text)[0]
print("Classified as:", "FAKE" if result['label'] == 'LABEL_0' else "REAL", ', score:', result['score'])

(array(['', '\n', '\n', 'A ', 'baby ', 'rescued ', 'from ', 'her ',
       'dying ', 'mother', "'s ", 'womb ', 'after ', 'an ', 'Israeli ',
       'air ', 'strike ', 'in ', 'southern ', 'Gaza ', 'has ', 'died',
       ', ', 'the ', 'BBC ', 'has ', 'learned', '.', '\n', '\n', 'Baby ',
       'Sab', 'reen ', 'al', '-', 'S', 'ak', 'ani ', 'was ', 'delivered ',
       'by ', 'Ca', 'es', 'are', 'an ', 'section ', 'in ', 'a ', 'Raf',
       'ah ', 'hospital ', 'shortly ', 'after ', 'midnight ', 'on ',
       'Sunday', '.', '\n', '\n', 'Amid ', 'chaotic ', 'scenes ',
       'doctors ', 'resusc', 'itated ', 'the ', 'baby', ', ', 'using ',
       'a ', 'hand ', 'pump ', 'to ', 'push ', 'air ', 'into ', 'her ',
       'lungs', '.', '\n', '\n', 'However ', 'she ', 'died ', 'on ',
       'Thursday ', 'and ', 'has ', 'been ', 'buried ', 'next ', 'to ',
       'her ', 'mother ', 'after ', 'whom ', 'she ', 'was ', 'named', '.',
       '\n', '\n', 'Baby ', 'Sab', 'reen ', 'was ', 'among ', '16 ',
    

Classified as: FAKE , score: 0.9989961981773376


In [None]:
Shap for fake news