In [1]:
import datasets
import numpy as np
import transformers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the news CSV and report its dimensions and column names.
csv_file_path = 'datasets/fakenewsnet.csv'
df = pd.read_csv(csv_file_path)
print(df.shape)
print("COLUMNS", list(df.columns))

# Fill missing values: every listed column (including `label`) receives the
# literal string 'None' wherever a value is null.
null_imputation_dict = dict.fromkeys(['id', 'title', 'text', 'label'], 'None')
df = df.fillna(value=null_imputation_dict)

(20800, 4)
COLUMNS ['id', 'title', 'text', 'label']


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import scipy as sp
# Single checkpoint identifier shared by the tokenizer and the classifier so
# the two can never drift apart.
MODEL_CHECKPOINT = "hamzab/roberta-fake-news-classification"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

# Human-readable class names; handed to SHAP as the output names.
labels = ['FAKE', 'TRUE']

# Explicit python function handed to SHAP: takes a list of strings and outputs
# a logit-scaled score for each class.
def f(x, max_length=128):
    """Score a batch of texts with the sequence-classification model.

    Args:
        x: Iterable of strings (SHAP passes masked variants of the text being
            explained).
        max_length: Number of tokens every text is padded / truncated to.
            Defaults to 128, the value this notebook has always used.

    Returns:
        A numpy array with one row per input text and one column per class,
        containing ``logit(softmax(model_logits))``.
    """
    # Let the tokenizer build the attention mask. Deriving it from `ids != 0`
    # is only valid for vocabularies whose padding id is 0; RoBERTa-style
    # tokenizers pad with a different id and use 0 for the `<s>` start token,
    # so that shortcut attends to padding and masks out the start token.
    encoded = tokenizer(
        [str(v) for v in x],
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )

    # Follow the model to whichever device it currently lives on, so this keeps
    # working after another cell has moved the model to the GPU.
    device = model.device

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(
            encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )[0]

    # torch.softmax subtracts the row maximum internally, avoiding the overflow
    # a raw exp(logits) can hit on large logits.
    scores = torch.softmax(logits, dim=-1).cpu().numpy()
    return sp.special.logit(scores)



In [6]:
# Sample news article used as the input to explain (and reused further down
# the notebook).
text = """
At least 60 people have died and more than 100 have been injured after flash flooding in northern Afghanistan, according to Taliban officials.

Dozens of people remain missing after heavy rainfall hit five districts in Baghlan province, with warnings the death toll could rise ahead of a further two storms forecast to spread across the region on Friday night.

Pictures on social media showed torrents of water sweeping through houses in several villages, leaving a trail of destruction in its wake.

The country has been hit by unusually heavy rainfall over the last few weeks, with floods killing more than 100 people since mid-April.

Abdul Mateen Qani, a spokesman for Afghanistan's interior ministry, told the BBC those who had died came from the Borka district in Baghlan province.

More than 200 people have been trapped inside their homes there.

The official earlier told Reuters news agency that helicopters had been sent to Baghlan - located directly north of the capital, Kabul - but "the operation may not be successful" due to a shortage of night vision lights.

Meanwhile, local official Hedayatullah Hamdard told AFP news agency emergency personnel including the army were "searching for any possible victims under the mud and rubble".

Tents, blankets and food were provided to some families who had lost their homes, the official added.

The main road connecting Kabul to northern Afghanistan is closed.

It comes after flooding last month in the west of the country killed dozens of people, leaving thousands requiring humanitarian aid.

About 2,000 homes, three mosques, and four schools were also damaged.

Flash flooding happens when rain falls so heavily that normal drainage cannot cope.

Experts say a relatively dry winter has made it more difficult for the soil to absorb rainfall.

Torrential rain and flooding kill people every year in Afghanistan, where badly built houses in isolated rural areas are particularly vulnerable.

Afghanistan is among the globe's most at risk nations from the effects of climate change, according to experts.

The nation is one of the poorest in the world, having been ravaged by decades of war which culminated in the withdrawal of a US-led coalition and the Taliban retaking control in 2021.

Many factors contribute to flooding, but a warming atmosphere caused by climate change makes extreme rainfall more likely.

The world has already warmed by about 1.1C since the industrial era began and temperatures will keep rising unless governments around the world make steep cuts to emissions.
"""

# Build the SHAP explainer around the scoring function `f`; the tokenizer acts
# as the text masker and `labels` names the output columns.
explainer = shap.Explainer(model=f, masker=tokenizer, output_names=labels)
print("TYPE OF EXPLAINER", type(explainer))

# Explain the single sample article and render the token-level attributions.
shap_values = explainer([text])

shap.plots.text(shap_values)


Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors


TYPE OF EXPLAINER <class 'shap.explainers._partition.PartitionExplainer'>


PartitionExplainer explainer: 2it [01:35, 95.23s/it]               


In [8]:
import torch
def predict_fake(title,text):
    """Return the model's class probabilities for one news item.

    The title and body are joined into the single string
    ``"<title>" + title + "<content>" + text + "<end>"`` and tokenized to a
    fixed length of 512 tokens (padded or truncated).

    Args:
        title: Headline of the article (may be an empty string).
        text: Body of the article.

    Returns:
        A dict with keys ``"Fake"`` and ``"Real"`` mapping to the softmax
        probabilities (Python floats) of the first and second model outputs.
    """
    input_str = "<title>" + title + "<content>" + text + "<end>"
    # Calling the tokenizer directly replaces the legacy `encode_plus` entry
    # point; the result holds both `input_ids` and `attention_mask`.
    encoded = tokenizer(input_str, max_length=512, padding="max_length", truncation=True, return_tensors="pt")

    # NOTE: this moves the notebook-wide `model` to the GPU when one is
    # available; other cells that feed it tensors must use the same device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    with torch.no_grad():
        output = model(encoded["input_ids"].to(device), attention_mask=encoded["attention_mask"].to(device))

    # Explicit `dim=-1`: the implicit-dimension form of Softmax is deprecated
    # and emits a warning on every call.
    probabilities = torch.softmax(output.logits, dim=-1)[0]
    return dict(zip(["Fake", "Real"], probabilities.tolist()))
    
# Classify the sample article with an empty title and show the probabilities.
print(predict_fake(title="", text=text))

{'Fake': 8.692077244631946e-05, 'Real': 0.9999130964279175}
