In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lime.lime_text import LimeTextExplainer
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
df = pd.read_csv("datasets/fakenewsnet.csv")

null_imputation_dict = { 
    'id': 'None',
    'title': 'None',
    'text': 'None',
    'label': 'None'
    }
df = df.fillna(value=null_imputation_dict)

## split to train and val
train_df, val_df = train_test_split(df, test_size=0.1)
val_df.reset_index(drop=True)

## vectorize to tf-idf vectors
tfidf_vc = TfidfVectorizer(min_df = 10, max_features = 100000, analyzer = "word", ngram_range = (1, 2), stop_words = 'english', lowercase = True)
train_vc = tfidf_vc.fit_transform(train_df["text"])
val_vc = tfidf_vc.transform(val_df["text"])    

In [9]:
# Load the pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("hamzab/roberta-fake-news-classification", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained("hamzab/roberta-fake-news-classification")


In [1]:
headline = "At least 60 killed in Afghanistan flash flooding"


text = """
At least 60 people have died and more than 100 have been injured after flash flooding in northern Afghanistan, according to Taliban officials.

Dozens of people remain missing after heavy rainfall hit five districts in Baghlan province, with warnings the death toll could rise ahead of a further two storms forecast to spread across the region on Friday night.

Pictures on social media showed torrents of water sweeping through houses in several villages, leaving a trail of destruction in its wake.

The country has been hit by unusually heavy rainfall over the last few weeks, with floods killing more than 100 people since mid-April.

Abdul Mateen Qani, a spokesman for Afghanistan's interior ministry, told the BBC those who had died came from the Borka district in Baghlan province.

More than 200 people have been trapped inside their homes there.

The official earlier told Reuters news agency that helicopters had been sent to Baghlan - located directly north of the capital, Kabul - but "the operation may not be successful" due to a shortage of night vision lights.

Meanwhile, local official Hedayatullah Hamdard told AFP news agency emergency personnel including the army were "searching for any possible victims under the mud and rubble".

Tents, blankets and food were provided to some families who had lost their homes, the official added.

The main road connecting Kabul to northern Afghanistan is closed.

It comes after flooding last month in the west of the country killed dozens of people, leaving thousands requiring humanitarian aid.

About 2,000 homes, three mosques, and four schools were also damaged.

Flash flooding happens when rain falls so heavily that normal drainage cannot cope.

Experts say a relatively dry winter has made it more difficult for the soil to absorb rainfall.

Torrential rain and flooding kill people every year in Afghanistan, where badly built houses in isolated rural areas are particularly vulnerable.

Afghanistan is among the globe's most at risk nations from the effects of climate change, according to experts.

The nation is one of the poorest in the world, having been ravaged by decades of war which culminated in the withdrawal of a US-led coalition and the Taliban retaking control in 2021.

Many factors contribute to flooding, but a warming atmosphere caused by climate change makes extreme rainfall more likely.

The world has already warmed by about 1.1C since the industrial era began and temperatures will keep rising unless governments around the world make steep cuts to emissions.
"""

In [31]:
import torch

def predict_fake(title,text):
    input_str = "<title>" + title + "<content>" +  text + "<end>"
    input_ids = tokenizer.encode_plus(input_str, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    device =  'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    with torch.no_grad():
        output = model(input_ids["input_ids"].to(device), attention_mask=input_ids["attention_mask"].to(device))
    return dict(zip(["Fake","Real"], [x.item() for x in list(torch.nn.Softmax()(output.logits)[0])] ))
    

def predict_fake_textonly(text):
    input_str = "<title>" + "<content>" +  text + "<end>"
    input_ids = tokenizer.encode_plus(input_str, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    device =  'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    with torch.no_grad():
        output = model(input_ids["input_ids"].to(device), attention_mask=input_ids["attention_mask"].to(device))
    return dict(zip(["Fake","Real"], [x.item() for x in list(torch.nn.Softmax()(output.logits)[0])] ))
    


print(predict_fake_textonly(text))


{'Fake': 8.692077244631946e-05, 'Real': 0.9999130964279175}


  return self._call_impl(*args, **kwargs)


In [36]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(data):
    actual_labels = data['label']
    predicted_labels = []
    correct_predictions = 0
    
    for index, row in data.iterrows():
        print(f"{index}/{len(data)} % correct: {correct_predictions/len(data)}")
        title = row['title']
        text = row['text']
        actual_label = row['label']
        
        # Predict using your function
        predicted = predict_fake(title, text)
        
        # Assign the predicted label based on probabilities
        if predicted['Real'] > predicted['Fake']:
            print("Real")
            predicted_labels.append(0)
        else:
            predicted_labels.append(1)
            print("Fake")
              

        if (predicted['Real'] > predicted['Fake'] and actual_label == 0) or \
           (predicted['Fake'] > predicted['Real'] and actual_label == 1):
            correct_predictions += 1
    
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels)
    recall = recall_score(actual_labels, predicted_labels)
    f1 = f1_score(actual_labels, predicted_labels)
    
    return accuracy, precision, recall, f1

# Call the function to calculate evaluation metrics
accuracy, precision, recall, f1 = calculate_metrics(df[:50])
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


0/50 % correct: 0.0


  return self._call_impl(*args, **kwargs)


Fake
1/50 % correct: 0.02
Fake
2/50 % correct: 0.02
Fake
3/50 % correct: 0.04
Fake
4/50 % correct: 0.06
Real
5/50 % correct: 0.06
Fake
6/50 % correct: 0.06
Fake
7/50 % correct: 0.08
Fake
8/50 % correct: 0.08
Fake
9/50 % correct: 0.08
Fake
10/50 % correct: 0.08
Fake
11/50 % correct: 0.08
Fake
12/50 % correct: 0.08
Fake
13/50 % correct: 0.1
Fake
14/50 % correct: 0.12
Fake
15/50 % correct: 0.14
Fake
16/50 % correct: 0.14
Fake
17/50 % correct: 0.14
Fake
18/50 % correct: 0.16
Fake
19/50 % correct: 0.18
Fake
20/50 % correct: 0.18
Fake
21/50 % correct: 0.2
Fake
22/50 % correct: 0.2
Fake
23/50 % correct: 0.2
Fake
24/50 % correct: 0.22
Real
25/50 % correct: 0.24
Fake
26/50 % correct: 0.26
Fake
27/50 % correct: 0.26
Fake
28/50 % correct: 0.28
Fake
29/50 % correct: 0.28
Fake
30/50 % correct: 0.28
Fake
31/50 % correct: 0.28
Fake
32/50 % correct: 0.3
Fake
33/50 % correct: 0.3
Fake
34/50 % correct: 0.3
Fake
35/50 % correct: 0.3
Fake
36/50 % correct: 0.3
Fake
37/50 % correct: 0.32
Fake
38/50 % correc

In [34]:
# Prediction function for LIME
def predict_proba(texts):
    # Tokenize the input text batch
    encoded_batch = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    # Make prediction
    with torch.no_grad():
        outputs = model(**encoded_batch)
    # Apply softmax to get probabilities from the output logits
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).numpy()
    return probs


In [35]:
# LIME explainer setup
class_names = ["Real", "Fake"]
explainer = LimeTextExplainer(class_names=class_names)

# Example text
input_text = """
Israel's Eurovision contestant says "nothing will deter" her after getting booed during a dress rehearsal on Wednesday.

Eden Golan was performing her song Hurricane ahead of Thursday's semi-final in Malmo, Sweden.

In a statement, she said she was "proud to represent my country".

There have been several campaigns to block Israel from taking part in this year's contest following the outbreak of war in the Middle East.

But the European Broadcasting Union (EBU), which hosts the event, has ruled Israel is allowed to compete.

After Wednesday's dress rehearsal, several videos surfaced on social media from audience members who had recorded Golan being booed by some members of the crowd.

Israel is currently engaged in a military campaign in Gaza, which they launched as a response to Hamas' cross-border attack on southern Israel on 7 October.

About 1,200 people were killed and 252 others were taken hostage.

More than 34,840 people have been killed in Gaza since then, according to the territory's Hamas-run health ministry.
"""
# Generate explanation
exp = explainer.explain_instance(input_text, predict_proba, num_features=10)

# Display results
probs = predict_proba([input_text])
print("Text: \n", input_text)
print("Probability (Fake) =", probs[0, 1])
print("Probability (Real) =", probs[0, 0])

# Visualize the explanation (only in Jupyter Notebook)
exp.show_in_notebook(text=True)


KeyboardInterrupt: 