# Claims prediction with the RoBERTa classifier used in the Coan et al. (2021) article 'Computer-assisted detection and classification of misinformation about climate change'

## Setup

In [None]:
# Load the required packages

# Dataframes
import pandas as pd

# Regular expressions
import re

# Unicode normalization (used to strip non-ASCII characters)
import unicodedata

# Timestamp / time measurement
import time

# Simpletransformers classifier
from simpletransformers.classification import ClassificationModel

# Softmax function for predicted probability calculation
from scipy.special import softmax

# PyTorch: enable GPU access
import torch

# If you want to select a specific GPU, set it here:
# gpu = 0
# torch.cuda.set_device(gpu) 

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if not torch.cuda.is_available():
    # No CUDA device: fall back to the CPU and say so.
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

else:
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    # Report how many GPUs are visible and which one is currently selected.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print(
        'We will use GPU {}:'.format(torch.cuda.current_device()),
        torch.cuda.get_device_name(torch.cuda.current_device()),
    )

In [None]:
# Define required functions

# Define text pre-processing functions
def remove_between_square_brackets(text):
    """Delete every square-bracketed span, brackets included, from ``text``.

    A span starts at a ``[`` and ends at the nearest following ``]``.
    An opening bracket with no closing bracket after it is left untouched.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence,
    # which triggers a warning on recent Python versions.
    return re.sub(r'\[[^]]*\]', '', text)
def remove_non_ascii(text):
    """Return ``text`` reduced to its ASCII characters.

    The string is NFKD-normalized first, so composed characters (such as
    accented letters or ligatures) are split into their components.
    Whatever still cannot be encoded as ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
def strip_underscores(text):
    """Replace each run of one or more underscores in ``text`` with a space."""
    underscore_run = re.compile(r'_+')
    return underscore_run.sub(' ', text)
def remove_multiple_spaces(text):
    """Collapse each run of two or more whitespace characters into one space.

    Any whitespace matched by ``\\s`` counts, so runs that mix spaces, tabs
    and newlines are collapsed too. A single whitespace character on its own
    (for example one tab or one newline) is left unchanged.
    """
    whitespace_run = re.compile(r'\s{2,}')
    return whitespace_run.sub(' ', text)

# Merge text pre-processing functions
def denoise_text(text):
    """Apply all text clean-up steps to ``text`` and trim the result.

    The steps run in a fixed order: remove square-bracketed spans, drop
    non-ASCII characters, turn underscores into spaces, then collapse
    repeated whitespace. Leading and trailing whitespace is stripped last.
    """
    cleaning_steps = (
        remove_between_square_brackets,
        remove_non_ascii,
        strip_underscores,
        remove_multiple_spaces,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned.strip()

## Data Preparation

In [None]:
# Read the extracted paragraphs from disk into a DataFrame
data = pd.read_csv('analysis/extracted_paras.csv')

# Report how many rows were read, then preview the top of the table
print('{} paragraphs were loaded. Here are the first few rows of the data:'.format(data.shape[0]))
data.head()

In [None]:
# Clean every paragraph: cast the 'text' column to str, run denoise_text on
# each value and store the result in a new 'text_denoised' column.
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan', which would then be classified like real text. Confirm the input
# has no empty 'text' cells.
data['text_denoised'] = data['text'].astype(str).map(denoise_text)

## Predict text labels (claims) with the pre-trained RoBERTa classifier used in the Coan et al. (2021) article

In [None]:
%%time

# Define the model
# NOTE(review): model_name is presumably a local directory holding the
# pre-trained classifier files. Confirm it exists relative to the notebook.
architecture = 'roberta'
model_name = 'CARDS_RoBERTa_Classifier'

# Load the classifier.
# simpletransformers defaults to use_cuda=True and raises a ValueError when
# CUDA is unavailable. Following the actual hardware keeps the CPU fallback
# announced in the setup cell working.
model = ClassificationModel(
    architecture,
    model_name,
    use_cuda=torch.cuda.is_available(),
)

# Predict the labels: one prediction and one raw model output per paragraph
predictions, raw_outputs = model.predict(data['text_denoised'].tolist())

In [None]:
# Attach the predicted label of each paragraph to the data
data['RoBERTa_pred_label'] = predictions

# Attach softmax-normalized scores, computed from the first entry of each
# item in raw_outputs.
# NOTE(review): presumably each item holds one row of scores per text window,
# so only the first window is used here. Confirm against the model's
# sliding-window settings.
data['RoBERTa_pred_probabilities'] = [softmax(output[0]) for output in raw_outputs]

data.head()