In [21]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import torch

In [33]:
df = pd.read_parquet('./data/train-data.parquet')
df_trump = pd.read_json('./data/tweets.json')
display(df.head())
display(df_trump.head())

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


Unnamed: 0,date,favorites,id,isRetweet,retweets,text
0,2024-11-04 10:20:38,3983,113425359755307360,False,1554,<p></p>
1,2024-11-04 10:20:27,2527,113425359034120560,False,713,"<p>Join me live in Raleigh, North Carolina at ..."
2,2024-11-04 09:56:53,4816,113425266314029296,False,1329,"<p><a href=""https://links.truthsocial.com/link..."
3,2024-11-04 09:36:27,6672,113425185981054000,False,2232,<p>MAKE AMERICA GREAT AGAIN!</p>
4,2024-11-04 09:17:13,5338,113425110394877856,False,2052,<p>“AMERICA IS BEING INVADED”</p>


In [34]:
# map all columns with true/false labels to 1 for true and 0 for false
def map_labels(df):
    for col in df.columns:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)
        elif np.issubdtype(df[col].dtype, np.floating):
            df[col] = (df[col] > 2).astype(int)
    return df

# drop all columns with annotator in the name
df = df.loc[:, ~df.columns.str.contains('annotator')]
df = df.drop(columns=["infitms", "outfitms", "std_err", "hypothesis", "platform", "hate_speech_score"])
df = map_labels(df)
# train test split
df_train, df_test = train_test_split(df, test_size=0.4, random_state=42)
df_test, df_dev = train_test_split(df_test, test_size=0.5, random_state=42)
# df_extra, df_train = train_test_split(df, test_size=0.01, random_state=42)
# df_train, df_test = train_test_split(df_train, test_size=0.4, random_state=42)
# df_test, df_dev = train_test_split(df_test, test_size=0.5, random_state=42)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)

dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "test": Dataset.from_pandas(df_test),
    "dev": Dataset.from_pandas(df_dev)
})

In [39]:
dataset

DatasetDict({
    train: Dataset({
        features: ['comment_id', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'text', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target_gender_non_binary', 'target_gender_transgender_men', 'target_gender_transgender_unspecified', 'target_gender_transgender_women', 'target_gender_women', 'target_g

In [98]:
labels = [label for label in dataset["train"].features.keys() if label not in ['comment_id', 'text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['sentiment',
 'respect',
 'insult',
 'humiliate',
 'status',
 'dehumanize',
 'violence',
 'genocide',
 'attack_defend',
 'hatespeech',
 'target_race_asian',
 'target_race_black',
 'target_race_latinx',
 'target_race_middle_eastern',
 'target_race_native_american',
 'target_race_pacific_islander',
 'target_race_white',
 'target_race_other',
 'target_race',
 'target_religion_atheist',
 'target_religion_buddhist',
 'target_religion_christian',
 'target_religion_hindu',
 'target_religion_jewish',
 'target_religion_mormon',
 'target_religion_muslim',
 'target_religion_other',
 'target_religion',
 'target_origin_immigrant',
 'target_origin_migrant_worker',
 'target_origin_specific_country',
 'target_origin_undocumented',
 'target_origin_other',
 'target_origin',
 'target_gender_men',
 'target_gender_non_binary',
 'target_gender_transgender_men',
 'target_gender_transgender_unspecified',
 'target_gender_transgender_women',
 'target_gender_women',
 'target_gender_other',
 'target_gender',
 't

In [40]:
X_train = df_train["text"].reset_index()
y_train = df_train.drop(columns=["text"])
display(X_train.head())

Unnamed: 0,index,text
0,0,Why did I click on your profile? You gave no s...
1,1,it really doesn't matter if you don't believe ...
2,2,"Please help support nonbinary pride, add a #Tw..."
3,3,#MissionUnstapabolBossingMaine @mainedcm | Mai...
4,4,I love that you enjoy being trans.


In [41]:
max_length = X_train["text"].str.len().max()
print(f"Maximum length of text: {max_length}")

Maximum length of text: 603


In [73]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [74]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)

Map: 100%|██████████| 81333/81333 [00:12<00:00, 6676.23 examples/s]
Map: 100%|██████████| 27111/27111 [00:03<00:00, 7104.26 examples/s]
Map: 100%|██████████| 27112/27112 [00:03<00:00, 7134.46 examples/s]


In [75]:
example = encoded_dataset["train"][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [76]:
tokenizer.decode(example['input_ids'])

"[CLS] why did i click on your profile? you gave no source anyway, you just are a hateful human being. good grief. you have been given sources by others disproving your statements, but no you didn ' t, once you cited the hatesub gc, that is used only to missrepresent transgender people and to deny their existence [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [77]:
example['labels']

[1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [78]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label != 0]

['sentiment',
 'target_sexuality_bisexual',
 'target_sexuality_gay',
 'target_sexuality_lesbian',
 'target_sexuality_other',
 'target_sexuality']

In [79]:
encoded_dataset.set_format("torch")

In [80]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [135]:
batch_size = 8
metric_name = "f1"

In [136]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)



In [138]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [139]:
encoded_dataset["train"][0]['labels'].type()

'torch.FloatTensor'

In [140]:
encoded_dataset["train"]['input_ids'][0]

tensor([  101,  1030,  6683,  2891, 21547,  1030,  7595,  5685, 21397,  7486,
         2323,  2022,  4452,  1997,  2151,  2173,  3352,  5152,  3484,  2084,
         2055,  2634,  1010,  2559,  2012,  1996,  4650,  1999,  2035,  5152,
         3484,  3032,  2085,  1012, 18883,  2015,  1998, 20474,  2015,  2071,
         2025, 21063,  1999,  1028, 19527,  2086,  1012,  2129,  2079,  2017,
         5987,  2068,  2000, 19242,  1996, 18221,  1999,  2634,  1029,  1001,
        20474,  6914, 10085,  5178,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [141]:
outputs = model(input_ids=encoded_dataset["train"]['input_ids'][0].unsqueeze(0), labels=encoded_dataset["train"][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.7002, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.4964,  0.1237,  0.7332,  0.4905,  0.4337, -0.2857,  0.2185, -0.3485,
         -0.4422, -0.5298,  0.5695,  0.0234,  0.3570,  0.1206, -0.6058, -0.3078,
         -0.4451, -0.0048, -0.6410, -0.0872, -0.2691,  0.2103,  0.2653, -0.6864,
         -0.3468, -0.0184,  0.2674, -0.7151, -0.4990,  0.1555,  0.2831, -0.0372,
          0.0360,  0.8311,  0.3228,  0.2725,  0.5254,  0.4875,  0.0399, -0.4417,
         -0.4516,  0.3269, -0.2376, -0.1142, -0.4359,  0.4867, -0.2368,  0.0204,
         -0.2572, -0.2442,  0.6405,  0.1258, -0.2876, -0.1532,  0.2366,  0.0225,
          0.2581,  0.0265,  0.1419, -0.1325, -0.3316, -0.1444, -0.3841]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [142]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [143]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.324192,0.535123,0.708906,0.0




KeyboardInterrupt: 

In [81]:
model_path = "bert-finetuned-hate-speech"

# Load tokenizer and model
trained_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [82]:
# Put model in eval mode
trained_model.eval()

# Create a DataLoader for the test dataset
test_loader = DataLoader(encoded_dataset['test'], batch_size=8)

results_df = pd.DataFrame(columns=['Text', 'Predicted_Values', 'Actual_Values'])

# Iterate through the test dataset and make predictions
for batch in test_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels'].int()

    decoded_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]

    with torch.no_grad():
        outputs = trained_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.sigmoid(logits) > 0.5
        # Convert predictions to integers (1 or 0)
        predictions = predictions.int()

        batch_results = []
        for text, pred, actual in zip(decoded_texts, predictions.numpy(), labels.numpy()):
            batch_results.append({'Text': text, 'Predicted_Values': list(pred), 'Actual_Values': list(actual)})

        # Convert the list of results to a DataFrame
        batch_df = pd.DataFrame(batch_results)
        print(f"Processed {len(results_df) + len(batch_df)} rows so far.")

        # Concatenate the batch DataFrame with the main results DataFrame
        results_df = pd.concat([results_df, batch_df], ignore_index=True)
    
display(results_df)

Processed 8 rows so far.
Processed 16 rows so far.
Processed 24 rows so far.
Processed 32 rows so far.
Processed 40 rows so far.
Processed 48 rows so far.
Processed 56 rows so far.
Processed 64 rows so far.
Processed 72 rows so far.
Processed 80 rows so far.
Processed 88 rows so far.
Processed 96 rows so far.
Processed 104 rows so far.


KeyboardInterrupt: 

In [24]:
from sklearn.metrics import precision_score, recall_score, f1_score
import openpyxl

# Convert the predicted and actual values to numpy arrays for easier computation
predicted_values_array = np.array(results_df['Predicted_Values'].tolist())
actual_values_array = np.array(results_df['Actual_Values'].tolist())

# Calculate accuracy, precision, recall, and F1 score for each label

label_precisions = np.round(precision_score(actual_values_array, predicted_values_array, average=None, zero_division=0), 3)
label_recalls = np.round(recall_score(actual_values_array, predicted_values_array, average=None, zero_division=0), 3)
label_f1_scores = np.round(f1_score(actual_values_array, predicted_values_array, average=None, zero_division=0), 3)
label_accuracies = np.round((predicted_values_array == actual_values_array).mean(axis=0), 3)

# Create a dictionary to map labels to their accuracies, precisions, recalls, and F1 scores
label_metrics_dict = {
    id2label[idx]: {
        'Accuracy': accuracy,
        'Precision': label_precisions[idx],
        'Recall': label_recalls[idx],
        'F1_Score': label_f1_scores[idx]
    }
    for idx, accuracy in enumerate(label_accuracies)
}

# Convert the dictionary to a DataFrame
accuracy_df = pd.DataFrame.from_dict(label_metrics_dict, orient='index').reset_index()
accuracy_df.rename(columns={'index': 'Label'}, inplace=True)
label_accuracy_dict = {id2label[idx]: accuracy for idx, accuracy in enumerate(label_accuracies)}
# Add a column for the number of positive occurrences in each label
accuracy_df['Positive_Occurrences'] = [actual_values_array[:, idx].sum() for idx in range(actual_values_array.shape[1])]

# Sort the DataFrame by accuracy in descending order
# Split the DataFrame into two: one with labels that don't have "target" in the name and the other with "target" labels
non_target_labels_df = accuracy_df[~accuracy_df['Label'].str.contains('target')]
target_labels_df = accuracy_df[accuracy_df['Label'].str.contains('target')]

# Sort both DataFrames by 'Positive_Occurrences' in descending order
non_target_labels_df = non_target_labels_df.sort_values(by='Positive_Occurrences', ascending=False)
target_labels_df = target_labels_df.sort_values(by='Positive_Occurrences', ascending=False)

# Display the sorted DataFrame
display(non_target_labels_df)

# Save the accuracy DataFrame to a CSV file
non_target_labels_df.to_excel('hate_speech_metrics.xlsx', index=False)
# Save the target labels DataFrame to a CSV file
target_labels_df.to_excel('target_metrics.xlsx', index=False)

Unnamed: 0,Label,Accuracy,Precision,Recall,F1_Score,Positive_Occurrences
0,sentiment,0.852,0.876,0.925,0.9,824
1,respect,0.859,0.881,0.912,0.896,771
2,insult,0.827,0.845,0.886,0.865,721
8,attack_defend,0.821,0.828,0.877,0.852,677
3,humiliate,0.782,0.768,0.834,0.8,601
4,status,0.778,0.762,0.831,0.795,596
5,dehumanize,0.772,0.709,0.717,0.713,456
6,violence,0.914,0.787,0.748,0.767,218
7,genocide,0.951,0.772,0.839,0.804,137
9,hatespeech,1.0,0.0,0.0,0.0,0


In [83]:
from bs4 import BeautifulSoup

def clean_html(raw_html):
    # Parse HTML
    soup = BeautifulSoup(raw_html, "html.parser")
    # Get plain text
    return soup.get_text(separator=" ", strip=True)

In [88]:
df_trump['isRetweet'] = df_trump['isRetweet'].astype(bool)
df_trump['text'] = df_trump['text'].apply(lambda x: clean_html(x))
# remove all rows with empty text
df_trump = df_trump[df_trump['text'].str.strip() != '']


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(raw_html, "html.parser")


In [89]:
# smaller_df['text'] = smaller_df['text'].astype(str)
display(df_trump.head(10))

Unnamed: 0,date,favorites,id,isRetweet,retweets,text
1,2024-11-04 10:20:27,2527,113425359034120560,False,713,"Join me live in Raleigh, North Carolina at 10:..."
2,2024-11-04 09:56:53,4816,113425266314029296,False,1329,https:// swampthevoteusa.com/
3,2024-11-04 09:36:27,6672,113425185981054000,False,2232,MAKE AMERICA GREAT AGAIN!
4,2024-11-04 09:17:13,5338,113425110394877856,False,2052,“AMERICA IS BEING INVADED”
8,2024-11-04 01:34:55,15593,113423292553721168,False,4014,"A great day in Pennsylvania, North Carolina, a..."
9,2024-11-04 00:59:10,12013,113423151971796160,False,2717,RT: https://truthsocial.com/users/realDonaldTr...
10,2024-11-03 22:13:34,14659,113422500803916096,False,4097,Mimi Ramirez-Rodriguez was kidnapped and murde...
11,2024-11-03 22:09:57,12277,113422486548204144,False,2733,"THANK YOU—MACON, GEORGIA! https:// swampthevot..."
12,2024-11-03 22:09:00,0,113422482812310384,True,0,RT @ TeamTrump PRESIDENT TRUMP: Under my leade...
13,2024-11-03 22:08:55,0,113422482474891328,True,0,RT @ TeamTrump Kamala talks about fixing the e...


In [90]:
dataset_trump = DatasetDict({
    "trump": Dataset.from_pandas(df_trump)
})
# dataset_trump.set_format("torch")
# dataset_trump

def process(examples):
    # take a batch of texts
    text = examples["text"]
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    return encoding

dataset_trump = dataset_trump.map(process, batched=True, remove_columns=dataset_trump["trump"].column_names)
dataset_trump.set_format("torch")

Map: 100%|██████████| 76238/76238 [00:09<00:00, 8264.41 examples/s]


In [93]:
# Put model in eval mode
trained_model.eval()

# Create a DataLoader for the test dataset
trump_loader = DataLoader(dataset_trump['trump'], batch_size=8)

trump_results = pd.DataFrame(columns=['Text', 'Predicted_Values'])

# Iterate through the test dataset and make predictions
for batch in trump_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']

    decoded_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]

    with torch.no_grad():
        outputs = trained_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.sigmoid(logits) > 0.5
        # Convert predictions to integers (1 or 0)
        predictions = predictions.int()

        batch_results = []
        for text, pred in zip(decoded_texts, predictions.numpy()):
            batch_results.append({'Text': text, 'Predicted_Values': list(pred)})

        # Convert the list of results to a DataFrame
        batch_df = pd.DataFrame(batch_results)
        print(f"Processed {len(trump_results) + len(batch_df)} rows so far.")

        # Concatenate the batch DataFrame with the main results DataFrame
        trump_results = pd.concat([trump_results, batch_df], ignore_index=True)
    
display(trump_results)

Processed 8 rows so far.
Processed 16 rows so far.
Processed 24 rows so far.
Processed 32 rows so far.
Processed 40 rows so far.
Processed 48 rows so far.
Processed 56 rows so far.
Processed 64 rows so far.
Processed 72 rows so far.
Processed 80 rows so far.
Processed 88 rows so far.
Processed 96 rows so far.
Processed 104 rows so far.
Processed 112 rows so far.
Processed 120 rows so far.
Processed 128 rows so far.
Processed 136 rows so far.
Processed 144 rows so far.
Processed 152 rows so far.
Processed 160 rows so far.
Processed 168 rows so far.
Processed 176 rows so far.
Processed 184 rows so far.
Processed 192 rows so far.
Processed 200 rows so far.
Processed 208 rows so far.
Processed 216 rows so far.
Processed 224 rows so far.
Processed 232 rows so far.
Processed 240 rows so far.
Processed 248 rows so far.
Processed 256 rows so far.
Processed 264 rows so far.
Processed 272 rows so far.
Processed 280 rows so far.
Processed 288 rows so far.
Processed 296 rows so far.
Processed 304 

Unnamed: 0,Text,Predicted_Values
0,"join me live in raleigh, north carolina at 10 ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,https : / / swampthevoteusa. com /,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,make america great again!,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,“ america is being invaded ”,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,"a great day in pennsylvania, north carolina, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
76233,""" "" my persona will never be that of a wallflo...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
76234,new blog post : celebrity apprentice finale an...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
76235,donald trump reads top ten financial tips on l...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
76236,donald trump will be appearing on the view tom...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [94]:
# save the trump results to a CSV file
trump_results.to_csv('trump_results.csv', index=False)

In [104]:
# Ensure labels is converted to a list
labels_list = list(labels)
print(labels_list)

# Create a DataFrame from the Predicted_Values column
predicted_values_df = pd.DataFrame(trump_results['Predicted_Values'].tolist(), columns=labels_list)

# Concatenate the original DataFrame with the new DataFrame
trump_results_expanded = pd.concat([trump_results.drop(columns=['Predicted_Values']), predicted_values_df], axis=1)

# Display the expanded DataFrame
display(trump_results_expanded)
# Save the expanded DataFrame to a CSV file
trump_results_expanded.to_csv('trump_results_expanded.csv', index=False)

['sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target_gender_non_binary', 'target_gender_transgender_men', 'target_gender_transgender_unspecified', 'target_gender_transgender_women', 'target_gender_women', 'target_gender_other', 'target_gender', 'target_sexuality_bisexual', 'target_sexuali

Unnamed: 0,Text,sentiment,respect,insult,humiliate,status,dehumanize,violence,genocide,attack_defend,...,target_age_other,target_age,target_disability_physical,target_disability_cognitive,target_disability_neurological,target_disability_visually_impaired,target_disability_hearing_impaired,target_disability_unspecific,target_disability_other,target_disability
0,"join me live in raleigh, north carolina at 10 ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,https : / / swampthevoteusa. com /,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,make america great again!,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,“ america is being invaded ”,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,"a great day in pennsylvania, north carolina, a...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76233,""" "" my persona will never be that of a wallflo...",1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76234,new blog post : celebrity apprentice finale an...,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
76235,donald trump reads top ten financial tips on l...,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76236,donald trump will be appearing on the view tom...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
