In [31]:
import tensorflow as tf
import pandas as pd

#df_4forums = pd.read_json('4ForumsResults.json') # This file is too large to go through git
#df_conme = pd.read_excel('convinceMe.xlsx')
df_credb = pd.read_excel('createDebate.xlsx')


In [32]:
# install transformers if not already installed
%pip install transformers

#import the BERT model and tokenizer
from transformers import BertModel, BertTokenizer

if 'bertModel' not in locals() or 'tokenizer' not in locals():
    #BERT base model
    bertModel = BertModel.from_pretrained('bert-base-uncased')
    
    #BERT Tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Note: you may need to restart the kernel to use updated packages.


In [33]:
df_credb.stance.value_counts()

unknown                       3051
prefers strict gun control    3051
opposes strict gun control    3051
undecided                     3051
other                         3051
Name: stance, dtype: int64

I'm gonna work with the "Create Debate" Dataset for now to save time

In [34]:
df_credb.stance.value_counts()

unknown                       3051
prefers strict gun control    3051
opposes strict gun control    3051
undecided                     3051
other                         3051
Name: stance, dtype: int64

In [35]:
df_credb = df_credb.query("stance == 'prefers strict gun control' | stance == 'opposes strict gun control'")

In [36]:
df_credb = df_credb[["text", "stance"]]

In [37]:
df_credb

Unnamed: 0,text,stance
642,Guns should be banned because they are not nee...,prefers strict gun control
643,"Yes, guns should be banned. Guns provide a tri...",prefers strict gun control
644,"If you look at history, the first thing empire...",prefers strict gun control
645,Yes that might be true but are you suggesting ...,prefers strict gun control
646,Gun control is misguided. When guns become ill...,prefers strict gun control
...,...,...
15240,"Yes, the fire arms should be banned. But if th...",opposes strict gun control
15241,Herein lies one of my biggest contentions rega...,opposes strict gun control
15242,Making firearms illegal only stops law abiding...,opposes strict gun control
15243,"Yeah, I thought that was hilarious.",opposes strict gun control


In [38]:
#Encoding the stance, which is our target class
import numpy as np

if type(df_credb.stance.iloc[1]) is str:
    class_mapping = {label: idx for idx, label in 
                    enumerate(np.unique(df_credb['stance']))}

class_mapping

{'opposes strict gun control': 0, 'prefers strict gun control': 1}

In [39]:
#Prevents accidentally overwriting the 
if type(df_credb.stance.iloc[1]) is str:
    df_credb['stance'] = df_credb['stance'].map(class_mapping)

df_credb.stance.value_counts()

1    3051
0    3051
Name: stance, dtype: int64

In [40]:
oppose = df_credb.query("stance == 0")
support = df_credb.query("stance == 1")

oppose = oppose[:75]
support = support[:75]

df_reduced = pd.concat([support,oppose])
print(len(df_reduced))

150


In [41]:
from transformers import BertModel, BertTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Load the pre-trained BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a simple classifier on top of BERT
class BertClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.bert = model
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Define a custom dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label

# Define some example data
texts = df_reduced.text.values.tolist()
labels = df_reduced.stance.values.tolist()

# Tokenize the texts
tokens = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Create the input_ids tensor and the attention_mask tensor
input_ids = tokens['input_ids']
attention_mask = torch.ones_like(input_ids)  # Create an attention_mask tensor with all elements set to 1

# Create a dataloader
dataset = TextDataset(input_ids, torch.tensor(labels))
dataloader = DataLoader(dataset, batch_size=150)

# Define the classifier and the optimizer
classifier = BertClassifier(num_classes=2)
optimizer = optim.Adam(classifier.parameters(), lr=0.0001)

# Train the classifier
for epoch in range(10):
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, labels = batch
        logits = classifier(input_ids=input_ids, attention_mask=attention_mask)  # Pass the attention_mask tensor to the classifier
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch} completed")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 0 completed
Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Epoch 4 completed
Epoch 5 completed
Epoch 6 completed
Epoch 7 completed
Epoch 8 completed
Epoch 9 completed


In [42]:
# Define a function to predict the category of a sentence
def predict_category(sentence):
    # Tokenize the sentence
    tokens = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']

    # Pass the input_ids and attention_mask tensors through the classifier
    with torch.no_grad():
        logits = classifier(input_ids, attention_mask)
        probabilities = nn.functional.softmax(logits, dim=1)
        predicted_category = torch.argmax(probabilities, dim=1).item()

    return predicted_category

In [43]:
predicted = list()
for i in range(df_credb.shape[0]):
    text = str(df_credb.iloc[i][0])
    predicted.append(predict_category(text))

In [55]:
print(df_credb.iloc[0][0])

predict_category(df_credb.iloc[0][0])

Guns should be banned because they are not needed in any domestic issue. The second ammendment was put in place because of fear that the british might invade america again or take control of the government. if this were the case the people would need weapons to defend themselves and regain america. The british aren't going to invade so we don't need to protect our selves. even in the this day and age america remains increadible safe compared to many other nations. we have no close enemies. if a major army were to attack us a few men with pistols or shotguns wouldn't do much against a soldier with an ak47 or tanks or bombers. Guns in America just make it easier for crimes to be committed. Some guns should never be considered allowed and this includes all semi automatic weapons as well as shotguns. Poverty, drugs, and lack of education are the reasons people turn to guns to kill. guns give you power to take life and should not be allowed to float around so that our students or citizens c

1

In [44]:
ground_truth = df_credb.stance.tolist()
true = 0
for i in range(len(predicted)):
    if predicted[i] == ground_truth[i]:
        true += 1

print(true/6102)

0.5037692559816453


In [45]:
torch.save(classifier, 'bert.sav')