In [None]:
import random
import numpy as np
import pandas as pd
random.seed(42)
np.random.seed(42)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives
# under that mount point. NOTE: the google.colab package requires a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def build_text_dataset(
    raw_path='/content/drive/MyDrive/dataset/convincing_data.csv',
    out_path='/content/drive/MyDrive/dataset/text.csv',
):
    """Build the reprocessed ``text.csv`` dataset from the original CSV.

    The ``body_cleaned`` column of the raw file holds a stringified token
    list (e.g. ``"['good', 'movie']"``). The brackets and single quotes are
    stripped, the tokens are split on ``', '`` and re-joined with spaces into
    a new ``text`` column.

    This step is optional: it is only defined here, not executed. Call
    ``build_text_dataset()`` once when ``text.csv`` has to be (re)generated;
    otherwise the reprocessed file can be read directly.

    Args:
        raw_path: Path of the original CSV; must contain the columns
            ``id``, ``body_cleaned``, ``score``, ``body_length`` and
            ``sentiment``.
        out_path: Destination path of the reprocessed CSV.

    Returns:
        The DataFrame that was written, with the columns
        ``id, text, score, body_length, sentiment``.
    """
    raw = pd.read_csv(raw_path)
    token_lists = raw['body_cleaned'].apply(
        lambda cell: cell.strip('[]').replace("'", '').split(', ')
    )
    processed = raw.assign(text=token_lists.apply(' '.join))[
        ['id', 'text', 'score', 'body_length', 'sentiment']
    ]
    # The index is written as well (pandas default), matching the layout of
    # the text.csv file this notebook has been reading so far.
    processed.to_csv(out_path)
    return processed

'\n# read original dataset\ndata_path = \'/content/drive/MyDrive/dataset/convincing_data.csv\'\ndata = pd.read_csv(data_path)\ndata[\'body_cleaned\'] = data[\'body_cleaned\'].apply(lambda x: x.strip(\'[]\').replace("\'",\'\').split(\', \'))\ndata [\'text\'] = data[\'body_cleaned\'].apply(lambda x: \' \'.join(x))\ndata = data[[\'id\',\'text\',\'score\',\'body_length\',\'sentiment\']]\ndata.to_csv(\'/content/drive/MyDrive/dataset/text.csv\')\n'

In [None]:
# OR directly read reprocessed dataset
# (text.csv is the file produced by the preprocessing step above; the split
# further down selects its 'text' and 'sentiment' columns)
data_path = '/content/drive/MyDrive/dataset/text.csv'
data = pd.read_csv(data_path)

In [None]:
# Continuous sentiment score -> discrete class label
def score_label(score):
    """Bucket a sentiment score into one of three classes.

    Returns 1 when ``score`` is strictly greater than 1/3, -1 when it is
    strictly less than -1/3, and 0 for everything in between (both
    boundaries included).
    """
    threshold = 1 / 3
    if score > threshold:
        return 1
    if score < -threshold:
        return -1
    return 0
# Overwrite the continuous sentiment score with its -1 / 0 / 1 class label.
data['sentiment'] = data['sentiment'].apply(score_label)

# Train / validation / test split and library loading

In [None]:
# Re-seed the Python and NumPy global RNGs right before splitting.
random.seed(42)
np.random.seed(42)
# First hold out 10% of all rows as the test set ...
train_data_old, test_data = train_test_split(data[['text', 'sentiment']], test_size=0.1, random_state=42)
# ... then carve 10% of the remainder off as the validation set, which gives
# roughly 81% / 9% / 10% train / validation / test. random_state=42 fixes both splits.
train_data, valid_data = train_test_split(train_data_old, test_size=0.1, random_state=42)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0


In [None]:
# Load the tokenizer and the classifier through the Auto* factories so that
# the classes matching the checkpoint's architecture (ELECTRA) are used.
# Loading this checkpoint with the Bert* classes makes from_pretrained report
# that the 'electra.encoder.*' weights are not used, i.e. the encoder would
# start from random weights instead of the pretrained ones.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "google/electra-small-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=3: one output per sentiment class (-1 / 0 / 1). The
# classification head is newly initialised and learned during fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing BertForSequenceClassification: ['electra.encoder.layer.2.output.dense.weight', 'electra.encoder.layer.10.attention.output.LayerNorm.bias', 'electra.encoder.layer.0.attention.self.query.weight', 'electra.encoder.layer.9.output.LayerNorm.weight', 'electra.encoder.layer.7.attention.output.dense.bias', 'electra.encoder.layer.10.output.LayerNorm.bias', 'electra.encoder.layer.3.intermediate.dense.weight', 'electra.encoder.layer.5.attention.self.query.weight', 'electra.encoder.layer.7.attention.self.query.bias', 'electra.encoder.layer.7.attention.output.LayerNorm.weight', 'electra.encoder.layer.1.intermediate.dense.weight', 'electra.encoder.layer.3.attention.output.LayerNorm.bias', 'electra.encoder.layer.7.attention.output.dense.weight', 'electra.encoder.layer.6.output.dense.weight', 'electra.encoder.layer.9.attention.output.dense.weight', 'electra.encoder.layer.2.attention.output.LayerN

# Dataset wrapper (tokenization and label encoding)

In [None]:
class SentimentDataset(Dataset):
    """Dataset that tokenizes one row of a text/sentiment DataFrame per item.

    ``data`` must expose ``.iloc`` and the columns ``text`` and ``sentiment``.
    Every item is a dict with:

    * ``input_ids`` / ``attention_mask`` -- flattened tensors produced by the
      tokenizer, which is asked to pad/truncate to ``max_length``;
    * ``label`` -- a scalar ``torch.long`` tensor holding the class index.
    """

    # Sentiment values -1 / 0 / 1 become class indices 0 / 1 / 2; any other
    # value is passed through unchanged.
    _LABEL_TO_INDEX = {-1: 0, 0: 1, 1: 2}

    def __init__(self, data, tokenizer, max_length=256):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        raw_label = record['sentiment']
        class_index = self._LABEL_TO_INDEX.get(raw_label, raw_label)

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True,
            return_token_type_ids=False,
        )
        encoded = self.tokenizer.encode_plus(record['text'], **tokenizer_options)

        # The tokenizer returns a leading batch dimension; drop it by flattening.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(class_index, dtype=torch.long)
        return item

# Data loader

In [None]:
def create_data_loader(data, tokenizer, batch_size=128, max_length=256, shuffle=False):
    """Wrap a text/sentiment DataFrame in a ``SentimentDataset`` and a ``DataLoader``.

    Args:
        data: DataFrame with ``text`` and ``sentiment`` columns.
        tokenizer: Tokenizer handed to ``SentimentDataset``.
        batch_size: Number of samples per batch.
        max_length: Sequence length the tokenizer pads/truncates to.
        shuffle: Whether the loader draws the samples in a new random order
            on every pass. Defaults to ``False`` (fixed order).

    Returns:
        A ``DataLoader`` over the dataset, using 4 worker processes.
    """
    dataset = SentimentDataset(data, tokenizer, max_length=max_length)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=4
    )

# Only the training loader is reshuffled each epoch, so the model does not see
# the batches in the same order on every pass. The validation and test loaders
# keep a fixed order so their outputs stay aligned with the rows.
train_data_loader = create_data_loader(train_data, tokenizer, shuffle=True)
test_data_loader = create_data_loader(test_data, tokenizer)
valid_data_loader = create_data_loader(valid_data, tokenizer)

# Train model

In [None]:
from transformers import get_linear_schedule_with_warmup

# Train on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 20
learning_rate = 2e-5
# Warm-up lasts 10% of ONE epoch's optimizer steps (not 10% of the whole run);
# after that the learning rate decays linearly to 0 at total_steps.
warmup_steps = int(0.1 * len(train_data_loader))
total_steps = epochs * len(train_data_loader)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW and is no
# longer shipped by recent transformers releases. eps and weight_decay are set
# explicitly to the values the transformers implementation used by default, so
# the optimizer hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)



In [None]:
from tqdm import tqdm

def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``data_loader``.

    For every batch the gradients are reset, the model is called with the
    labels (so it returns a loss), and then the optimizer and the learning-rate
    scheduler each take one step.

    Args:
        model: Callable module returning an object with ``loss`` and ``logits``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``label`` tensors; must support ``len()`` and expose ``.dataset``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)``: the sum of the batch losses divided by the
        number of batches, and the number of correct predictions divided by
        ``len(data_loader.dataset)``, both as Python floats.
    """
    model.train()
    running_loss = 0
    n_correct = 0

    for batch in tqdm(data_loader):
        inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask')
        }
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)

        # Predicted class = position of the largest logit in each row.
        predicted = torch.max(outputs.logits, dim=1).indices
        n_correct += torch.sum(predicted == labels)
        running_loss += outputs.loss.item()

        outputs.loss.backward()
        optimizer.step()
        scheduler.step()

    accuracy = n_correct.double() / len(data_loader.dataset)
    mean_loss = running_loss / len(data_loader)
    return mean_loss, accuracy.item()

def evaluate_model(model, data_loader, device):
    """Compute mean loss and accuracy of ``model`` on ``data_loader``.

    The model is switched to eval mode and all forward passes run under
    ``torch.no_grad()``; no parameter is updated.

    Args:
        model: Callable module returning an object with ``loss`` and ``logits``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``label`` tensors; must support ``len()`` and expose ``.dataset``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)``: the sum of the batch losses divided by the
        number of batches, and the number of correct predictions divided by
        ``len(data_loader.dataset)``, both as Python floats.
    """
    model.eval()
    running_loss = 0
    n_correct = 0

    with torch.no_grad():
        for batch in tqdm(data_loader):
            inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask')
            }
            labels = batch['label'].to(device)

            outputs = model(**inputs, labels=labels)

            # Predicted class = position of the largest logit in each row.
            predicted = torch.max(outputs.logits, dim=1).indices
            n_correct += torch.sum(predicted == labels)
            running_loss += outputs.loss.item()

    accuracy = n_correct.double() / len(data_loader.dataset)
    mean_loss = running_loss / len(data_loader)
    return mean_loss, accuracy.item()

def train_and_evaluate(model, train_data_loader, valid_data_loader, optimizer, scheduler, device, epochs):
    """Train for ``epochs`` epochs, validating after each one.

    Every epoch runs ``train_epoch`` on the training loader, then
    ``evaluate_model`` on the validation loader, and prints the loss and
    accuracy of both. Nothing is returned.
    """
    for epoch_number in range(1, epochs + 1):
        print(f"Epoch {epoch_number}/{epochs}")

        # Training pass
        train_loss, train_acc = train_epoch(
            model, train_data_loader, optimizer, scheduler, device
        )
        print(f"Train loss: {train_loss}, Train accuracy: {train_acc}")

        # Validation pass
        valid_loss, valid_acc = evaluate_model(model, valid_data_loader, device)
        print(f"Valid loss: {valid_loss}, Valid accuracy: {valid_acc}")

# Start fine-tuning: prints the train and validation loss/accuracy after every epoch.
train_and_evaluate(model, train_data_loader, valid_data_loader, optimizer, scheduler, device, epochs)

Epoch 1/20


100%|██████████| 4489/4489 [17:14<00:00,  4.34it/s]


Train loss: 0.7354627292056369, Train accuracy: 0.6727620234611763


100%|██████████| 499/499 [00:49<00:00, 10.07it/s]


Valid loss: 0.637413700919352, Valid accuracy: 0.7382660734427873
Epoch 2/20


100%|██████████| 4489/4489 [17:11<00:00,  4.35it/s]


Train loss: 0.6400135722078547, Train accuracy: 0.7351592132725017


100%|██████████| 499/499 [00:49<00:00, 10.02it/s]


Valid loss: 0.6296684713903553, Valid accuracy: 0.7413992981576638
Epoch 3/20


100%|██████████| 4489/4489 [17:11<00:00,  4.35it/s]


Train loss: 0.6302898144185556, Train accuracy: 0.7393316750249356


100%|██████████| 499/499 [00:49<00:00, 10.02it/s]


Valid loss: 0.6278688310979602, Valid accuracy: 0.7416186238877052
Epoch 4/20


100%|██████████| 4489/4489 [17:11<00:00,  4.35it/s]


Train loss: 0.6243883544431016, Train accuracy: 0.7415127741387444


100%|██████████| 499/499 [00:49<00:00, 10.00it/s]


Valid loss: 0.6278968563299618, Valid accuracy: 0.7418692818648953
Epoch 5/20


100%|██████████| 4489/4489 [17:12<00:00,  4.35it/s]


Train loss: 0.6191898551038231, Train accuracy: 0.7438453142923787


100%|██████████| 499/499 [00:50<00:00,  9.96it/s]


Valid loss: 0.6263467441938205, Valid accuracy: 0.7421042737185111
Epoch 6/20


 42%|████▏     | 1875/4489 [07:11<10:01,  4.34it/s]


KeyboardInterrupt: ignored

# Collect test-set predictions

In [None]:
def get_test_labels_and_predictions(model, test_data_loader, device):
    """Run inference over ``test_data_loader`` and collect labels and predictions.

    The model is put in eval mode and called without labels under
    ``torch.no_grad()``. Class indices are mapped back to sentiment labels
    before returning: 0 -> -1, 1 -> 0, anything else -> 1.

    Args:
        model: Callable module returning an object with ``logits``.
        test_data_loader: Iterable of dicts with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(true_labels, predicted_labels)``: two lists of sentiment labels,
        in the order the loader yielded the samples.
    """
    index_to_sentiment = {0: -1, 1: 0}

    def _decode(class_indices):
        # Indices other than 0 and 1 decode to 1.
        return [index_to_sentiment.get(idx, 1) for idx in class_indices]

    model.eval()
    gold_indices, predicted_indices = [], []

    with torch.no_grad():
        for batch in tqdm(test_data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = position of the largest logit in each row.
            preds = torch.max(outputs.logits, dim=1).indices

            gold_indices.extend(labels.cpu().numpy().tolist())
            predicted_indices.extend(preds.cpu().numpy().tolist())

    return _decode(gold_indices), _decode(predicted_indices)

# Run inference on the held-out test loader; both lists use the -1 / 0 / 1 sentiment coding.
test_labels, predicted_test_labels = get_test_labels_and_predictions(model, test_data_loader, device)


100%|██████████| 555/555 [00:55<00:00, 10.02it/s]


In [None]:
# Per-class precision / recall / F1 on the test set; the labels are in the
# -1 / 0 / 1 coding returned by get_test_labels_and_predictions.
from sklearn.metrics import classification_report
print(classification_report(test_labels, predicted_test_labels))

              precision    recall  f1-score   support

          -1       0.76      0.83      0.79     29782
           0       0.61      0.30      0.40     11900
           1       0.75      0.84      0.79     29242

    accuracy                           0.74     70924
   macro avg       0.71      0.66      0.66     70924
weighted avg       0.73      0.74      0.73     70924



In [None]:
# Persist the test-set ground truth and the predictions to Drive as
# single-column CSV files, one value per line (no header, no index).
pd.Series(test_labels).to_csv('/content/drive/MyDrive/dataset/Bert_true.csv', index=False, header=False)
pd.Series(predicted_test_labels).to_csv('/content/drive/MyDrive/dataset/Bert_predict.csv', index=False, header=False)

In [None]:
# Sanity check: reload the two saved CSV files and recompute the report from
# disk. The files have no header row, so the values are read from column 0.
xtestsss1 = pd.read_csv('/content/drive/MyDrive/dataset/Bert_true.csv', header=None)[0].values
xtestsss2 = pd.read_csv('/content/drive/MyDrive/dataset/Bert_predict.csv', header=None)[0].values
print(classification_report(xtestsss1, xtestsss2))

              precision    recall  f1-score   support

          -1       0.76      0.83      0.79     29782
           0       0.61      0.30      0.40     11900
           1       0.75      0.84      0.79     29242

    accuracy                           0.74     70924
   macro avg       0.71      0.66      0.66     70924
weighted avg       0.73      0.74      0.73     70924

