In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

In [4]:
# Assuming CUDA is available, else fallback to CPU
device = torch.device("gpu" if torch.cuda.is_available() else "cpu")

In [6]:
class CustomDataset(Dataset):
    """A custom dataset class for your irony detection task."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def load_dataset(file_path, tokenizer, max_length):
    """Function to load and tokenize the dataset."""
    df = pd.read_csv(file_path)
    texts = df['tweet'].tolist()  # Adjust column name based on your dataset
    labels = df['sarcastic'].tolist()  # Adjust column name based on your dataset

    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    return CustomDataset(encodings, labels)

def compute_metrics(pred):
    """Function to compute metrics of the model's performance."""
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [8]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load and tokenize the dataset
file_path = '/Users/yinterested/Downloads/数据库/nlp_data/new_irony_dataset.csv'
max_length = 256
iron_dataset = load_dataset(file_path, tokenizer, max_length)


In [9]:
# Define the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=iron_dataset,
    eval_dataset=iron_dataset,
    compute_metrics=compute_metrics,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [10]:
# Train the model
trainer.train()

# 假设模型已经训练完毕，我们将其保存到指定路径
model_save_path = '/Users/yinterested/文件(D)/研究生课程/CSCI_544-master/project/class-project-main/results/iron_model'

# 保存模型和tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

  0%|          | 0/366 [00:00<?, ?it/s]

{'loss': 0.7332, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.05}
{'loss': 0.6981, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.11}
{'loss': 0.6654, 'learning_rate': 3e-06, 'epoch': 0.16}
{'loss': 0.6384, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.22}
{'loss': 0.6327, 'learning_rate': 5e-06, 'epoch': 0.27}
{'loss': 0.5853, 'learning_rate': 6e-06, 'epoch': 0.33}
{'loss': 0.5776, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.38}
{'loss': 0.7044, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.44}
{'loss': 0.6397, 'learning_rate': 9e-06, 'epoch': 0.49}
{'loss': 0.5724, 'learning_rate': 1e-05, 'epoch': 0.55}
{'loss': 0.5877, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.6}
{'loss': 0.621, 'learning_rate': 1.2e-05, 'epoch': 0.66}
{'loss': 0.6221, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.71}


KeyboardInterrupt: 

In [3]:
from transformers import pipeline
# 加载训练好的反讽检测模型
model_path = model_save_path
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
irony_detector = pipeline('text-classification', model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# 对情感分类数据集的每条文本进行反讽预测
def predict_irony_labels(texts, tokenizer, model, device):
    irony_labels = []
    for text in texts:
        # Encode the text, ensuring it's truncated to the max length the model can handle
        inputs = tokenizer.encode_plus(
            text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length'
        )

        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        # Assuming using a binary classification model where the second token (index 1) represents "irony"
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        irony_label = torch.argmax(predictions, dim=1).cpu().numpy()[0]  # Get the predicted class (0 or 1)
        irony_labels.append(irony_label)

    return irony_labels

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './iron_model'.

In [None]:

# Load data
data = pd.read_csv('/content/drive/MyDrive/data for Colab/class-project-main/IMDB Dataset.csv')
data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0})
train_texts, test_texts, train_labels, test_labels = train_test_split(data['review'], data['label'], test_size=0.2)

train_texts = train_texts.reset_index(drop=True)
test_texts = test_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)

# 假设你的模型和tokenizer已经定义好了
train_irony_labels = predict_irony_labels(train_texts, tokenizer, model, device)
test_irony_labels = predict_irony_labels(test_texts, tokenizer, model, device)

class CustomDataset1(Dataset):
    def __init__(self, texts, labels, irony_labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.irony_labels = irony_labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        item = self.tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        item['irony_labels'] = torch.tensor(self.irony_labels[idx], dtype=torch.long)  # New
        return {key: value.squeeze(0) for key, value in item.items()}  # Ensure tensors are correctly shaped

    def __len__(self):
        return len(self.texts)

# Assuming the adjusted CustomDataset is used
train_dataset = CustomDataset1(train_texts, train_labels, train_irony_labels, tokenizer, 256)
test_dataset = CustomDataset1(test_texts, test_labels, test_irony_labels, tokenizer, 256)

model1 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    # Add the following line to report metrics every evaluation step
    report_to="all"
)

# Initialize the Trainer
trainer1 = Trainer(
    model=model1,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer1.train()

trainer1.evaluate(test_dataset)

combined data

In [11]:
import pandas as pd

In [12]:
chinese_data = pd.read_csv('/Users/yinterested/Downloads/数据库/nlp_data/combined_SemEvam_chinese.csv')

In [13]:
chinese_data

Unnamed: 0,tweet,sarcastic
0,Nice to have insomnia again!!!!,1
1,That's good! There's a big gap between the ric...,1
2,Morakot... How do you do... I can't believe ho...,1
3,"Very good, that what expensive to the burst of...",1
4,Great... After a morning of system maintenance...,1
...,...,...
1455,The Kaohsiung version also has a bunch of corp...,1
1456,"You're a fox, aren't you?",0
1457,It's true that all Chinese think the same way.,1
1458,"Taiwan is worthy of the outlaw pills, can go d...",1


In [18]:
IRONIC_DATASET_FP = "SemEval2018-T3-train-taskA.txt"
def parse_dataset(fp):
    y = []
    corpus = []
    with open(fp, 'rt', encoding='utf-8') as data_in:
        for line in data_in:
            if not line.lower().startswith("index"):  # 跳过首行
                line = line.rstrip()
                label, tweet = line.split("\t")[1], line.split("\t")[2]
                y.append(int(label))
                corpus.append(tweet)
    return corpus, y

irony_corpus, irony_y = parse_dataset(IRONIC_DATASET_FP)

orinal_data = pd.DataFrame({'tweet':irony_corpus, 
                            'sarcastic':irony_y
                            })




In [19]:
orinal_data

Unnamed: 0,tweet,sarcastic
0,Sweet United Nations video. Just in time for C...,1
1,@mrdahl87 We are rumored to have talked to Erv...,1
2,Hey there! Nice to see you Minnesota/ND Winter...,1
3,3 episodes left I'm dying over here,0
4,"""I can't breathe!"" was chosen as the most nota...",1
...,...,...
3829,@banditelli regarding what the PSU president does,0
3830,@banditelli But still bothers me that I see no...,0
3831,well now that i've listened to all of into the...,0
3832,Hummingbirds #Are #Experts #at #Hovering #Aft...,0


In [21]:
new_data = pd.concat([chinese_data, orinal_data])

In [23]:
new_data.to_csv('combined_new_irony.csv',index=None)