In [4]:
import pandas as pd
import os

dataset_dir = "./datasets/Sogou_data"
filename = "train.csv"
filepath = os.path.join(dataset_dir, filename)
# Load the CSV file
data = pd.read_csv(filepath, encoding='utf-8')  # Replace 'your_dataset.csv' with the path to your CSV file

# Display the column names
print(data.columns)

Index(['1\t女 足 头 号 射 手 为 冲 动 埋 单 　 韩 端 被 组 委 会 停 赛 两 场\t来 源 ： 金 羊 网 － 新 快 报 　 新 快 报 讯 　 （ 记 者 　 吴 禄 庭 ） 　 因 肘 击 对 手 吃 到 红 牌 的 韩 端 为 自 己 的 冲 动 “ 埋 单 ” ， 中 国 队 在 ５ 月 ３ ０ 日 晚 接 到 组 委 会 正 式 通 知 ： 中 国 女 足 头 号 射 手 被 停 赛 两 场 ， 同 时 罚 款 ３ ０ ０ ０ 美 元 。 这 意 味 着 韩 端 无 缘 今 晚 的 中 朝 大 战 。 　 韩 端 是 在 同 越 南 队 首 场 比 赛 中 吃 到 红 牌 的 ， 当 时 因 为 对 越 南 队 员 有 一 个 挥 击 的 动 作 ， 韩 端 被 主 裁 直 接 出 示 红 牌 罚 下 。 　 为 此 ， 中 国 女 足 曾 向 纪 律 委 员 会 递 交 了 一 份 书 面 报 告 ， 在 承 认 韩 端 错 误 的 同 时 ， 还 请 亚 足 联 考 虑 到 韩 端 在 １ ０ ０ 多 场 国 际 比 赛 中 从 没 吃 到 红 牌 的 因 素 ， 提 出 对 韩 端 只 停 赛 １ 场 的 要 求 。 但 纪 律 委 员 会 最 终 决 定 对 韩 端 实 施 追 加 处 罚 — — — 停 赛 两 场 ， 同 时 罚 款 ３ ０ ０ ０ 美 元 。 　 对 此 追 加 判 罚 ， 韩 端 显 得 很 郁 闷 。 不 过 根 据 规 则 ， 如 果 球 员 被 认 定 比 赛 中 有 伤 人 行 为 ， 那 么 至 少 会 被 处 以 ２ 至 ４ 场 的 停 赛 ， 同 时 罚 款 ３ ０ ０ ０ 至 ５ ０ ０ ０ 美 元 。 从 这 个 角 度 看 ， 韩 端 所 受 的 处 罚 已 是 亚 足 联 量 刑 最 轻 的 了 。 　 '], dtype='object')


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Assuming you have your Chinese data in 'texts' and 'labels' lists (texts: list of strings in Chinese, labels: list of integers)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load BERT tokenizer for Chinese text
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')  # Use a BERT model pre-trained on Chinese text
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

class TopicDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TopicDataset(train_encodings, train_labels)
val_dataset = TopicDataset(val_encodings, val_labels)

# Load pre-trained BERT model for sequence classification on Chinese text
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=num_classes)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # output directory
    num_train_epochs=3,      # number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,       # number of warmup steps for learning rate scheduler
    weight_decay=0.01,      # strength of weight decay
    logging_dir='./logs',   # directory for storing logs
)

# Define Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()

print(eval_results)


In [None]:
import re
import os
import torch
from torch import nn
from tqdm.auto import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Regex to extract category, subcategory, and question
regex = r"([\w]+):([\w]+) (.*)"

def read_questions(filepath):
    questions = []
    categories = set()
    with open(filepath, 'r') as f:
        for line in f:
            match = re.search(regex, line)
            if match:
                category = match.group(1)
                subcategory = match.group(2)
                question = match.group(3)
                questions.append((category, subcategory, question))
                categories.add(category)
    return questions, list(categories)

def preprocess(questions, tokenizer):
    input_ids = []
    attention_masks = []
    for cat, subcat, q in questions:
        encoded = tokenizer(q, truncation=True, padding='max_length')
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return input_ids, attention_masks

# Paths
dataset_dir = "./datasets"
filename = "TREC_test.txt"
filepath = os.path.join(dataset_dir, filename)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Parse and tokenize
questions, categories = read_questions(filepath)
input_ids, attention_masks = preprocess(questions, tokenizer)

# Assign labels dynamically to categories
label_map = {cat: label for label, cat in enumerate(categories)}

# Prepare labels based on the assigned labels for categories
labels = [label_map[cat] for cat, _, _ in questions]

In [2]:
############################
# GPU and CPU Check Code
# KEEP AT THE TOP
############################

# !pip install psutil
# !pip install gputil

import psutil
import torch
import os
import spacy

from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

import psutil

# Get the number of CPUs
num_cpus = psutil.cpu_count(logical=False)  # physical cores
num_logical_cpus = psutil.cpu_count(logical=True)  # logical cores

print(f"Number of physical CPUs: {num_cpus}")
print(f"Number of logical CPUs: {num_logical_cpus}")

try:
    import GPUtil

    # Get the number of available GPUs
    gpus = GPUtil.getGPUs()
    num_gpus = len(gpus)

    print(f"Number of GPUs: {num_gpus}")

    for i, gpu in enumerate(gpus):
        print(f"GPU {i + 1}: {gpu.name}")
        print(f"\tMemory Total: {gpu.memoryTotal} MB")
        print(f"\tMemory Used: {gpu.memoryUsed} MB")
        print(f"\tMemory Free: {gpu.memoryFree} MB")
        print(f"\tGPU Utilization: {gpu.load * 100}%")
        print(f"\tGPU Temperature: {gpu.temperature} °C")
except ImportError:
    print("GPUtil library not found. Cannot check GPU information.")

Number of physical CPUs: 128
Number of logical CPUs: 128
Number of GPUs: 2
GPU 1: NVIDIA A100 80GB PCIe
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 31.0 °C
GPU 2: NVIDIA A100 80GB PCIe
	Memory Total: 81920.0 MB
	Memory Used: 7.0 MB
	Memory Free: 81042.0 MB
	GPU Utilization: 0.0%
	GPU Temperature: 28.0 °C


In [5]:
# # USE ONLY TO EXTRACT FILES FROM TAR FILES

import tarfile

def extract_all_files(tar_file_path, extract_to):
    with tarfile.open(tar_file_path, 'r') as tar:
        tar.extractall(extract_to)

# Example usage
tar_file_path = 'datasets/ag_news_csv.tar.gz'
extract_to = 'datasets/'
extract_all_files(tar_file_path, extract_to)

In [14]:
# import torch
# from transformers import BertTokenizer, BertForSequenceClassification
# from torch.utils.data import DataLoader, Dataset
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split

# # Load classes
# with open('datasets/ag_news_csv/classes.txt', 'r') as file:
#     classes = file.read().splitlines()

# # Load train and test datasets
# train_df = pd.read_csv('datasets/ag_news_csv/train.csv', header=None, names=['label', 'text'])
# test_df = pd.read_csv('datasets/ag_news_csv/test.csv', header=None, names=['label', 'text'])

# # Preprocess data
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# class CustomDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_len=128):
#         self.tokenizer = tokenizer
#         self.data = dataframe
#         self.text = self.data['text']
#         self.targets = self.data['label']
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.text)

#     def __getitem__(self, index):
#         text = str(self.text[index])
#         target = self.targets[index]

#         inputs = self.tokenizer.encode_plus(
#             text,
#             None,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding='max_length',
#             return_token_type_ids=False,
#             return_attention_mask=True,
#             return_tensors='pt',
#             truncation=True
#         )

#         return {
#             'input_ids': inputs['input_ids'].flatten(),
#             'attention_mask': inputs['attention_mask'].flatten(),
#             'targets': torch.tensor(target, dtype=torch.long)
#         }

# # # Encode labels
# # label_encoder = LabelEncoder()
# # train_df['label'] = label_encoder.fit_transform(train_df['label'])
# # test_df['label'] = label_encoder.transform(test_df['label'])

# # # Encode labels
# # label_encoder = LabelEncoder()
# # all_labels = pd.concat([train_df['label'], test_df['label']], axis=0)
# # label_encoder.fit(all_labels)

# # train_df['label'] = label_encoder.transform(train_df['label'])
# # test_df['label'] = label_encoder.transform(test_df['label'])
# # Encode labels
# label_encoder = LabelEncoder()
# all_labels = pd.concat([train_df['label'], test_df['label']], axis=0)

# try:
#     label_encoder.fit(all_labels)
# except KeyError as e:
#     unique_train_labels = set(train_df['label'])
#     unique_test_labels = set(test_df['label'])
    
#     problematic_label = unique_test_labels - unique_train_labels
#     print(f"Problematic label causing KeyError: {problematic_label}")

#     # Optionally, remove problematic labels before label encoding
#     all_labels = all_labels[~all_labels.isin(problematic_label)]
#     label_encoder.fit(all_labels)

# train_df['label'] = label_encoder.transform(train_df['label'])
# test_df['label'] = label_encoder.transform(test_df['label'])


# # Prepare DataLoader
# train_dataset = CustomDataset(train_df, tokenizer)
# test_dataset = CustomDataset(test_df, tokenizer)

# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# import torch
# from transformers import BertTokenizer, BertForSequenceClassification
# from torch.utils.data import DataLoader, Dataset
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder

# # Load classes
# with open('datasets/ag_news_csv/classes.txt', 'r') as file:
#     classes = file.read().splitlines()

# # Load train and test datasets
# train_df = pd.read_csv('datasets/ag_news_csv/train.csv', header=None, names=['label', 'text'])
# test_df = pd.read_csv('datasets/ag_news_csv/test.csv', header=None, names=['label', 'text'])

# # Preprocess data
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Use LabelEncoder for labels
# label_encoder = LabelEncoder()
# train_df['label'] = label_encoder.fit_transform(train_df['label'])
# test_df['label'] = label_encoder.transform(test_df['label'])

# class CustomDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_len=128):
#         self.tokenizer = tokenizer
#         self.data = dataframe
#         self.text = self.data['text'].tolist()
#         self.targets = self.data['label'].tolist()
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.text)

#     def __getitem__(self, index):
#         text = str(self.text[index])
#         target = int(self.targets[index])  # Ensure target is integer

#         inputs = self.tokenizer.encode_plus(
#             text,
#             None,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding='max_length',
#             return_token_type_ids=False,
#             return_attention_mask=True,
#             return_tensors='pt',
#             truncation=True
#         )

#         return {
#             'input_ids': inputs['input_ids'].flatten(),
#             'attention_mask': inputs['attention_mask'].flatten(),
#             'targets': torch.tensor(target, dtype=torch.long)
#         }

# # Prepare DataLoader
# train_dataset = CustomDataset(train_df, tokenizer)
# test_dataset = CustomDataset(test_df, tokenizer)

# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load classes
with open('datasets/ag_news_csv/classes.txt', 'r') as file:
    classes = file.read().splitlines()

# Load train and test datasets
train_df = pd.read_csv('datasets/ag_news_csv/train.csv', header=None, names=['label', 'text'])
test_df = pd.read_csv('datasets/ag_news_csv/test.csv', header=None, names=['label', 'text'])

# Preprocess data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use LabelEncoder for labels
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

# Get unique labels in the training set
unique_train_labels = set(train_df['label'])

# Filter out test set labels that are not in the training set
test_df_filtered = test_df[test_df['label'].isin(unique_train_labels)]

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data['text'].tolist()
        self.targets = self.data['label'].tolist()
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        target = int(self.targets[index])  # Ensure target is integer

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

# Prepare DataLoader
train_dataset = CustomDataset(train_df, tokenizer)
test_dataset = CustomDataset(test_df_filtered, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


# Model configuration
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(classes))

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(3):  # Change the number of epochs as needed
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
        loss = criterion(outputs.logits, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{3} - Loss: {running_loss/len(train_loader)}")

# Evaluate on test data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask


ValueError: y contains previously unseen labels: 'Fears for T N pension after talks'

In [21]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import spacy

# Load train and test datasets
train_df = pd.read_csv('datasets/ag_news_csv/train.csv', header=None, names=['label', 'text'])
test_df = pd.read_csv('datasets/ag_news_csv/test.csv', header=None, names=['label', 'text'])

# Initialize spaCy with the English model for sentence segmentation
nlp = spacy.load('en_core_web_sm')

# Tokenizer with WordPiece embeddings and 30,000 token vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Process train and test datasets
def preprocess_dataset(df):
    processed_texts = []
    for text in df['text']:
        # Perform sentence segmentation using spaCy
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]

        # Tokenize sentences and append to processed_texts
        tokenized_sentences = [tokenizer.tokenize(sent) for sent in sentences]
        processed_texts.append(tokenized_sentences)
    return processed_texts


print("here0")
# Preprocess train and test datasets
train_processed = preprocess_dataset(train_df)
test_processed = preprocess_dataset(test_df)

print("here00")
# Define a CustomDataset for BERT classification
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        # Flatten the list of tokenized sentences
        flat_text = [item for sublist in text for item in sublist]

        inputs = self.tokenizer.encode_plus(
            flat_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long)
        }

# Prepare train and test datasets
train_dataset = CustomDataset(train_processed, train_df['label'].tolist(), tokenizer)
test_dataset = CustomDataset(test_processed, test_df['label'].tolist(), tokenizer)

print("here1")
# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Model configuration
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)  # Change num_labels if needed

print("here2")
# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print("here3")
for epoch in range(3):  # Change the number of epochs as needed
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
        loss = criterion(outputs.logits, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{3} - Loss: {running_loss/len(train_loader)}")


print("here4")
# Evaluate on test data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    accuracy = correct / total
    print(f"Accuracy on test set: {accuracy * 100:.2f}%")


here0
here00
here1


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

here2
here3


TypeError: new(): invalid data type 'str'

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Assuming you have your data in 'texts' and 'labels' lists (texts: list of strings, labels: list of integers)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load BERT tokenizer and encode texts
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

class TopicDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TopicDataset(train_encodings, train_labels)
val_dataset = TopicDataset(val_encodings, val_labels)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # output directory
    num_train_epochs=3,      # number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,       # number of warmup steps for learning rate scheduler
    weight_decay=0.01,      # strength of weight decay
    logging_dir='./logs',   # directory for storing logs
)

# Define Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()

print(eval_results)


In [22]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import spacy

# Load train and test datasets
train_df = pd.read_csv('datasets/ag_news_csv/train.csv', header=None, names=['label', 'text'])
test_df = pd.read_csv('datasets/ag_news_csv/test.csv', header=None, names=['label', 'text'])

# Initialize spaCy with the English model for sentence segmentation
nlp = spacy.load('en_core_web_sm')

# Tokenizer with WordPiece embeddings and 30,000 token vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Process train and test datasets
def preprocess_dataset(df):
    processed_texts = []
    for text in df['text']:
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]

        tokenized_sentences = [tokenizer.tokenize(sent) for sent in sentences]
        processed_texts.append(tokenized_sentences)
    return processed_texts

# Preprocess train and test datasets
train_processed = preprocess_dataset(train_df)
test_processed = preprocess_dataset(test_df)

# Define a CustomDataset for BERT classification
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        flat_text = [item for sublist in text for item in sublist]

        inputs = self.tokenizer.encode_plus(
            flat_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long)
        }

# Prepare train and test datasets
train_dataset = CustomDataset(train_processed, train_df['label'].tolist(), tokenizer)
test_dataset = CustomDataset(test_processed, test_df['label'].tolist(), tokenizer)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Model configuration
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Move model to appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
num_epochs = 5  # Increase the number of epochs as needed
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
        loss = criterion(outputs.logits, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {running_loss/len(train_loader)}")

# Evaluate on test data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    accuracy = correct / total
    print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

TypeError: new(): invalid data type 'str'