# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the dataset from a CSV file (path is relative to the working directory).
# The rest of the script reads its 'text' and 'label' columns.
df = pd.read_csv('news_dataset.csv')

# Preprocessing
import re
import string
import nltk
# Download the NLTK 'stopwords' corpus that is read just below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stored as a set: preprocess_text tests every word of every document for
# membership, which is O(1) on a set but a linear scan on a list.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw document before tokenization.

    The text is lower-cased, stripped of ASCII punctuation and of digits,
    and filtered against the module-level ``stop_words`` collection; the
    remaining words are re-joined with single spaces.

    Args:
        text: The document to clean. Missing values (``None`` or a float
            NaN, e.g. what pandas yields for an empty CSV cell) are treated
            as an empty document; any other non-string value is converted
            with ``str()`` first.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Remove digits (raw string: '\d' is an invalid escape in a plain literal)
    text = re.sub(r'\d', '', text)

    # Remove stopwords. Punctuation is already gone at this point, so any
    # stop_words entry that contains punctuation can never match.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every document in place before splitting.
df['text'] = df['text'].apply(preprocess_text)

# Split the dataset: 80% train / 20% validation.
# A fixed random_state makes the split (and therefore the reported metrics)
# reproducible between runs.
train_text, val_text, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Tokenize the input text
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(list(train_text), truncation=True, padding=True)
val_encodings = tokenizer(list(val_text), truncation=True, padding=True)

# Each example must be a dict keyed by the model's argument names: the
# Trainer's data collators batch mappings, not positional tuples.
# NOTE(review): assumes the labels are already integer class ids — confirm
# against the CSV, or encode them before this point.
train_dataset = [
    {'input_ids': ids, 'attention_mask': mask, 'labels': label}
    for ids, mask, label in zip(
        train_encodings['input_ids'], train_encodings['attention_mask'], train_labels
    )
]
val_dataset = [
    {'input_ids': ids, 'attention_mask': mask, 'labels': label}
    for ids, mask, label in zip(
        val_encodings['input_ids'], val_encodings['attention_mask'], val_labels
    )
]

# Fine-tune the BERT model
# One output unit per distinct value found in the 'label' column.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['label'].unique()))

training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    # load_best_model_at_end needs checkpoints saved on the same schedule
    # as evaluation, otherwise TrainingArguments rejects the configuration.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)


def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        A dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three are macro-averaged over classes.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


# compute_metrics must be handed to the Trainer: it is what produces the
# 'accuracy' value that metric_for_best_model selects checkpoints by.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# Evaluate the fine-tuned model on the validation split
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Predict on a couple of example sentences
test_text = ['The stock market is up today.', 'The president made a speech about the economy.']
# Apply the same cleaning that the training documents went through, so the
# model sees inputs in the form it was trained on.
test_encodings = tokenizer(
    [preprocess_text(text) for text in test_text], truncation=True, padding=True
)
# Examples are dicts keyed by the model's argument names (no labels here);
# the Trainer's data collators cannot batch positional tuples.
test_dataset = [
    {'input_ids': ids, 'attention_mask': mask}
    for ids, mask in zip(test_encodings['input_ids'], test_encodings['attention_mask'])
]
predictions = trainer.predict(test_dataset)
# The highest-scoring class per example is the predicted label id.
predicted_labels = np.argmax(predictions.predictions, axis=-1)

print("Predictions:", predicted_labels)


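# Cell output: ModuleNotFoundError: No module named 'sklearn'
# (the `sklearn` module is provided by the scikit-learn package: `pip install scikit-learn`)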