In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

In [2]:
# Read the three CSV splits in one pass; the unpacking order matches the
# order of the paths listed below (train, test, validation).
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/dataset/train.csv',
        '/kaggle/input/dataset/test.csv',
        '/kaggle/input/dataset/validation.csv',
    )
)

In [3]:
import re
from nltk.corpus import stopwords
import spacy

# Name of the spaCy pipeline used for lemmatization in preprocess_text.
spacy_model_name = 'en_core_web_sm'

# Download the pipeline only when it is not already installed as a package.
if not spacy.util.is_package(spacy_model_name):
    spacy.cli.download(spacy_model_name)

nlp = spacy.load(spacy_model_name)


_STOP_WORDS_CACHE = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """ Clean a piece of text from noise and standardize it across the different classes.

    The steps are applied in this order: lowercase, replace newlines with
    spaces, strip URLs, replace punctuation and digits with spaces, drop
    English stop words, lemmatize with the module-level spaCy pipeline ``nlp``.

    Arguments
    ---------
    text : String
        Text to clean. ``None`` and float NaN (what pandas produces for empty
        CSV cells) are treated as empty text; any other non-string value is
        converted with ``str()`` first.
    Returns
    -------
    text : String
        Cleaned text; tokens are separated by single spaces.
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on .lower(); NaN is the only
        # float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Cached: rebuilding the set for every row is needless repeated work.
    stop_words = _english_stop_words()

    text = text.lower() # Convert text to lowercase
    text = text.replace('\n', ' ') # Replace newline characters with spaces
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'[^\w\s]', ' ', text) # Replace punctuation and special characters with spaces
    text = re.sub(r'\d+', ' ', text) # Replace digit runs with spaces
    # split() also collapses the repeated whitespace left by the substitutions above
    text = ' '.join(word for word in text.split() if word not in stop_words) # Remove stopwords
    text = ' '.join(token.lemma_ for token in nlp(text)) # Lemmatize text

    return text

In [4]:
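# Optional clean-up step, currently disabled: with these lines commented out,
# the tokenizer further down receives the raw, unprocessed `text` column.
# Uncomment to apply preprocess_text to every split.
# train_df['text'] = train_df['text'].apply(lambda x: preprocess_text(x))
# val_df['text'] = val_df['text'].apply(lambda x: preprocess_text(x))
# test_df['text'] = test_df['text'].apply(lambda x: preprocess_text(x))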

In [5]:
# Binary target: 1 when `account.type` is 'human', 0 for every other value
# (missing values compare unequal and therefore also map to 0).
# The comparison is vectorized instead of a per-row Python lambda, and a single
# loop keeps the three splits encoded identically.
for split_df in (train_df, test_df, val_df):
    split_df['label'] = split_df['account.type'].eq('human').astype('int64')

In [6]:
# Select model checkpoint. The same identifier is used to load both the
# tokenizer (here) and the classification model (in a later cell), which keeps
# the vocabulary and the weights consistent.
model_checkpoint = "albert-base-v2"  # ALBERT base, v2 checkpoint

# Tokenizer files are fetched for the checkpoint on first use.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    rows end up with the same length.

    Arguments
    ---------
    examples : mapping
        Batch that exposes the raw strings under the ``"text"`` key.
    Returns
    -------
    The tokenizer output for the batch.
    """
    batch_texts = examples["text"]
    return tokenizer(
        batch_texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

def _as_torch_dataset(frame):
    """Turn one pandas split into a tokenized Dataset that yields torch tensors."""
    # Convert the DataFrame to a Hugging Face Dataset, then tokenize it in batches
    tokenized = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    # Restrict the returned columns to those listed here, formatted as torch tensors
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    return tokenized


train_dataset = _as_torch_dataset(train_df)
test_dataset = _as_torch_dataset(test_df)
val_dataset = _as_torch_dataset(val_df)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]



Map:   0%|          | 0/20712 [00:00<?, ? examples/s]

Map:   0%|          | 0/2558 [00:00<?, ? examples/s]

Map:   0%|          | 0/2302 [00:00<?, ? examples/s]

In [7]:
# Load the pretrained checkpoint with a sequence-classification head sized for
# two classes (num_labels=2, matching the 0/1 `label` column).
# The classification head is not stored in the checkpoint: the library warns
# that classifier.weight / classifier.bias are newly initialized, so the model
# has to be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    # --- Output locations and logging ---
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # --- Optimisation ---
    # NOTE(review): in the logged run the validation loss is lowest at epoch 2
    # and keeps rising afterwards while accuracy stays near 0.89; 10 epochs
    # looks like more than needed - consider fewer epochs or early stopping.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- Evaluation and checkpoint selection ---
    # Evaluate and save once per epoch, then reload the best checkpoint at the
    # end; "accuracy" is the key returned by compute_metrics.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


In [9]:
def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation output.

    Arguments
    ---------
    eval_pred : pair
        Unpacks into ``(logits, labels)``; the predicted class is the argmax
        of the logits over the last axis.
    Returns
    -------
    dict
        ``{"accuracy": <fraction of predictions equal to the labels>}``
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return {"accuracy": accuracy_score(gold_labels, predicted_classes)}


In [10]:
# Wire the model, the data splits and the metric function into one Trainer.
# The validation split is used for the per-epoch evaluation; the test split is
# kept aside for the final report.
trainer = Trainer(
    args=training_args,
    model=model,
    # NOTE(review): recent transformers releases deprecate `tokenizer=` in
    # favour of `processing_class=` - confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [11]:
# Fine-tune the model. With eval_strategy / save_strategy set to "epoch" in
# training_args, the validation metrics are computed and a checkpoint is
# written after every epoch.
# NOTE(review): in the logged run the validation loss bottoms out at epoch 2
# and then climbs from 0.26 to 0.91 while accuracy stays around 0.89, which
# suggests overfitting in the later epochs.
trainer.train()


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2776,0.295595,0.872285
2,0.1957,0.258517,0.883579
3,0.2028,0.283161,0.888792
4,0.2035,0.448582,0.887924
5,0.032,0.565613,0.887924
6,0.0489,0.707558,0.890096
7,0.0157,0.808408,0.890964
8,0.0001,0.904966,0.888358
9,0.0,0.908503,0.889661
10,0.0,0.911047,0.890964


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=6480, training_loss=0.09105615331578452, metrics={'train_runtime': 2634.2706, 'train_samples_per_second': 78.625, 'train_steps_per_second': 2.46, 'total_flos': 1237443709132800.0, 'train_loss': 0.09105615331578452, 'epoch': 10.0})

In [12]:
# Score the held-out test split with a single inference pass.
# `predict` returns the logits, the gold labels and the metrics together, so
# a separate `evaluate` call (a second full pass over the same data) is not
# needed. metric_key_prefix="eval" keeps the eval_* metric names that
# `evaluate` would have produced.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
results = predictions.metrics
print(results)

# Generate classification report
y_preds = np.argmax(predictions.predictions, axis=-1)
# Take the labels from the prediction output itself so they are guaranteed to
# be aligned with the rows that were actually scored.
y_true = predictions.label_ids
print(classification_report(y_true, y_preds))


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.8174412250518799, 'eval_accuracy': 0.8854573885848319, 'eval_runtime': 12.1583, 'eval_samples_per_second': 210.392, 'eval_steps_per_second': 6.58, 'epoch': 10.0}


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


              precision    recall  f1-score   support

           0       0.89      0.88      0.89      1280
           1       0.88      0.89      0.89      1278

    accuracy                           0.89      2558
   macro avg       0.89      0.89      0.89      2558
weighted avg       0.89      0.89      0.89      2558



In [13]:
# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can later be reloaded with from_pretrained.
# NOTE(review): with load_best_model_at_end=True the Trainer is expected to
# have restored the best checkpoint into `model` - confirm before publishing.
saved_model_dir = "./saved_model"
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/spiece.model',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')