## Import Libraries

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)


## Load Data

In [None]:

# Root folder of the input CSVs. Kept in one place so the notebook can be
# pointed at another location (e.g. a local copy) by editing a single line.
DATA_DIR = Path('/kaggle/input/dataset')

# One DataFrame per split; each CSV is read as-is with pandas defaults.
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')
val_df = pd.read_csv(DATA_DIR / 'validation.csv')

## Label Target Data

Encode `account.type` as a binary target: `human` → 1, every other value → 0.

In [None]:
def _human_label(frame):
    """Build the binary target column for one split.

    Args:
        frame: DataFrame that has an ``account.type`` column.

    Returns:
        An int64 Series aligned with ``frame``: 1 where ``account.type`` equals
        ``'human'``, 0 for every other value (missing values included).
    """
    # Vectorized comparison instead of a per-row Python lambda; the explicit
    # int64 cast keeps the dtype independent of the platform default int.
    return frame['account.type'].eq('human').astype('int64')


# Same encoding applied to every split so train/validation/test stay consistent.
train_df['label'] = _human_label(train_df)
test_df['label'] = _human_label(test_df)
val_df['label'] = _human_label(val_df)

## Tokenize Data

In [None]:
# Checkpoint to fine-tune. To try ALBERT instead, set this to 'albert-base-v2'.
model_checkpoint = "distilbert-base-uncased"

# The tokenizer is loaded from the same checkpoint name that is later used to
# load the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry (this is what ``Dataset.map``
            passes in when called with ``batched=True``).

    Returns:
        Whatever the tokenizer returns for those texts, with every sequence
        truncated and padded to exactly 128 tokens.
    """
    texts = examples["text"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(texts, **encoding_options)

def _to_torch_dataset(frame):
    """Turn one pandas split into a tokenized, torch-formatted Hugging Face Dataset.

    Args:
        frame: DataFrame for a single split; it is tokenized through
            ``tokenize_function``, which reads its ``text`` field.

    Returns:
        A ``Dataset`` whose exposed columns are ``input_ids``,
        ``attention_mask`` and ``label``, returned as torch tensors.
    """
    # pandas -> Hugging Face Dataset, then tokenize in batches.
    dataset = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    # set_format works in place: only the listed columns are exposed, as torch tensors.
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    return dataset


train_dataset = _to_torch_dataset(train_df)
test_dataset = _to_torch_dataset(test_df)
val_dataset = _to_torch_dataset(val_df)




Map:   0%|          | 0/20712 [00:00<?, ? examples/s]

Map:   0%|          | 0/2558 [00:00<?, ? examples/s]

Map:   0%|          | 0/2302 [00:00<?, ? examples/s]

## Define Model

In [None]:
# Load the checkpoint with a 2-class sequence-classification head.
# The library warns on load that the classifier weights are newly initialized,
# which is expected because they are trained below.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define Training Arguments

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Without a limit, one full checkpoint per epoch piles up in output_dir.
    # With load_best_model_at_end the best checkpoint is always retained in
    # addition to the most recent ones, so capping the count is safe.
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # After training, reload the checkpoint with the highest validation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,  # accuracy: higher is better (spelled out for readers)
    report_to="none",  # no external experiment trackers
)


## Define Metrics

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass by plain accuracy.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{"accuracy": value}``, where the predicted class is the argmax
        of the logits over the last axis.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return {"accuracy": accuracy_score(gold_labels, predicted_classes)}


## Initialize Trainer

In [None]:
# Number of consecutive evaluations without an accuracy improvement that are
# tolerated before training stops.
EARLY_STOPPING_PATIENCE = 3

# Wire model, data, metrics and hyper-parameters together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # validation split drives per-epoch evaluation
    # NOTE(review): recent transformers releases deprecate `tokenizer=` in favor
    # of `processing_class=` — confirm against the installed version before changing.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # The recorded run shows validation loss climbing from the first epoch on
    # while accuracy peaks early, so later epochs only add compute. Early
    # stopping tracks `metric_for_best_model` from training_args and relies on
    # `load_best_model_at_end=True`, which is already set there.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)],
)


## Train the Model

In [None]:
# Run fine-tuning with the configuration held by `trainer`. This is the cell's
# last expression, so the returned TrainOutput (global step, training loss and
# runtime metrics) is displayed as the cell output.
trainer.train()


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3135,0.271554,0.87576
2,0.1318,0.314657,0.882276
3,0.154,0.441174,0.87576
4,0.1212,0.58918,0.887055
5,0.0018,0.737664,0.885317
6,0.034,0.811598,0.878801
7,0.0058,0.829777,0.878367
8,0.0008,0.899255,0.879235
9,0.0053,0.945939,0.878367
10,0.0005,0.939164,0.880104


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=6480, training_loss=0.05699822525555332, metrics={'train_runtime': 1610.9592, 'train_samples_per_second': 128.569, 'train_steps_per_second': 4.022, 'total_flos': 6859161902407680.0, 'train_loss': 0.05699822525555332, 'epoch': 10.0})

## Evaluate the Model

In [None]:
# A single inference pass over the test split gives the logits, the labels and
# the aggregate metrics, so the test set is not run through the model twice.
# `metric_key_prefix="eval"` keeps the familiar `eval_*` metric names.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
results = predictions.metrics
print(results)

# Generate classification report
y_preds = np.argmax(predictions.predictions, axis=-1)
# Use the labels returned with the predictions so y_true is guaranteed to be
# aligned with y_preds, independent of the DataFrame's row order.
y_true = predictions.label_ids
print(classification_report(y_true, y_preds))


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.6083710193634033, 'eval_accuracy': 0.8788115715402658, 'eval_runtime': 6.444, 'eval_samples_per_second': 396.958, 'eval_steps_per_second': 12.415, 'epoch': 10.0}


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1280
           1       0.90      0.85      0.88      1278

    accuracy                           0.88      2558
   macro avg       0.88      0.88      0.88      2558
weighted avg       0.88      0.88      0.88      2558



## Save the Model

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same folder.
save_dir = "./saved_model"
model.save_pretrained(save_dir)
# Kept as the last expression so the cell displays the written tokenizer files.
tokenizer.save_pretrained(save_dir)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')