In [None]:
!pip install transformers datasets accelerate scikit-learn torch pandas numpy -q


In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

BASE_MODEL = 'distilbert-base-uncased'
MAX_LENGTH = 256
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
TASK = 'regression'
OUTPUT_DIR = "/content/models_foundation"

print(f"Model: {BASE_MODEL}, Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")


In [None]:
ds = load_dataset("benxh/tiktok-hooks-finetune")
df = ds['train'].to_pandas()

df['engagement_rate'] = (df['likes'] + df['comments'] + df['shares']) / (df['views'] + 1)
df['virality_score'] = df['engagement_rate'] * np.log1p(df['views'])
df['text_input'] = df['text_hook'].astype(str) + " [SEP] " + df['caption'].astype(str)
df = df.dropna(subset=['text_input', 'virality_score'])

df['uploaded_at'] = pd.to_datetime(df['uploaded_at'])
df = df.sort_values('uploaded_at')

train_size = int(len(df) * 0.70)
val_size = int(len(df) * 0.15)

train_df = df.iloc[:train_size].copy()
val_df = df.iloc[train_size:train_size+val_size].copy()
test_df = df.iloc[train_size+val_size:].copy()

print(f"Train: {len(train_df):,}, Val: {len(val_df):,}, Test: {len(test_df):,}")


In [None]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text_input'], truncation=True, padding='max_length', max_length=MAX_LENGTH)

train_dataset = Dataset.from_pandas(train_df[['text_input', 'virality_score']])
val_dataset = Dataset.from_pandas(val_df[['text_input', 'virality_score']])
test_dataset = Dataset.from_pandas(test_df[['text_input', 'virality_score']])

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.map(lambda x: {'labels': float(x['virality_score'])}, remove_columns=['virality_score'])
val_dataset = val_dataset.map(lambda x: {'labels': float(x['virality_score'])}, remove_columns=['virality_score'])
test_dataset = test_dataset.map(lambda x: {'labels': float(x['virality_score'])}, remove_columns=['virality_score'])


In [None]:
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1, problem_type="regression")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mae = mean_absolute_error(labels, predictions)
    rmse = np.sqrt(mean_squared_error(labels, predictions))
    r2 = r2_score(labels, predictions)
    return {'mae': mae, 'rmse': rmse, 'r2': r2}


In [None]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{OUTPUT_DIR}/{BASE_MODEL.replace('/', '_')}_{TASK}_{timestamp}"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="mae",
    greater_is_better=False,
    save_total_limit=2,
    seed=42,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()


In [None]:
test_results = trainer.evaluate(test_dataset)
print("Test Results:")
for key, value in test_results.items():
    if 'eval_' in key:
        print(f"  {key}: {value:.4f}")

trainer.save_model()
tokenizer.save_pretrained(output_dir)

metadata = {
    'base_model': BASE_MODEL,
    'task': TASK,
    'test_results': {k: float(v) for k, v in test_results.items()},
    'timestamp': timestamp,
}

with open(f'{output_dir}/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Model saved to {output_dir}")


In [None]:
from google.colab import files
import os

model_dir = "/content/models_foundation"
if os.path.exists(model_dir):
    model_folders = [os.path.join(model_dir, d) for d in os.listdir(model_dir) 
                     if os.path.isdir(os.path.join(model_dir, d))]
    if model_folders:
        latest_model = max(model_folders, key=os.path.getmtime)
        for file in ['pytorch_model.bin', 'config.json', 'tokenizer_config.json', 'vocab.txt', 'metadata.json']:
            file_path = os.path.join(latest_model, file)
            if os.path.exists(file_path):
                files.download(file_path)
