<a href="https://colab.research.google.com/github/xbeat/CPF/blob/main/AI/CPF3_SML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import random

vulnerability_templates = {
    "1.1": {"patterns": ["CEO requests: {action} now."], "actions": ["transfer funds", "share credentials"]},
    "2.1": {"patterns": ["URGENT: {action} in 1hr."], "actions": ["approve transfer", "reset password"]},
    "3.1": {"patterns": ["I helped you, please {action}."], "actions": ["share file", "approve request"]}
}

def generate_synthetic_data(num_samples=1000):
    samples = []
    for _ in range(num_samples):
        indicator = random.choice(list(vulnerability_templates.keys()))
        template = random.choice(vulnerability_templates[indicator]["patterns"])
        action = random.choice(vulnerability_templates[indicator]["actions"])
        text = template.format(action=action)
        severity = random.choice(["green", "yellow", "red"])
        samples.append({"text": text, "label": indicator, "severity": severity})
    with open("/content/drive/MyDrive/synthetic_data.json", "w") as f:
        json.dump(samples, f, indent=2)
    return samples

# Run in Colab
generate_synthetic_data()

[{'text': 'I helped you, please share file.',
  'label': '3.1',
  'severity': 'red'},
 {'text': 'URGENT: approve transfer in 1hr.',
  'label': '2.1',
  'severity': 'green'},
 {'text': 'I helped you, please share file.',
  'label': '3.1',
  'severity': 'red'},
 {'text': 'URGENT: approve transfer in 1hr.',
  'label': '2.1',
  'severity': 'green'},
 {'text': 'CEO requests: transfer funds now.',
  'label': '1.1',
  'severity': 'red'},
 {'text': 'CEO requests: transfer funds now.',
  'label': '1.1',
  'severity': 'yellow'},
 {'text': 'I helped you, please share file.',
  'label': '3.1',
  'severity': 'green'},
 {'text': 'I helped you, please approve request.',
  'label': '3.1',
  'severity': 'yellow'},
 {'text': 'CEO requests: share credentials now.',
  'label': '1.1',
  'severity': 'green'},
 {'text': 'CEO requests: transfer funds now.',
  'label': '1.1',
  'severity': 'yellow'},
 {'text': 'URGENT: approve transfer in 1hr.',
  'label': '2.1',
  'severity': 'red'},
 {'text': 'URGENT: reset 

In [None]:
!ls /content/drive/MyDrive/synthetic_data.json

/content/drive/MyDrive/synthetic_data.json


In [None]:
!pip install transformers datasets torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load data
dataset = load_dataset("json", data_files="/content/drive/MyDrive/synthetic_data.json", split="train")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # era microsoft/phi-3-mini-4k-instruct

# Preprocessing
def preprocess(examples):
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    labels = {"green": 0, "yellow": 1, "red": 2}
    tokenized["label"] = [labels[sev] for sev in examples["severity"]]
    return tokenized

dataset = dataset.map(preprocess, batched=True)
train_dataset, eval_dataset = dataset.train_test_split(test_size=0.2).values()

# Model - stesso modello del tokenizer
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Training
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Aumentato
    learning_rate=2e-5,  # Learning rate ottimale
    warmup_steps=100,  # Warmup
    weight_decay=0.01,  # Regularizzazione
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none"
)

trainer = Trainer(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset)
trainer.train()

# Save to Hugging Face -
trainer.push_to_hub("CPF3-org/cpf-poc-model")



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.097679


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...nts.1757274246.53a683385944.13301.1: 100%|##########| 5.00kB / 5.00kB            

  ...nts.1757273539.53a683385944.13301.0: 100%|##########| 5.00kB / 5.00kB            

  ...nts.1757274385.53a683385944.13301.2: 100%|##########| 5.00kB / 5.00kB            

  /content/results/model.safetensors    :  12%|#2        | 32.8MB /  268MB            

  /content/results/training_args.bin    :   9%|9         |   544B / 5.78kB            

CommitInfo(commit_url='https://huggingface.co/CPF3/results/commit/f211de9bc50fee106642fb85f51cce5a7e9124eb', commit_message='CPF3-org/cpf-poc-model', commit_description='', oid='f211de9bc50fee106642fb85f51cce5a7e9124eb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CPF3/results', endpoint='https://huggingface.co', repo_type='model', repo_id='CPF3/results'), pr_revision=None, pr_num=None)

In [None]:
# FINE-TUNING
!pip install transformers datasets torch huggingface_hub

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from huggingface_hub import HfApi

# Load data
dataset = load_dataset("json", data_files="/content/drive/MyDrive/synthetic_data.json", split="train")

# Tokenizer e preprocessing
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess(examples):
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    labels = {"green": 0, "yellow": 1, "red": 2}
    tokenized["label"] = [labels[sev] for sev in examples["severity"]]
    return tokenized

dataset = dataset.map(preprocess, batched=True)
train_dataset, eval_dataset = dataset.train_test_split(test_size=0.2).values()

# Model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Training
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Aumentato
    learning_rate=2e-5,  # Learning rate ottimale
    warmup_steps=100,  # Warmup
    weight_decay=0.01,  # Regularizzazione
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none"
)

trainer = Trainer(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset)
trainer.train()

# UPLOAD CORRETTO (sostituisce trainer.push_to_hub)
trainer.save_model("./cpf-model-final")
tokenizer.save_pretrained("./cpf-model-final")

api = HfApi()
api.upload_folder(
    folder_path="./cpf-model-final",
    repo_id="CPF3-org/cpf-poc-model",
    repo_type="model"
)

print("✅ Modello caricato in CPF3-org/cpf-poc-model")



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.100338
2,No log,1.09795
3,No log,1.099702


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...t/cpf-model-final/model.safetensors:   0%|          |  575kB /  268MB            

  ...t/cpf-model-final/training_args.bin:   1%|          |  47.0B / 5.78kB            

✅ Modello caricato in CPF3-org/cpf-poc-model


In [None]:
# In Colab, verifica distribuzione:
import json
with open("/content/drive/MyDrive/synthetic_data.json", "r") as f:
    data = json.load(f)

severity_count = {}
for item in data:
    sev = item["severity"]
    severity_count[sev] = severity_count.get(sev, 0) + 1

print(severity_count)

{'red': 342, 'green': 357, 'yellow': 301}


In [None]:
# Test diretto del modello
from transformers import pipeline
model = pipeline("text-classification", model="CPF3-org/cpf-poc-model")

tests = [
    "CEO requests: transfer funds now.",
    "URGENT: approve transfer in 1hr.",
    "Normal meeting tomorrow."
]

for text in tests:
    result = model(text)
    print(f"'{text}' -> {result}")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cuda:0


'CEO requests: transfer funds now.' -> [{'label': 'LABEL_2', 'score': 0.35572609305381775}]
'URGENT: approve transfer in 1hr.' -> [{'label': 'LABEL_0', 'score': 0.3749513328075409}]
'Normal meeting tomorrow.' -> [{'label': 'LABEL_0', 'score': 0.36182692646980286}]
