In [1]:
# Install required packages
!pip install transformers datasets torch scikit-learn pandas numpy -q


In [2]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
import re
import json
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

print("✓ All packages installed successfully!")

✓ All packages installed successfully!


In [3]:
# SYNTHETIC DATA GENERATION
def generate_training_data():
    """
    Generate synthetic cloud platform logs for training.
    In production, you'd collect real logs from various platforms.
    """

    # AWS log patterns
    aws_logs = [
        "[    0.036775] RETBleed: WARNING: Spectre v2 mitigation leaves CPU vulnerable",
        "[    2.804897] fuse: init (API version 7.38)",
        "[    3.693699] ena 0000:00:05.0: Elastic Network Adapter (ENA) v2.14.1g",
        "cloud-init[1535]: Cloud-init v. 22.2.2 running 'init' at Tue, 23 Sep 2025",
        "[    5.971175] cloud-init[1535]: ci-info: |  ens5  | True |        172.31.42.16",
        "Amazon Linux 2023.8.20250915",
        "Kernel 6.1.150-174.273.amzn2023.x86_64 on an x86_64",
        "[    3.825948] ena 0000:00:05.0: Elastic Network Adapter (ENA) found at mem",
        "Datasource DataSourceEc2",
        "ec2-user/.ssh/authorized_keys",
        "ssh-rsa root@ip-172-31-42-16.ap-south-1.compute.internal",
        "AWS EC2 instance initialization complete",
        "ENA driver loaded successfully on EC2",
        "cloud-init finished at ap-south-1.compute.internal",
    ]

    # Azure log patterns
    azure_logs = [
        "WindowsAzureGuestAgent INFO Fabric request",
        "AzureLinuxAgent: INFO Starting Azure Linux Agent",
        "Microsoft.Azure.Monitor: Status Report for extension",
        "waagent: INFO Azure Extension Handler",
        "AzureSecurityPack: Security update applied successfully",
        "AzureDiagnostics: Collecting performance metrics",
        "INFO: Windows Azure Guest Agent Version",
        "Azure VM Agent started successfully",
        "Extension handler: Microsoft.Compute.CustomScriptExtension",
        "AzureMonitorLinuxAgent: Metrics collected",
        "Azure Storage SDK operation completed",
        "AzureFirewall: Traffic allowed from subnet",
        "Azure Backup: Snapshot created successfully",
        "AzureLoadBalancer: Health probe succeeded",
    ]

    # GCP log patterns
    gcp_logs = [
        "GCEMetadataScripts: Starting startup script",
        "google_guest_agent INFO GCE Agent running",
        "google-compute-engine: INFO Starting network setup",
        "GCE: Metadata server connection established",
        "google_osconfig_agent: Patch management initiated",
        "Stackdriver Logging agent started",
        "GCEInstanceSetup: Configuring network interfaces",
        "google-cloud-ops-agent: Telemetry collection active",
        "Cloud Logging API: Log entry written",
        "GCE VM instance startup complete",
        "google-guest-agent: Clock sync completed",
        "Cloud SQL Proxy: Connection established",
        "GKE Node agent: Kubernetes registration successful",
        "Google Cloud Storage FUSE mounted",
    ]

    # Create labeled dataset
    data = {
        'log_text': aws_logs + azure_logs + gcp_logs,
        'label': ['aws'] * len(aws_logs) + ['azure'] * len(azure_logs) + ['gcp'] * len(gcp_logs)
    }

    df = pd.DataFrame(data)

    # Add more variations for better training
    additional_data = []
    for _ in range(50):  # Generate 50 more samples per platform
        # AWS variations
        additional_data.append({
            'log_text': f"[    {np.random.randint(1, 10)}.{np.random.randint(100000, 999999)}] ena 0000:00:0{np.random.randint(1, 9)}.0: Elastic Network Adapter",
            'label': 'aws'
        })
        additional_data.append({
            'log_text': f"cloud-init[{np.random.randint(1000, 9999)}]: Cloud-init running on EC2 instance",
            'label': 'aws'
        })

        # Azure variations
        additional_data.append({
            'log_text': f"AzureLinuxAgent: INFO Agent version {np.random.randint(1, 3)}.{np.random.randint(0, 9)}.{np.random.randint(0, 99)}",
            'label': 'azure'
        })
        additional_data.append({
            'log_text': f"waagent: Extension handler completed with code {np.random.randint(0, 1)}",
            'label': 'azure'
        })

        # GCP variations
        additional_data.append({
            'log_text': f"google_guest_agent INFO Agent running on GCE instance-{np.random.randint(1, 999)}",
            'label': 'gcp'
        })
        additional_data.append({
            'log_text': f"GCEMetadataScripts: Script execution time {np.random.randint(1, 60)}s",
            'label': 'gcp'
        })

    df_additional = pd.DataFrame(additional_data)
    df = pd.concat([df, df_additional], ignore_index=True)

    # Shuffle the dataset
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    return df

# Generate training data
print("Generating training dataset...")
df = generate_training_data()
print(f"✓ Generated {len(df)} training samples")
print(f"\nClass distribution:\n{df['label'].value_counts()}")
print(f"\nSample logs:")
print(df.head())

Generating training dataset...
✓ Generated 342 training samples

Class distribution:
label
azure    114
gcp      114
aws      114
Name: count, dtype: int64

Sample logs:
                                            log_text  label
0   waagent: Extension handler completed with code 0  azure
1         AzureLinuxAgent: INFO Agent version 1.3.36  azure
2      GCEMetadataScripts: Script execution time 44s    gcp
3  [    8.501333] ena 0000:00:08.0: Elastic Netwo...    aws
4  [    5.964390] ena 0000:00:01.0: Elastic Netwo...    aws


In [4]:
# DATA PREPROCESSING
label_map = {'aws': 0, 'azure': 1, 'gcp': 2}
id2label = {v: k for k, v in label_map.items()}
df['label_id'] = df['label'].map(label_map)

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
print(f"\n✓ Train set: {len(train_df)} samples")
print(f"✓ Test set: {len(test_df)} samples")

# Initialize tokenizer (CodeBERT uses RoBERTa tokenizer)
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base-mlm')

def preprocess_function(examples):
    """Tokenize the log texts"""
    return tokenizer(
        examples['log_text'],
        truncation=True,
        padding='max_length',
        max_length=128
    )

# Convert to HuggingFace datasets
train_dataset = Dataset.from_pandas(train_df[['log_text', 'label_id']])
test_dataset = Dataset.from_pandas(test_df[['log_text', 'label_id']])

# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Rename label column
train_dataset = train_dataset.rename_column('label_id', 'labels')
test_dataset = test_dataset.rename_column('label_id', 'labels')

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

print("✓ Data preprocessing complete!")


✓ Train set: 273 samples
✓ Test set: 69 samples


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

Map:   0%|          | 0/273 [00:00<?, ? examples/s]

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

✓ Data preprocessing complete!


In [6]:
model = RobertaForSequenceClassification.from_pretrained(
    'microsoft/codebert-base-mlm',
    num_labels=3,
    id2label=id2label,
    label2id=label_map
)

print("✓ Model loaded successfully!")

# Define training arguments
training_args = TrainingArguments(
    output_dir='./cloud-log-classifier',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',  # Changed from evaluation_strategy
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    push_to_hub=False,
    report_to='none'
)

# Define metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Train the model
print("\n🚀 Starting training...")
trainer.train()

# Evaluate
print("\n📊 Evaluating model...")
results = trainer.evaluate()
print(f"✓ Test Accuracy: {results['eval_accuracy']:.4f}")

# Get detailed predictions for test set
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

print("\n📈 Classification Report:")
print(classification_report(
    true_labels,
    pred_labels,
    target_names=['AWS', 'Azure', 'GCP']
))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base-mlm and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded successfully!

🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0943,1.069668,0.333333
2,1.0124,0.73674,0.971014
3,0.333,0.112388,0.956522
4,0.0388,0.111455,0.971014
5,0.0056,0.05552,0.985507



📊 Evaluating model...


✓ Test Accuracy: 0.9855

📈 Classification Report:
              precision    recall  f1-score   support

         AWS       0.96      1.00      0.98        23
       Azure       1.00      0.96      0.98        23
         GCP       1.00      1.00      1.00        23

    accuracy                           0.99        69
   macro avg       0.99      0.99      0.99        69
weighted avg       0.99      0.99      0.99        69



In [7]:
# Save model and tokenizer
model_save_path = './cloud-log-classifier-final'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Save label mapping
with open(f'{model_save_path}/label_mapping.json', 'w') as f:
    json.dump({'label_map': label_map, 'id2label': id2label}, f)

print(f"\n✅ Model saved to {model_save_path}")

# Create a zip file for download
!zip -r cloud-log-classifier-final.zip {model_save_path}
print("✓ Model packaged as cloud-log-classifier-final.zip")

# Download the model (optional)
print("\n📥 Download your model:")
files.download('cloud-log-classifier-final.zip')


✅ Model saved to ./cloud-log-classifier-final
  adding: cloud-log-classifier-final/ (stored 0%)
  adding: cloud-log-classifier-final/special_tokens_map.json (deflated 84%)
  adding: cloud-log-classifier-final/tokenizer_config.json (deflated 76%)
  adding: cloud-log-classifier-final/label_mapping.json (deflated 29%)
  adding: cloud-log-classifier-final/config.json (deflated 52%)
  adding: cloud-log-classifier-final/merges.txt (deflated 53%)
  adding: cloud-log-classifier-final/vocab.json (deflated 68%)
  adding: cloud-log-classifier-final/model.safetensors (deflated 7%)
✓ Model packaged as cloud-log-classifier-final.zip

📥 Download your model:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
#INFERENCE EXAMPLE (USE IN OTHER PROJECTS)
print("\n" + "="*70)
print("INFERENCE EXAMPLE - Use this code in other projects")
print("="*70)

class CloudLogClassifier:
    """
    Reusable classifier for cloud platform detection from logs.

    Usage in other projects:
    1. Load the saved model
    2. Create an instance of this class
    3. Call predict() method with log text
    """

    def __init__(self, model_path):
        """Load the fine-tuned model and tokenizer"""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = RobertaForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
        self.model.to(self.device)
        self.model.eval()

        # Load label mapping
        with open(f'{model_path}/label_mapping.json', 'r') as f:
            mappings = json.load(f)
            self.id2label = {int(k): v for k, v in mappings['id2label'].items()}

    def predict(self, log_text):
        """
        Predict cloud platform from log text

        Args:
            log_text (str): Log text to classify

        Returns:
            dict: Prediction results with label and confidence
        """
        # Tokenize input
        inputs = self.tokenizer(
            log_text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=128
        ).to(self.device)

        # Get prediction
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_class = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0][predicted_class].item()

        return {
            'platform': self.id2label[predicted_class],
            'confidence': confidence,
            'all_probabilities': {
                self.id2label[i]: prob.item()
                for i, prob in enumerate(probabilities[0])
            }
        }

    def batch_predict(self, log_texts):
        """Predict multiple log texts at once"""
        return [self.predict(text) for text in log_texts]

# Test the classifier
print("\n🧪 Testing the trained classifier...")
classifier = CloudLogClassifier(model_save_path)

# Test with sample logs
test_logs = [
    "[    3.693699] ena 0000:00:05.0: Elastic Network Adapter (ENA) v2.14.1g",
    "AzureLinuxAgent: INFO Starting Azure Linux Agent",
    "google_guest_agent INFO GCE Agent running",
    "cloud-init[1535]: Cloud-init running on EC2",
    "waagent: Extension handler completed",
    "GCEMetadataScripts: Starting startup script"
]

print("\n📝 Sample Predictions:")
print("-" * 70)
for log in test_logs:
    result = classifier.predict(log)
    print(f"\nLog: {log[:60]}...")
    print(f"Predicted Platform: {result['platform'].upper()}")
    print(f"Confidence: {result['confidence']:.2%}")
    print(f"All probabilities: {result['all_probabilities']}")



INFERENCE EXAMPLE - Use this code in other projects

🧪 Testing the trained classifier...

📝 Sample Predictions:
----------------------------------------------------------------------

Log: [    3.693699] ena 0000:00:05.0: Elastic Network Adapter (EN...
Predicted Platform: AWS
Confidence: 99.90%
All probabilities: {'aws': 0.9990465044975281, 'azure': 0.0004860204644501209, 'gcp': 0.00046746485168114305}

Log: AzureLinuxAgent: INFO Starting Azure Linux Agent...
Predicted Platform: AZURE
Confidence: 99.84%
All probabilities: {'aws': 0.0007215996156446636, 'azure': 0.9983630776405334, 'gcp': 0.0009153559221886098}

Log: google_guest_agent INFO GCE Agent running...
Predicted Platform: GCP
Confidence: 99.88%
All probabilities: {'aws': 0.0005009484011679888, 'azure': 0.0007464088266715407, 'gcp': 0.9987525939941406}

Log: cloud-init[1535]: Cloud-init running on EC2...
Predicted Platform: AWS
Confidence: 99.90%
All probabilities: {'aws': 0.9990286827087402, 'azure': 0.0004928301204927266, 'gc