In [26]:
!pip install -q boto3 s3fs

In [28]:
import pandas as pd
import numpy as np
import boto3
import joblib
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Train a supervised (RandomForest) and an unsupervised (IsolationForest) model
# on the training CSV, then persist the models together with the fitted scaler
# and per-column label encoders so the same preprocessing can be replayed later.

# Load training data (expected in the notebook's working directory) and clean it:
# drop the row identifier, rows with missing values, then exact duplicate rows.
# Chained (non-inplace) so re-running the cell always starts from the raw file.
train_data = (
    pd.read_csv('CIC_UNSW_NB15_training-set.csv')
    .drop(columns=['id'])
    .dropna()
    .drop_duplicates()
)

# Encode every object-typed column as integer codes; keep one fitted encoder
# per column so the identical mapping can be applied at inference time.
label_encoders = {}
categorical_columns = train_data.select_dtypes(include=['object']).columns

for col in categorical_columns:
    le = LabelEncoder()
    # Cast to str first so mixed-type object columns encode consistently.
    train_data[col] = le.fit_transform(train_data[col].astype(str))
    label_encoders[col] = le

# Prepare features and labels.
# NOTE(review): assumes the CSV follows the public UNSW-NB15 layout, where
# `attack_cat` is the ground-truth attack category and therefore determines
# `label`. Using it as a feature leaks the target, so it is excluded here when
# present (no-op if the column does not exist). Its encoder is still kept in
# `label_encoders`. Inference inputs must use the same feature columns - confirm
# against the consuming code.
leakage_columns = [c for c in ['attack_cat'] if c in train_data.columns]
X_train = train_data.drop(columns=['label'] + leakage_columns)
y_train = train_data['label']

# Standardize features; the fitted scaler is saved below for reuse.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train models (fixed seeds for reproducible fits)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)

# IsolationForest is fit on features only (labels are not used).
iso_forest = IsolationForest(random_state=42)
iso_forest.fit(X_train_scaled)

# Save models and encoders to the working directory
joblib.dump(rf, 'rf_model.joblib')
joblib.dump(iso_forest, 'iso_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(label_encoders, 'label_encoders.joblib')

['label_encoders.joblib']

In [30]:
# Upload each saved artifact to S3 under the `models/` prefix.
# NOTE(review): the bucket name looks like a CloudTrail log bucket - confirm
# that model artifacts are meant to live here.
s3 = boto3.client('s3')
bucket = 'aws-cloudtrail-logs-258283632626-e2888416'

# local file -> object key; dict order preserves the original upload sequence
artifact_keys = {
    'rf_model.joblib': 'models/rf_model.joblib',
    'iso_model.joblib': 'models/iso_model.joblib',
    'scaler.joblib': 'models/scaler.joblib',
    'label_encoders.joblib': 'models/label_encoders.joblib',
}
for local_path, object_key in artifact_keys.items():
    s3.upload_file(local_path, bucket, object_key)