In [None]:
import glob
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
pd.options.mode.chained_assignment = None
from IPython.display import display, HTML

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive so files under /content/drive can be read and written
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Functions for model analysis

# Save model function
def save_model(RF, filename):
    """Serialize a model to disk with pickle.

    Parameters
    ----------
    RF : object
        The object to persist (any picklable object).
    filename : str or path-like
        Destination file; it is created or overwritten.

    Returns
    -------
    None

    Notes
    -----
    The file is opened in a ``with`` block so the handle is flushed and
    closed even if pickling raises, rather than being left for the garbage
    collector to clean up.
    """
    with open(filename, 'wb') as model_file:
        pickle.dump(RF, model_file)

# Feature Importance
"""
Function to Fit model based on optimal values of depth and number of estimators and use it
to compute feature importance for all the features.
"""
def get_feature_importance(depth, n_tree, max_leaf, X_train, y_train):
    """Rank every training feature by random-forest importance.

    A RandomForestClassifier is fitted on all columns of ``X_train`` with
    ``bootstrap=False`` and ``random_state=42``, and its
    ``feature_importances_`` are returned as a sorted table.

    Parameters
    ----------
    depth : value passed as ``max_depth``.
    n_tree : value passed as ``n_estimators``.
    max_leaf : value passed as ``max_leaf_nodes``.
    X_train : training features; its ``columns`` become the result's index.
    y_train : training labels.

    Returns
    -------
    pandas.DataFrame
        One row per feature (indexed by column name) holding a single
        importance column, sorted in descending order of importance.
    """
    forest = RandomForestClassifier(
        n_estimators=n_tree,
        max_depth=depth,
        max_leaf_nodes=max_leaf,
        bootstrap=False,
        random_state=42,
    )
    forest.fit(X_train, y_train)

    # Build the table with the feature names as index in one step.
    ranking = pd.DataFrame(forest.feature_importances_, index=X_train.columns)
    sort_keys = list(ranking.columns)
    return ranking.sort_values(by=sort_keys, axis=0, ascending=False)

"""
Function to build the nested candidate feature sets (top-1, top-2, ..., all features) from the
importance-ranked feature names; the caller scores each set to find the fewest features that suffice
"""
def get_fewest_features(depth, n_tree, max_leaf, importance):
    """Build nested candidate feature sets from a ranked importance table.

    Only ``importance.index`` is used. The k-th returned element is the
    slice holding the first k index entries, so the sets grow from the
    single first-ranked feature up to the full index.

    Parameters
    ----------
    depth, n_tree, max_leaf :
        Accepted for signature compatibility; not used by this function.
    importance :
        Object whose ``index`` lists feature names in ranked order.

    Returns
    -------
    list
        ``len(importance.index)`` index slices of increasing length
        (an empty list when the index is empty).
    """
    ranked_names = importance.index
    return [ranked_names[:k] for k in range(1, len(ranked_names) + 1)]

## Get Scores of model with given parameters
def get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test):
    """Train a random forest on a feature subset and score it on the test split.

    Parameters
    ----------
    classes : class names, passed as ``target_names`` to the report.
    depth : value passed as ``max_depth``.
    n_tree : value passed as ``n_estimators``.
    feats : column selection applied to both ``X_train`` and ``X_test``.
    max_leaf : value passed as ``max_leaf_nodes``.
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.

    Returns
    -------
    tuple
        ``(model, class_report, macro_score, weighted_score, y_pred)`` where
        ``class_report`` is the dict form of ``classification_report`` and
        the two scores are its 'macro avg' and 'weighted avg' F1 values.
    """
    forest = RandomForestClassifier(
        n_estimators=n_tree,
        max_depth=depth,
        max_leaf_nodes=max_leaf,
        bootstrap=False,
        random_state=42,
        n_jobs=4,
    )
    forest.fit(X_train[feats], y_train)
    predictions = forest.predict(X_test[feats])

    report = classification_report(
        y_test, predictions, target_names=classes, output_dict=True
    )
    macro_f1, weighted_f1 = (
        report[average]['f1-score'] for average in ('macro avg', 'weighted avg')
    )

    return forest, report, macro_f1, weighted_f1, predictions

# Get X and Y from Dataset
def get_x_y_flow(Dataset, classes):
    """Split a dataset into the feature matrix and integer-encoded labels.

    Parameters
    ----------
    Dataset : pandas.DataFrame
        Must contain the twelve feature columns selected below and a
        'Label' column.
    classes : sequence
        Class names; each name is encoded as its position in this sequence.

    Returns
    -------
    X : pandas.DataFrame
        The twelve feature columns, in the order listed below.
    y : pandas.Series
        The 'Label' column with every class name replaced by its position
        in ``classes``. Values not present in ``classes`` are left as is.
    """
    feature_cols = ['pkt_len', 'srcport', 'dst_ip',
                    'dstport', 'fin_flag', 'syn_flag',
                    'rst_flag', 'psh_flag', 'ack_flag',
                    'urg_flag', 'ece_flag', 'cwr_flag']
    X = Dataset[feature_cols]

    label_codes = {name: code for code, name in enumerate(classes)}
    # Series.replace historically downcast the resulting object column to
    # integers silently; recent pandas releases deprecate that behaviour.
    # infer_objects() performs the conversion explicitly so y is
    # integer-typed whenever every label was mapped.
    y = Dataset['Label'].replace(label_codes).infer_objects()
    return X, y

# Analyze Models to find best model
def analyze_models(classes, model_type, depths, n_trees, X_train, y_train, X_test, y_test, max_leaf, outfile):
    """Sweep depth / tree-count / feature-subset combinations and log F1 scores.

    A header line and then one ';'-separated row per evaluated combination
    are written to ``outfile`` (opened in write mode). For every
    ``(depth, n_tree)`` pair the features are ranked with
    get_feature_importance, the nested subsets come from
    get_fewest_features, and each subset is scored with get_scores.

    Only ``model_type == 'RF'`` triggers the sweep; for any other value the
    file contains just the header.

    Returns
    -------
    list
        Always an empty list; the results live in ``outfile``.
    """
    header = 'depth;tree;n_feat;macro;weighted;feats'
    with open(outfile, "w") as report:
        print(header, file=report)

        if model_type == 'RF':
            # Lazily enumerate the grid in the same depth-major order.
            grid = ((d, t) for d in depths for t in n_trees)
        else:
            grid = ()

        for depth, n_tree in grid:
            # Rank the features once per (depth, n_tree) pair.
            ranking = get_feature_importance(depth, n_tree, max_leaf, X_train, y_train)
            for feats in get_fewest_features(depth, n_tree, max_leaf, ranking):
                _model, _report, macro_f1, weight_f1, _pred = get_scores(
                    classes, depth, n_tree, feats, max_leaf,
                    X_train, y_train, X_test, y_test,
                )
                row = [depth, n_tree, len(feats), macro_f1, weight_f1, list(feats)]
                print(';'.join(str(cell) for cell in row), file=report)

    print("Analysis Complete. Check output file.")
    return []

# Clean the data (handle NaN / inf values in one uniform pass)
def clean_df(df, name="df"):
    """Drop, in place, every row of ``df`` that contains NaN or +/-inf.

    Infinite values are first converted to NaN, then all rows with any NaN
    are removed. Progress and the number of removed rows are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    name : str
        Label used in the printed messages.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    print(f"🔍 清理 {name} 中的 inf / NaN...")
    rows_before = df.shape[0]
    df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
    df.dropna(inplace=True)
    dropped = rows_before - df.shape[0]
    print(f"✅ {name}: 清掉了 {dropped} 筆異常資料")
    return df

In [None]:
# Class names for the task; get_x_y_flow encodes each name as its position
# in this list (BENIGN -> 0, Syn -> 1).
classes = ['BENIGN', 'Syn']

In [None]:
# Load Train and Test data
train_data = pd.read_csv("/content/drive/MyDrive/my_method/Train.csv")
test_data  = pd.read_csv("/content/drive/MyDrive/my_method/Test.csv")

# Strip hidden characters or surrounding whitespace from the column names
train_data.columns = train_data.columns.str.strip()
test_data.columns  = test_data.columns.str.strip()

# Drop rows containing NaN or +/-inf (clean_df modifies the frame in place)
train_data = clean_df(train_data, "train_data")
test_data  = clean_df(test_data, "test_data")

# Get Variables and Labels
X_train, y_train = get_x_y_flow(train_data, classes)
X_test,  y_test  = get_x_y_flow(test_data, classes)

🔍 清理 train_data 中的 inf / NaN...
✅ train_data: 清掉了 0 筆異常資料
🔍 清理 test_data 中的 inf / NaN...
✅ test_data: 清掉了 0 筆異常資料


In [None]:
# Run model analysis: sweep max depth over 5-10 and tree counts 2, 3 and 5
# with max_leaf_nodes=500. Scores are written to the CSV path given last;
# analyze_models always returns an empty list, so all_results holds no data.
all_results = analyze_models(classes, "RF", [5,6,7,8,9,10], [2,3,5], X_train, y_train, X_test, y_test, 500, "/content/drive/MyDrive/my_method/My_method_2017.csv")

Analysis Complete. Check output file.


In [None]:
# Check model analysis results.
# Read the same file the analyze_models run writes (My_method_2017.csv);
# the previous path (My_method.csv) pointed at a different file, so the
# ranking below could reflect stale results.
results_analysis = pd.read_csv("/content/drive/MyDrive/my_method/My_method_2017.csv", sep=";")

# Best configurations first: order by macro F1, ties broken by weighted F1,
# then renumber the rows so that row 0 is the best configuration.
results_analysis = (
    results_analysis
    .sort_values(by=['macro', 'weighted'], ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Show the first ten rows of the sorted results table
results_analysis.head(10)

Unnamed: 0,depth,tree,n_feat,macro,weighted,feats
0,9,3,6,0.998717,0.999415,"['dst_ip', 'pkt_len', 'dstport', 'srcport', 'a..."
1,8,5,4,0.998607,0.999364,"['dst_ip', 'pkt_len', 'srcport', 'dstport']"
2,7,2,12,0.998566,0.999346,"['dst_ip', 'pkt_len', 'dstport', 'srcport', 's..."
3,10,5,4,0.998549,0.999338,"['dst_ip', 'pkt_len', 'srcport', 'dstport']"
4,10,5,5,0.998538,0.999333,"['dst_ip', 'pkt_len', 'srcport', 'dstport', 'p..."
5,7,5,4,0.998534,0.999332,"['dst_ip', 'pkt_len', 'srcport', 'dstport']"
6,10,2,8,0.998513,0.999321,"['dst_ip', 'pkt_len', 'dstport', 'srcport', 's..."
7,10,3,6,0.998494,0.999313,"['dst_ip', 'pkt_len', 'dstport', 'srcport', 'a..."
8,9,3,12,0.998478,0.999306,"['dst_ip', 'pkt_len', 'dstport', 'srcport', 'a..."
9,9,3,10,0.998468,0.999301,"['dst_ip', 'pkt_len', 'dstport', 'srcport', 'a..."


In [None]:
# Feature list of the top-ranked row (stored in the CSV as a string)
results_analysis.loc[0, 'feats']

"['dst_ip', 'pkt_len', 'dstport', 'srcport', 'ack_flag', 'syn_flag']"

In [None]:
# Features of the best configuration, hard-coded from the string displayed
# by the previous cell; update this list if the analysis results change.
select_feats = ['dst_ip', 'pkt_len', 'dstport', 'srcport', 'ack_flag', 'syn_flag']

In [None]:
# Retrain the best model and get its scores
# Positional arguments 9, 3 and 500 are max depth, number of trees and
# max leaf nodes respectively.
model, class_report, macro_score, weighted_score, y_pred =  get_scores(classes, 9, 3, select_feats, 500, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.metrics import f1_score

# y_test holds the true labels and y_pred the model's predictions
macro_f1    = f1_score(y_test, y_pred, average='macro')     # unweighted mean of the per-class F1 scores
weighted_f1 = f1_score(y_test, y_pred, average='weighted')  # per-class F1 weighted by sample count
per_class_f1 = f1_score(y_test, y_pred, average=None)       # F1 for each class, returned as an ndarray

In [None]:
# Display the fitted estimator returned by get_scores
model

In [None]:
# Save the best model for future use (save_model pickles it to this path)
save_model(model, "/content/drive/MyDrive/my_method/My_method_model_2017.sav")