In [4]:
cmd_80 = 'nprint -P port80.pcap -t -W port80.npt'
cmd_443 = 'nprint -P port443.pcap -t -W port443.npt'
!{cmd_80}
!{cmd_443}

In [6]:
import pandas as pd
# Data analysis project: encrypted traffic classification
#
# This Jupyter Notebook is mainly used for **network traffic encryption detection**:
# it preprocesses the data with `nPrint` and classifies the traffic with
# `RandomForestClassifier`.
#
# **Developer**: Jordan Holland
# **My role**: learning and testing only; I am not one of this project's developers.
#
# ---

# Load the .npt files; the first CSV column is used as the row index.
nprint_80 = pd.read_csv('port80.npt', index_col=0)
nprint_443 = pd.read_csv('port443.npt', index_col=0)

# DataFrame.shape is (rows, columns), reported here as (packets, features per packet).
print('Port 80 nPrint: Number of Packets: {0}, Features per packet: {1}'.format(*nprint_80.shape))
print('Port 443 nPrint: Number of Packets: {0}, Features per packet: {1}'.format(*nprint_443.shape))

Port 80 nPrint: Number of Packets: 2421, Features per packet: 480
Port 443 nPrint: Number of Packets: 2500, Features per packet: 480


In [7]:
# Show the feature (column) names of both nPrint frames, port 80 first.
for npt_frame in (nprint_80, nprint_443):
    print(npt_frame.columns)

Index(['tcp_sprt_0', 'tcp_sprt_1', 'tcp_sprt_2', 'tcp_sprt_3', 'tcp_sprt_4',
       'tcp_sprt_5', 'tcp_sprt_6', 'tcp_sprt_7', 'tcp_sprt_8', 'tcp_sprt_9',
       ...
       'tcp_opt_310', 'tcp_opt_311', 'tcp_opt_312', 'tcp_opt_313',
       'tcp_opt_314', 'tcp_opt_315', 'tcp_opt_316', 'tcp_opt_317',
       'tcp_opt_318', 'tcp_opt_319'],
      dtype='object', length=480)
Index(['tcp_sprt_0', 'tcp_sprt_1', 'tcp_sprt_2', 'tcp_sprt_3', 'tcp_sprt_4',
       'tcp_sprt_5', 'tcp_sprt_6', 'tcp_sprt_7', 'tcp_sprt_8', 'tcp_sprt_9',
       ...
       'tcp_opt_310', 'tcp_opt_311', 'tcp_opt_312', 'tcp_opt_313',
       'tcp_opt_314', 'tcp_opt_315', 'tcp_opt_316', 'tcp_opt_317',
       'tcp_opt_318', 'tcp_opt_319'],
      dtype='object', length=480)


In [8]:
import numpy as np

# Assign a label to every packet (row): port 80 traffic is 'unencrypted',
# port 443 traffic is 'encrypted'.
# DataFrame.to_numpy() yields the same per-row value arrays that iterrows()
# produced, without constructing a pandas Series for every packet.
samples = [*nprint_80.to_numpy(), *nprint_443.to_numpy()]
# Labels follow the same order as the rows above: all port-80 rows first.
labels = ['unencrypted'] * len(nprint_80) + ['encrypted'] * len(nprint_443)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
# Train a random forest classifier
# Split data. The split is seeded so the metrics below are identical on every
# run; the classifier itself is already seeded with random_state=0.
X_train, X_test, y_train, y_test = train_test_split(samples, labels, random_state=0)

# Initialize Classifier
clf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0)

# Train
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Statistics

# First, lets get a stat report about the precision and recall:
report = classification_report(y_test, y_pred)
print(report)

# Let's also get the ROC AUC score while we're here, which requires a probability instead of just the prediction
y_pred_proba = clf.predict_proba(X_test)
# predict_proba returns one column per class, ordered like clf.classes_.
# roc_auc_score expects the score of the class with the greater label
# (clf.classes_[1]), so take column 1 for every test sample.
y_pred_proba_pos = y_pred_proba[:, 1]
roc = roc_auc_score(y_test, y_pred_proba_pos)
print('ROC AUC Score: {0}'.format(roc))

              precision    recall  f1-score   support

   encrypted       1.00      1.00      1.00       637
 unencrypted       1.00      1.00      1.00       594

    accuracy                           1.00      1231
   macro avg       1.00      1.00      1.00      1231
weighted avg       1.00      1.00      1.00      1231

ROC AUC Score: 1.0


In [10]:
# Raw per-feature importances from the fitted forest.
feature_importances = clf.feature_importances_
# Pair each known feature (column) name with its importance, in column order.
named_importances = list(zip(nprint_80.columns, feature_importances))
# Rank the (name, importance) pairs from most to least important.
sorted_feature_importances = sorted(
    named_importances,
    key=lambda pair: pair[1],
    reverse=True,
)
# Print the 20 highest-ranked features (bits), one tuple per line.
print('\n'.join(str(pair) for pair in sorted_feature_importances[:20]))

('tcp_opt_67', np.float64(0.040611837275277604))
('tcp_opt_6', np.float64(0.016950270085418623))
('tcp_doff_1', np.float64(0.015998307812345392))
('tcp_opt_55', np.float64(0.015263724237826463))
('tcp_opt_20', np.float64(0.015201795449317629))
('tcp_opt_40', np.float64(0.014557313555064427))
('tcp_opt_44', np.float64(0.014307996196197876))
('tcp_opt_50', np.float64(0.013509611802183269))
('tcp_opt_48', np.float64(0.013420939671601715))
('tcp_opt_77', np.float64(0.013149013622162403))
('tcp_opt_72', np.float64(0.012900716955766797))
('tcp_opt_29', np.float64(0.012825690244490437))
('tcp_opt_42', np.float64(0.012709578753563008))
('tcp_opt_37', np.float64(0.012645677334665434))
('tcp_opt_49', np.float64(0.012592556726987704))
('tcp_opt_24', np.float64(0.012449211428761143))
('tcp_opt_54', np.float64(0.012448154989741896))
('tcp_opt_9', np.float64(0.012037932621151395))
('tcp_opt_75', np.float64(0.011964617805953542))
('tcp_opt_64', np.float64(0.01173984695798011))


In [11]:
# Generate nPrints
cmd_80 = 'nprint -P port80.pcap -4  -W port80.npt'
cmd_443 = 'nprint -P port443.pcap -4 -W port443.npt'
!{cmd_80}
!{cmd_443}

# Load nPrints
nprint_80 = pd.read_csv('port80.npt', index_col=0)
nprint_443 = pd.read_csv('port443.npt', index_col=0)

# Assoicate with Labels
samples = []
labels = []
for _, row in nprint_80.iterrows():
    samples.append(np.array(row))
    labels.append('unencrypted')

for _, row in nprint_443.iterrows():
    samples.append(np.array(row))
    labels.append('encrypted')
    
# Train and Test the Classifier
# Split data
X_train, X_test, y_train, y_test = train_test_split(samples, labels)
# Initialize Classifier
clf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0)
# Train 
clf.fit(X_train, y_train) 
# Predict
y_pred = clf.predict(X_test)
# Statistics
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

   encrypted       1.00      1.00      1.00       608
 unencrypted       1.00      1.00      1.00       623

    accuracy                           1.00      1231
   macro avg       1.00      1.00      1.00      1231
weighted avg       1.00      1.00      1.00      1231



In [14]:
# Generate nPrints
cmd_80 = 'nprint -P port80.pcap -p 30 -W port80.npt'
cmd_443 = 'nprint -P port443.pcap -p 30 -W port443.npt'
!{cmd_80}
!{cmd_443}

# Load nPrints
nprint_80 = pd.read_csv('port80.npt', index_col=0)
nprint_443 = pd.read_csv('port443.npt', index_col=0)

# Assoicate with Labels
samples = []
labels = []
for _, row in nprint_80.iterrows():
    samples.append(np.array(row))
    labels.append('unencrypted')

for _, row in nprint_443.iterrows():
    samples.append(np.array(row))
    labels.append('encrypted')
    
# Train and Test the Classifier
# Split data
X_train, X_test, y_train, y_test = train_test_split(samples, labels)
# Initialize Classifier
clf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0)
# Train 
clf.fit(X_train, y_train) 
# Predict
y_pred = clf.predict(X_test)
# Statistics
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

   encrypted       0.67      0.68      0.67       629
 unencrypted       0.66      0.65      0.66       602

    accuracy                           0.66      1231
   macro avg       0.66      0.66      0.66      1231
weighted avg       0.66      0.66      0.66      1231



In [15]:
# Load nPrints
# Re-reads whatever port80.npt / port443.npt currently contain; this cell does
# not regenerate them, so it relies on the payload nPrints already being on disk.
nprint_80 = pd.read_csv('port80.npt', index_col=0)
nprint_443 = pd.read_csv('port443.npt', index_col=0)

# Drop packets with no payload: for those, every bit in the row is -1.
# Testing for -1 explicitly (rather than "all values in the row are equal")
# keeps packets whose payload bits merely happen to be identical, e.g. a
# payload made only of zero bytes.
payload_80 = nprint_80[(nprint_80 != -1).any(axis=1)]
payload_443 = nprint_443[(nprint_443 != -1).any(axis=1)]

# Associate with Labels
# Port-80 rows come first, followed by port-443 rows; labels use the same order.
samples = [*payload_80.to_numpy(), *payload_443.to_numpy()]
labels = ['unencrypted'] * len(payload_80) + ['encrypted'] * len(payload_443)

# Train and Test the Classifier
# Split data (seeded, so the report below is reproducible across runs)
X_train, X_test, y_train, y_test = train_test_split(samples, labels, random_state=0)
# Initialize Classifier
clf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0)
# Train
clf.fit(X_train, y_train)
# Predict
y_pred = clf.predict(X_test)
# Statistics
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

   encrypted       0.63      1.00      0.77       404
 unencrypted       1.00      0.24      0.39       320

    accuracy                           0.67       724
   macro avg       0.81      0.62      0.58       724
weighted avg       0.79      0.67      0.60       724

