In [3]:
# IMDB数据分类
from keras.datasets import imdb
from keras_preprocessing import sequence

from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop


max_features = 10000
max_len = 500

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)


model = Sequential()
model.add(layers.Embedding(max_features, 128, input_length=max_len))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(1))
model.summary()


model.compile(optimizer=RMSprop(lr=1e-4),loss='binary_crossentropy',metrics=['acc'])
history = model.fit(x_train, y_train,epochs=40,batch_size=128, validation_split=0.2)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 500)
x_test shape: (25000, 500)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 128)          1280000   
                                                                 
 conv1d_2 (Conv1D)           (None, 494, 32)           28704     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 98, 32)           0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 92, 32)            7200      
                                                                 
 global_max_pooling1d_1 (Glo  (None, 32)               0         
 balMaxPooling1D)                        

In [1]:
# 基于 1D 卷积的序列分类

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

seq_length = 64

model = Sequential()
model.add(Conv1D(64, 3, activation='relu', input_shape=(seq_length, 104)))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid')) # dense 全连接层

model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=16, epochs=10)
score = model.evaluate(x_test, y_test, batch_size=16)


NameError: name 'x_train' is not defined

In [1]:
# 1 导入包
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import normalize
from keras.layers import LSTM
from keras.layers import Embedding, SimpleRNN


In [2]:
# 2 读取数据集

# dataframe = pd.read_csv("total.csv",header=None).values # 取消第一行作为表头
train_set = pd.read_csv("train_set.csv")
test_set = pd.read_csv("test_set.csv")
print(train_set)

# 3 读标签
# label = dataframe.loc[:,['label']]
y_train = train_set['label']
test_label = test_set['label']

print(y_train)
print("y_train.shape",y_train)



        label  frame.len  ip.hdr_len  ip.dsfield  ip.len  ip.flags.rb  \
0           0         66          20           0      52            0   
1           0         66          20           0      52            0   
2           0         66          20           0      52            0   
3           0         66          20           0      52            0   
4           0         66          20           0      52            0   
...       ...        ...         ...         ...     ...          ...   
837180     22         74          20           0      60            0   
837181     22         74          20           0      60            0   
837182     22         74          20          40      60            0   
837183     22         66          20           0      52            0   
837184     22         74          20           0      60            0   

        ip.flags.df  ip.flags.mf  ip.frag_offset  ip.ttl  ...  \
0                 0            0               0      63  

In [3]:
print(y_train)

0          0
1          0
2          0
3          0
4          0
          ..
837180    22
837181    22
837182    22
837183    22
837184    22
Name: label, Length: 837185, dtype: int64


In [8]:

x_train_set_GA = train_set[['frame.len', 'ip.len', 'ip.flags.df', 'ip.flags.mf', 'ip.ttl',
       'ip.proto', 'tcp.hdr_len', 'tcp.flags.ns', 'tcp.flags.ecn',
       'tcp.flags.urg', 'tcp.flags.ack', 'tcp.flags.push', 'tcp.flags.reset',
       'tcp.flags.syn', 'tcp.flags.fin', 'tcp.window_size_value',
       'tcp.urgent_pointer', 'tcp.options.wscale.shift', 'tcp.options.mss_val',
       'tcp.options.timestamp.tsval', 'tcp.options.timestamp.tsecr',
       'udp.length', 'udp.checksum', 'tcp.option_kind',
       '_ws.col.Protocol_AJP13', '_ws.col.Protocol_BGP',
       '_ws.col.Protocol_BZR', '_ws.col.Protocol_DRDA', '_ws.col.Protocol_DSI',
       '_ws.col.Protocol_FTP', '_ws.col.Protocol_GIOP',
       '_ws.col.Protocol_GTPv2', '_ws.col.Protocol_Gopher',
       '_ws.col.Protocol_Gryphon', '_ws.col.Protocol_HTTP',
       '_ws.col.Protocol_HTTP/JSON', '_ws.col.Protocol_HTTP/XML',
       '_ws.col.Protocol_ICAP', '_ws.col.Protocol_ICMP',
       '_ws.col.Protocol_Kafka', '_ws.col.Protocol_LDAP',
       '_ws.col.Protocol_MySQL', '_ws.col.Protocol_NBNS',
       '_ws.col.Protocol_NDMP', '_ws.col.Protocol_NNTP',
       '_ws.col.Protocol_POP', '_ws.col.Protocol_Portmap',
       '_ws.col.Protocol_R3', '_ws.col.Protocol_RPC', '_ws.col.Protocol_RSYNC',
       '_ws.col.Protocol_RTMP', '_ws.col.Protocol_RTSP',
       '_ws.col.Protocol_SABP', '_ws.col.Protocol_SMTP',
       '_ws.col.Protocol_SSH', '_ws.col.Protocol_SSLv2',
       '_ws.col.Protocol_SSLv3', '_ws.col.Protocol_TCP',
       '_ws.col.Protocol_TCPCL', '_ws.col.Protocol_TDS',
       '_ws.col.Protocol_TELNET', '_ws.col.Protocol_TLSv1',
       '_ws.col.Protocol_TLSv1.1', '_ws.col.Protocol_TLSv1.2',
       '_ws.col.Protocol_TN3270', '_ws.col.Protocol_TPKT',
       '_ws.col.Protocol_UDP', '_ws.col.Protocol_VICP',
       '_ws.col.Protocol_ZEBRA']]
test_set = test_set[['frame.len', 'ip.len', 'ip.flags.df', 'ip.flags.mf', 'ip.ttl',
       'ip.proto', 'tcp.hdr_len', 'tcp.flags.ns', 'tcp.flags.ecn',
       'tcp.flags.urg', 'tcp.flags.ack', 'tcp.flags.push', 'tcp.flags.reset',
       'tcp.flags.syn', 'tcp.flags.fin', 'tcp.window_size_value',
       'tcp.urgent_pointer', 'tcp.options.wscale.shift', 'tcp.options.mss_val',
       'tcp.options.timestamp.tsval', 'tcp.options.timestamp.tsecr',
       'udp.length', 'udp.checksum', 'tcp.option_kind',
       '_ws.col.Protocol_AJP13', '_ws.col.Protocol_BGP',
       '_ws.col.Protocol_BZR', '_ws.col.Protocol_DRDA', '_ws.col.Protocol_DSI',
       '_ws.col.Protocol_FTP', '_ws.col.Protocol_GIOP',
       '_ws.col.Protocol_GTPv2', '_ws.col.Protocol_Gopher',
       '_ws.col.Protocol_Gryphon', '_ws.col.Protocol_HTTP',
       '_ws.col.Protocol_HTTP/JSON', '_ws.col.Protocol_HTTP/XML',
       '_ws.col.Protocol_ICAP', '_ws.col.Protocol_ICMP',
       '_ws.col.Protocol_Kafka', '_ws.col.Protocol_LDAP',
       '_ws.col.Protocol_MySQL', '_ws.col.Protocol_NBNS',
       '_ws.col.Protocol_NDMP', '_ws.col.Protocol_NNTP',
       '_ws.col.Protocol_POP', '_ws.col.Protocol_Portmap',
       '_ws.col.Protocol_R3', '_ws.col.Protocol_RPC', '_ws.col.Protocol_RSYNC',
       '_ws.col.Protocol_RTMP', '_ws.col.Protocol_RTSP',
       '_ws.col.Protocol_SABP', '_ws.col.Protocol_SMTP',
       '_ws.col.Protocol_SSH', '_ws.col.Protocol_SSLv2',
       '_ws.col.Protocol_SSLv3', '_ws.col.Protocol_TCP',
       '_ws.col.Protocol_TCPCL', '_ws.col.Protocol_TDS',
       '_ws.col.Protocol_TELNET', '_ws.col.Protocol_TLSv1',
       '_ws.col.Protocol_TLSv1.1', '_ws.col.Protocol_TLSv1.2',
       '_ws.col.Protocol_TN3270', '_ws.col.Protocol_TPKT',
       '_ws.col.Protocol_UDP', '_ws.col.Protocol_VICP',
       '_ws.col.Protocol_ZEBRA']]

print(x_train_set_GA)
print("筛选GA特征",x_train_set_GA.shape) # 筛选了69个特征

########## 一般是转为浮点数向量

# train_set = np.array(train_set[1:,:]).astype(np.float32) # 去掉第一行字段 
# test_set = np.array(test_set[1:,:]).astype(np.float32)

train_set_GA = np.array(x_train_set_GA).astype(np.float32) # 去掉第一行字段
test_set = np.array(test_set).astype(np.float32)

########



        frame.len  ip.len  ip.flags.df  ip.flags.mf  ip.ttl  ip.proto  \
0              66      52            0            0      63         6   
1              66      52            0            0      63         6   
2              66      52            0            0      63         6   
3              66      52            0            0      63         6   
4              66      52            0            0      63         6   
...           ...     ...          ...          ...     ...       ...   
837180         74      60            0            0      56         6   
837181         74      60            0            0      49         6   
837182         74      60            1            0      33         6   
837183         66      52            1            0     128         6   
837184         74      60            1            0      52         6   

        tcp.hdr_len  tcp.flags.ns  tcp.flags.ecn  tcp.flags.urg  ...  \
0              32.0           0.0            0.0   

In [9]:

# # 4 读特征  已经筛选出来了
# train_feature  = train_set[:,1:]
# test_feature = test_set[:,1:]

# 5 特征归一化  用MLP的时候用，LSTM不用
train_set_GA = normalize(train_set_GA, axis=0, norm='max')
x_test = normalize(test_set, axis=0, norm='max')

print("x_train_GA.shape",train_set_GA.shape)

# y_train[y_train == 23] = 0
# test_label[test_label == 23] = 0


# 6 标签转化为one-hot编码 
y_train = keras.utils.to_categorical(y_train, num_classes=23)
y_test = keras.utils.to_categorical(test_label, num_classes=23)

print("y_train.shape",y_train.shape)


x_train_GA.shape (837185, 69)
y_train.shape (837185, 23)


In [12]:


#################  多层感知机

model = Sequential()
# Dense(64) 是一个具有 64 个隐藏神经元的全连接层。
# 在第一层必须指定所期望的输入数据尺寸：
# 在这里，是一个104 维的向量。
model.add(Dense(64, activation='relu', input_dim=69))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(23, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',optimizer=sgd,metrics=['accuracy'])

model.fit(train_set_GA, y_train, epochs=20, batch_size=128)
score = model.evaluate(x_test, y_test, batch_size=128)

##################



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [38]:

########## Embedding 层输入整数向量

train_set = np.array(train_set).astype(np.int32) 
test_set = np.array(test_set).astype(np.int32)

# ##########
# # LSTM 不需要 特征归一化到0-1，用embedding层
# x_train = normalize(train_feature, axis=0, norm='max')
# x_test = normalize(test_feature, axis=0, norm='max')


In [39]:
##################  LSTM
max_features = 20000 # 作为特征的单词索引个数
# x_train = x_train[:, None]
# x_test = x_test[:,None]

print(x_train.shape)

model = Sequential()
# model.add(Dense(64, activation='relu', input_dim=104))
# model.add(Dropout(0.5))
model.add(Embedding(max_features, 32)) # 接收二维整数向量，输出三维向量
# model.add(Dropout(0.5))
# model.add(Dense(32, activation='relu'))
model.add(LSTM(32)) # 接收三维向量
model.add(Dense(23, activation='sigmoid')) 


# loss = [binary_crossentropy,categorical_crossentropy]
# optimizer = [rmsprop,] RMSProp对比Adamgrad增加了指数平滑
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['acc'])
history = model.fit(x_train, y_train ,epochs=5, batch_size=128,validation_split=0.2)
score = model.evaluate(x_test, y_test, batch_size=128)

#################


(1360825, 104)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
