## 数据处理流程
1. 数据读入
2. 数据清洗
3. 数据无量纲化处理
4. 特征选择
5. 特征提取
6. 聚类

# 将数据文件处理成电脑可处理的小规模的数据

## 数据读入

In [1]:
# import libs
import numpy as np
import pandas as pd
import os
from sklearn.cluster import KMeans

In [2]:
# List the data files in the 'newdata/' folder.
# Sorting makes the read order (and therefore the row order of the combined
# frame) reproducible, since os.listdir returns entries in arbitrary order.
# Sub-directories (e.g. Jupyter's .ipynb_checkpoints) are skipped because
# pd.read_csv cannot open them.
filedir = 'newdata/'
files = [
    name
    for name in sorted(os.listdir(filedir))
    if os.path.isfile(os.path.join(filedir, name))
]

FileNotFoundError: [Errno 2] No such file or directory: 'newdata/'

In [None]:
nrows = 10000  # number of leading rows to read from each file

# Collect the per-file frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop
# re-copies all previously accumulated rows on every iteration.
frames = []
for f in files:
    tdata = pd.read_csv(os.path.join(filedir, f), nrows=nrows)
    print(f'read {tdata.shape[0]} rows from {f}')
    frames.append(tdata)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no files were found (matches the old behaviour).
alldata = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Drop the first two columns (considered useless by the author), remove every
# row that contains a NaN, and renumber the index from 0.
cols = alldata.columns
alldata = (
    alldata
    .drop(columns=cols[:2])
    .dropna(how='any')
    .reset_index(drop=True)
)
alldata  # notebook display of the trimmed frame
# A previous run showed 36079 rows × 87 columns here.

In [None]:
# Collect the distinct values of the ' Label' column.
# The list is sorted because iteration order of a set of strings can differ
# between interpreter runs, which would make any index-based encoding of
# these labels change from run to run.
L = set(alldata[' Label'])
labels = sorted(L)
labels
    

## 数据清洗

In [None]:
# Drop rows that contain missing values once more before cleaning.
alldata.dropna(how='any', inplace=True)

# Columns treated as non-numeric by the author. They are not used for
# clustering for now, which does not mean they carry no information.
dropset = ['Flow ID', ' Source IP', ' Source Port',
           ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
           ' Fwd Header Length.1', 'SimillarHTTP', ' Inbound']
# Columns that contain a large number of negative values.
drop_nega = [' Fwd Header Length', 'Init_Win_bytes_forward',
             ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
             ' min_seg_size_forward']
alldata.drop(dropset, axis=1, inplace=True)
alldata.drop(drop_nega, axis=1, inplace=True)
cols = alldata.columns

# Encode each label as its position in `labels`.
# Only the ' Label' column is mapped: a frame-wide replace() scans every
# column once per label and leaves the resulting column dtype dependent on
# the pandas version, whereas map() yields a plain integer column.
label_to_id = {label: i for i, label in enumerate(labels)}
alldata[' Label'] = alldata[' Label'].map(label_to_id)
for label, i in label_to_id.items():
    print(f'{label} was replace to {i}')

In [None]:
# Remove rows that contain +inf and rows that contain a negative value, then
# renumber the index from 0.
# Boolean row masks are used instead of passing positional np.where() results
# to drop(index=...): positions only coincide with index labels while the
# index is exactly 0..n-1, so the mask form stays correct for any index.
has_inf = alldata.max(axis=1) == np.inf
has_negative = alldata.min(axis=1) < 0
alldata = alldata.loc[~(has_inf | has_negative)].reset_index(drop=True)

In [None]:
# Split the frame into features (X) and target (Y).
label_column = ' Label'
X = alldata.drop(columns=label_column)
Y = alldata[label_column]

In [None]:
# Convert the pandas objects to NumPy arrays.
X, Y = X.values, Y.values

In [None]:
# Notebook display of the label array.
Y

## 数据无量纲化处理

In [None]:
from sklearn import preprocessing

In [None]:
# Scale the features with two different methods.

# Max-abs scaling: fit a MaxAbsScaler on X, then apply it.
maxabs = preprocessing.MaxAbsScaler().fit(X)
data_maxabs = maxabs.transform(X)

# Z-score standardisation: fit a StandardScaler on X, then apply it.
zscore = preprocessing.StandardScaler().fit(X)
X_zscore = zscore.transform(X)

## 特征选择

In [None]:
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest
from numpy import array

In [None]:
# Recursive feature elimination: keep the 10 features ranked best by a
# logistic-regression estimator, using the max-abs scaled data.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe_selector = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=10,
)
X_new = rfe_selector.fit_transform(data_maxabs, Y)

In [None]:
# Print the built-in documentation for RFE.
help(RFE)

In [None]:
# Notebook display of the shape of the RFE-selected feature matrix.
X_new.shape

In [None]:
# Notebook display of the RFE-selected feature matrix.
X_new

In [None]:
# Notebook display of the max-abs scaled feature matrix.
data_maxabs

In [None]:
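# Disabled alternative: pick the 30 best features by Pearson correlation with Y
# (iterates over the columns of X, hence X.T).
# X_new = SelectKBest(lambda X, Y: array(list(map(lambda x:pearsonr(x, Y), X.T))).T, k=30).fit_transform(data_maxabs, Y)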

## 特征提取

In [None]:
from sklearn.decomposition import PCA

# Project the selected features onto their leading principal components.
n_pca_components = 8
estimator = PCA(n_components=n_pca_components)
pca_X_train = estimator.fit_transform(X_new)

In [None]:
# Notebook display of the shape of the PCA-transformed data.
pca_X_train.shape

# 普通 Kmeans

In [None]:
# Print the built-in documentation for KMeans.
help(KMeans)

In [None]:
# Cluster the PCA-transformed samples into 5 groups using k-means++ seeding
# and 50 restarts. random_state is fixed so that the cluster assignment is
# reproducible; without it each run draws different initial centroids.
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=50,
    random_state=0,
).fit(pca_X_train)

In [None]:
# Notebook display of the fitted KMeans estimator.
kmeans

In [None]:
# Distinct cluster ids assigned by KMeans.
set(kmeans.labels_)

In [None]:
# Keep the per-sample cluster assignment for comparison with the true labels.
y_ = kmeans.labels_

In [None]:
# Notebook display of the true label array.
Y

In [None]:
# Notebook display of cluster assignments alongside the true labels.
y_, Y

In [None]:
# Print each sample's cluster id next to its true label.
# Author's note: the result looks much better after feature selection.
for cluster_id, true_label in zip(y_, Y):
    print(cluster_id, true_label)