# Lab5 数据预处理
流程如下：
- 数据读取
- 去噪（有大量 outliers，必须进行）（将离群值设置为 `nan`，交由下一步补全数据）
- 处理缺失数据
- 降维
- 标准化


In [10]:
import numpy as np
import pandas as pd
from funs import *
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from skrebate import ReliefF

### 读取数据

In [43]:
# Load the training features and labels.
X_df = pd.read_csv('Dataset/train_feature.csv')
y_df = pd.read_csv('Dataset/train_label.csv')

# Number of distinct values in the label column (the class count).
c = len(y_df['label'].unique())

# Load the features of the samples to predict.
X_pred_df = pd.read_csv('Dataset/test_feature.csv')
# Debug aid: X_pred_df.isnull().sum() and X_df.isnull().sum() give the
# per-column count of missing values.

# NumPy arrays of the three tables, used by the estimators below.
X = X_df.values
y = y_df.values
X_pred = X_pred_df.values

### 去噪
- 经过观察，离群值过于极端，需要重新处理：以训练集各特征的 10% / 90% 分位数为基准，把超出 1.5 倍分位距范围的 outliers 值设置为 nan

In [44]:
# Outlier masking. For each feature the acceptance band is derived from the
# TRAINING column: [q10 - 1.5 * (q90 - q10), q90 + 1.5 * (q90 - q10)].
# Values outside the band become NaN in both the training and the test frame,
# so the missing-value step that follows can fill them in.
for feature in X_df.columns:
    q_low = X_df[feature].quantile(0.1)
    q_high = X_df[feature].quantile(0.9)
    margin = 1.5 * (q_high - q_low)
    lower, upper = q_low - margin, q_high + margin

    train_col = X_df[feature]
    X_df.loc[(train_col < lower) | (train_col > upper), feature] = np.nan

    # The test set is judged against the thresholds learned from training data.
    pred_col = X_pred_df[feature]
    X_pred_df.loc[(pred_col < lower) | (pred_col > upper), feature] = np.nan

X = X_df.values
y = y_df.values
X_pred = X_pred_df.values

### 处理缺失数据
以下方法选择一项：
- 平均值填补：在缺失处填上该特征在训练集上的均值
- KNN方法填补：根据不完整样本距离完整样本的远近，进行拟合，并推算缺失值
- 过滤：直接删除不完整的样本

In [5]:
# Missing-value handling, option 1: mean imputation.
# Every NaN is replaced by the mean of that feature over the training rows.
# The test features are filled with the same training means: the outlier step
# writes NaN into X_pred_df as well, and those gaps would otherwise be carried
# into X_pred unchanged.
feature_means = X_df.mean()
X_df = X_df.fillna(feature_means)
X_pred_df = X_pred_df.fillna(feature_means)

X, y, X_pred = X_df.values, y_df.values, X_pred_df.values

In [45]:
# Missing-value handling, option 2: KNN imputation.
# Author's note: the data should be standardized before this step.
# Observed result: poor, possibly because redundant features interfere.
from sklearn.impute import KNNImputer

imputer = KNNImputer(
    n_neighbors=5,
    weights='uniform',
    metric='nan_euclidean',
)  # TODO: confirm whether the inputs need to be normalized first

# Fit on the training matrix only, then fill both matrices with that imputer.
imputer.fit(X)
X = imputer.transform(X)
X_pred = imputer.transform(X_pred)

# Rebuild the data frames around the imputed arrays, keeping the column names.
X_df = pd.DataFrame(X, columns=X_df.columns)
X_pred_df = pd.DataFrame(X_pred, columns=X_pred_df.columns)

In [48]:
# Missing-value handling, option 3: filtering.
# Join features and labels side by side so that a row is dropped from both
# together, then keep only the rows without any NaN.
tmp = pd.concat([X_df, y_df], axis=1).dropna()

# Split the surviving rows back apart. y_df becomes the 'label' column
# (a Series) from here on.
X_df, y_df = tmp.drop(columns=['label']), tmp['label']

X, y = X_df.values, y_df.values

In [None]:
# Persist the preprocessed training features and labels as CSV files
# (with a header row, without the index column).
for frame, path in (
    (X_df, 'Dataset/train_f_preprocessed_1.csv'),
    (y_df, 'Dataset/train_l_preprocessed_1.csv'),
):
    frame.to_csv(path, sep=',', header=True, index=False)

# Preview: first 10 rows, first 5 columns of the feature matrix.
X[:10, :5]

### 标准化
- 均值归零，方差归一

In [193]:
# Standardization.
# The scaler is fitted on the training features only and then applied to both
# the training and the test features, so both use the same shift and scale.
# Alternative considered: sklearn.preprocessing.robust_scale
# (quantile_range=(25.0, 75.0)).
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaler.fit(X_df)

X = standardScaler.transform(X_df)
X_pred = standardScaler.transform(X_pred_df)
y = y_df.values

# Rebuild the frames around the scaled arrays, keeping the original row
# labels. If incomplete rows were dropped earlier, X_df's index has gaps and
# y_df still carries those labels; a fresh 0..n-1 index would leave the two
# frames misaligned for any later label-based join.
X_df = pd.DataFrame(X, columns=X_df.columns, index=X_df.index)
X_pred_df = pd.DataFrame(X_pred, columns=X_pred_df.columns, index=X_pred_df.index)

In [None]:
# Sanity check: per-column standard deviation of the feature matrix.
X.std(axis=0)

### 特征选择
- 采用 ReliefF 算法

In [172]:
# ReliefF feature scoring (skrebate), fitted on the full feature matrix.
relief_options = dict(
    discrete_threshold=1,
    n_neighbors=5,
    n_jobs=-1,
)
r = ReliefF(**relief_options)
r.fit(X, y)

ReliefF(discrete_threshold=1, n_jobs=-1, n_neighbors=5)

In [192]:
# Show the per-feature metadata stored on the fitted ReliefF object.
# NOTE(review): judging by the recorded output, each tuple looks like
# (type, max, min, range, std) - confirm against the skrebate source.
r.attr

{'X01': ('continuous', 1.0, 0.0, 1.0, 0.386026469542752),
 'X02': ('continuous', 1.0, 0.0, 1.0, 0.47612891617110237),
 'X03': ('continuous', 3.0, 0.0, 3.0, 1.004820950386549),
 'X04': ('continuous', 1.0, 0.0, 1.0, 0.41305192141100605),
 'X05': ('continuous', 1.0, 0.0, 1.0, 0.34053532461934904),
 'X06': ('continuous', 8.1, 0.015, 8.084999999999999, 0.6104064856533892),
 'X07': ('continuous', 41.667, 0.0, 41.667, 2.9238644597700594),
 'X08': ('continuous', 7.0, 0.09, 6.91, 0.8442790554809808),
 'X09': ('continuous', 1.6, 0.04, 1.56, 0.21513716761930315),
 'X10': ('continuous', 1.0, 0.0, 1.0, 0.35303796559921335),
 'X11': ('continuous', 1.0, 0.0, 1.0, 0.39342025072640907)}

In [174]:
# Show the ReliefF score of every feature (one value per input column).
r.feature_importances_

array([0.00442997, 0.00162866, 0.02442997, 0.00390879, 0.01211726,
       0.00319137, 0.00453451, 0.01353852, 0.00678193, 0.10019544,
       0.04462541])

In [179]:
# Use a shallow decision tree as the model to optimize: for every candidate
# number of ReliefF-selected features, report the mean cross-validated score.
from sklearn.tree import DecisionTreeClassifier

model_DecTree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=2,
    splitter='best',
    min_samples_leaf=5,
    random_state=0,
)  # library implementation

y_train = y  # the target does not depend on the number of selected features

# NOTE(review): `r` was fitted on all rows before cross-validation, so the
# feature ranking has already seen the validation folds.
# Sweep 1..(number of columns) rather than a fixed 1..11, so the cell keeps
# working when the dataset has a different number of features.
for n_feature in range(1, X.shape[1] + 1):
    r.set_params(n_features_to_select=n_feature)
    X_train = r.transform(X)  # keep only the top-n_feature ranked columns
    print(n_feature, "features: ", np.mean(cross_val_score(model_DecTree, X_train, y_train, cv=5)))

1 features:  0.8192589630814341
2 features:  0.8192589630814341
3 features:  0.8192589630814341
4 features:  0.8192589630814341
5 features:  0.8192589630814341
6 features:  0.8143809143009463
7 features:  0.8176329468212715
8 features:  0.8176329468212715
9 features:  0.8176329468212715
10 features:  0.8176329468212715
11 features:  0.8176329468212715


## 其他数据集测试

In [170]:
# Load the loan dataset and turn it into an all-numeric table.
# Raw string: the Windows path is full of backslashes, which must not be
# parsed as escape sequences (a plain literal triggers invalid-escape warnings).
df = pd.read_csv(r'E:\课程资料\Maths\应用数学\机器学习概论\mylab\lab1\loan.csv')
df = df.drop(columns="Loan_ID")
# Alternative to the KNN imputation below (use one or the other):
# df = df.dropna(how='any')  # drop incomplete rows

# Encode the categorical columns as numbers. Values that are absent from a
# mapping (including missing entries) become NaN.
category_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'3+': 3, '2': 2, '1': 1, '0': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {"Urban": 1, 'Semiurban': 0.5, 'Rural': 0},
    'Loan_Status': {'Y': 1, 'N': 0},
}
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

# Simple rescaling of the numeric columns by fixed divisors.
scale_divisors = {
    'Loan_Amount_Term': 300,
    'LoanAmount': 100,
    'ApplicantIncome': 10000,
    'CoapplicantIncome': 1000,
}
for column, divisor in scale_divisors.items():
    df[column] = df[column] / divisor

# Fill the remaining gaps with KNN imputation.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')  # TODO: confirm whether normalization is needed first
data = imputer.fit_transform(df)
df = pd.DataFrame(data, columns=df.columns)

df = df.astype(dtype='float64')

In [171]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining feature columns.
y = df['Loan_Status'].values
X = df.loc[:, df.columns != 'Loan_Status'].values

# Hold-out split that preserves the class ratio of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Sanity check: column 9 of X should match the Credit_History column.
X[:20, 9], df['Credit_History'].values[:20]

(array([1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        0., 1., 1.]),
 array([1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        0., 1., 1.]))

## 训练测试
- 神经网络，随机森林，lab1逻辑回归

In [191]:
# Logistic regression from lab1 (binary classification).
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression as LR  # library implementation
# Hand-written alternative from funs:
#   from funs import LogisticRegression
#   model = LogisticRegression(fit_intercept=True,la=1,lr=0.001)

model_logistic = LR(C=0.2, fit_intercept=True)

# Flatten the target to 1-D, then report the mean 5-fold CV score.
y_flat = y.reshape(len(y))
fold_scores = cross_val_score(model_logistic, X, y_flat, cv=5)
print(np.mean(fold_scores))

0.8127282420365187


In [169]:
# Neural network (multi-layer perceptron).
from sklearn.neural_network import MLPClassifier
from skrebate import ReliefF
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

model_MLP = MLPClassifier(
    hidden_layer_sizes=[20],
    max_iter=2000,
    learning_rate_init=1e-3,
    tol=1e-4,
    alpha=3e-6,
    activation='logistic',
)

# Mean 5-fold CV score of the network, with the target flattened to 1-D.
clf = model_MLP
mlp_scores = cross_val_score(clf, X, y.reshape(len(y)), cv=5)
print(np.mean(mlp_scores))

# Feature selection (disabled): ReliefF + MLP pipeline for several feature counts.
#for n_feature in range(10, 120,10):#range(1, X.shape[1]+1):
#    clf = make_pipeline(ReliefF(n_features_to_select=n_feature, discrete_threshold=5, n_neighbors=5, n_jobs=-1), 
#        model_MLP)
#    print(np.mean(cross_val_score(clf, X, y, cv=10)))

0.8041666666666668


In [None]:
# Show the full hyper-parameter dictionary of the MLP defined above.
model_MLP.get_params()

In [None]:
# Second network: one hidden layer of 10 units with identity activation and
# a larger L2 penalty (alpha).
clf = MLPClassifier(
    hidden_layer_sizes=[10],
    max_iter=10000,
    learning_rate_init=1e-3,
    tol=1e-5,
    alpha=1e-1,
    activation='identity',
)
# Idea left for later: repeat the run several times and compare accuracy
# via clf.score().

# Mean 10-fold CV score, with the target flattened to 1-D.
identity_scores = cross_val_score(clf, X, y.reshape(len(y)), cv=10)
print(np.mean(identity_scores))

In [None]:
# Print the most frequent (modal) value of every feature.
# Series.mode() counts each value in a single pass and ignores NaN; when
# several values tie it returns them in ascending order, so the smallest
# one is reported.
for feature in X_df.columns:
    modes = X_df[feature].mode()
    if not modes.empty:  # an all-NaN column has no mode: print nothing
        print(modes.iloc[0])