# 特征选择

In [1]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Let pandas render up to 500 columns so wide frames are not truncated in cell output.
pd.options.display.max_columns = 500

### 导入数据

In [3]:
# Read the train and test CSV files with identical parser settings.
# index_col=0 uses the first CSV column as the row index; low_memory=False makes
# pandas parse each file in one pass instead of in chunks.
train_df, test_df = (
    pd.read_csv(csv_path, low_memory=False, index_col=0)
    for csv_path in ('data_train.csv', 'data_test.csv')
)
# Keep both frames together in one list.
combine = [train_df, test_df]

### 分析数据（描述）

#### 一共有多少特征？

In [4]:
# Summarize the raw training frame: index range, column count, dtype breakdown and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38199 entries, 0 to 38198
Columns: 262 entries, vid to A601
dtypes: float64(18), object(244)
memory usage: 76.6+ MB


In [5]:
# Same summary for the test frame, to compare its dtype breakdown with the training frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9538 entries, 0 to 9537
Columns: 262 entries, vid to A601
dtypes: float64(43), object(219)
memory usage: 19.1+ MB


除去5个目标列后，训练集只有13个数值特征（float64 共18列），而测试集有38个（float64 共43列）。  
因为在数据导入的时候，每一列只要有非数值数据，就会被转化成 object 类型。  
所以需要统计每一列 object 类型的数据的数值数据比例，大于某值（如0.8）则转化成数值，不能转化的变成 NaN。

In [6]:
def _numeric_share(column):
    """Return the fraction of non-missing entries that look like plain decimal numbers.

    An entry counts as numeric when its string form is an optional minus sign,
    digits, and an optional fractional part. Missing entries are excluded from
    the denominator.
    """
    looks_numeric = column.astype(str).str.match(r'^(-?\d+)(\.\d+)?$').sum()
    present = train_data_counts - column.isna().sum()
    return looks_numeric / present


# Total number of training rows, used as the base of the ratio.
train_data_counts = train_df.shape[0]

# Keep every column in which more than 80% of the non-missing values are numeric.
numerical_feature = [
    name for name in train_df.columns.values
    if _numeric_share(train_df[name]) > 0.8
]
len(numerical_feature)

113

一共有108个符合要求的特征（去掉前面5个输出值），把所有这些列进行转换，不成功的转换成NaN

In [7]:
# Coerce the detected numeric columns to floats in both frames. The first five
# entries of numerical_feature are skipped (the notebook treats them as the
# output columns). Values that cannot be parsed become NaN (errors='coerce'),
# and downcast='float' lets pandas pick the smallest float dtype that fits.
numeric_inputs = numerical_feature[5:]
for frame in (train_df, test_df):
    frame[numeric_inputs] = frame[numeric_inputs].apply(
        pd.to_numeric, downcast='float', errors='coerce')

In [8]:
# Re-check the dtype breakdown of the training frame after the numeric conversion.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38199 entries, 0 to 38198
Columns: 262 entries, vid to A601
dtypes: float32(108), float64(5), object(149)
memory usage: 60.9+ MB


### *可参考的一些处理错误数据的方法

In [9]:
# 可以对一些明显错误的数据进行修改，数据量很小，后来选择直接改成 NaN
# train_df[train_df['100006'].str.contains(r'[0-9]')==False]['100006']
# train_df[train_df['269004'].str.contains(r'[0-9]')==False]['269004']
# train_df[train_df['269005'].str.contains(r'[0-9]')==False]['269005']

In [10]:
# **把所有为`---`的值改为NaN**

# train_df.replace(to_replace=r'\-+', value=np.nan, inplace=True, regex=True)

# for col in test_df.describe().columns.values:
#     if col not in train_df.describe().columns.values:
#         train_df[col] = pd.to_numeric(train_df[col], downcast='float', errors='ignore')

In [11]:
# 只是多了两列特征，因为不只是---的问题

# train_df[train_df['269004'].str.match(r'^(-?\d+\.\d+)?;(-?\d+\.\d+)?')==True]['269004']

In [12]:
# **有很多特征值有重复，变成了数值；数值的格式**

# train_df.replace(to_replace=r'^(-?\d+\.\d+)?;(-?\d+\.\d+)?', value=r'\1', inplace=True, regex=True)

# train_df.loc[23268]['269004']

# for col in test_df.describe().columns.values:
#     if col not in train_df.describe().columns.values:
#         train_df[col] = pd.to_numeric(train_df[col], downcast='float', errors='ignore')

In [13]:
# 还有几列有问题，print出来看看。

# for col in test_df.describe().columns.values:
#     if col not in train_df.describe().columns.values:
#         print(col)

# print(train_df['269012'].count(),
# train_df['313'].count(),
# train_df['32'].count(),
# train_df['38'].count())

In [14]:
# train_df[train_df['269012'].str.match(r'^(-?\d+)(\.\d+)?')==False]['269012']

# pd.to_numeric(train_df['269012'], downcast='float')

In [15]:
# 可以看到还有末尾多一个小数点的数据，把小数点去掉

# train_df.replace(to_replace=r'^(-?\d+\.\d+)?(-?\d+)?.$', value=r'\1', regex=True, inplace=True)

In [16]:
# 处理特例

# train_df.loc[26333]['313'] 

# train_df.loc[26333,['313']] = 189

In [17]:
# for col in test_df.describe().columns.values:
#     if col not in train_df.describe().columns.values:
#         train_df[col] = pd.to_numeric(train_df[col], downcast='float', errors='ignore') #用apply更好

## 用这些特征试试看看简单的算法

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [19]:
# Input columns: every column reported by describe() except the first five.
feature = list(train_df.describe().columns)[5:]

In [20]:
# Target columns: the first five columns reported by describe().
label = list(train_df.describe().columns)[:5]

In [21]:
# Build the model matrices; every missing value is replaced with 0.
# NOTE(review): missing target values are zero-filled too, so such rows would be
# trained on as real 0 measurements — confirm this is intended.
X_train = train_df[feature].fillna(0)
Y_train = train_df[label].fillna(0)
X_test = test_df[feature].fillna(0)

In [22]:
# Sanity check: (rows, columns) of the training inputs, training targets and test inputs.
X_train.shape, Y_train.shape, X_test.shape

((38199, 108), (38199, 5), (9538, 108))

## 回归树

In [23]:
# Fit one multi-output regression tree on all target columns.
# random_state is fixed so that re-running the notebook reproduces the same tree
# (scikit-learn permutes candidate features at each split, so ties can otherwise
# be broken differently between runs).
regr = DecisionTreeRegressor(random_state=42)
regr.fit(X_train, Y_train)
# score() on a regressor returns R^2, here measured on the training data itself.
# An unpruned tree can fit its training set almost perfectly, so this number shows
# fit, not predictive accuracy.
acc_decision_tree = round(regr.score(X_train, Y_train) * 100, 2)
acc_decision_tree

100.0

In [25]:
# Predict the targets for the test inputs and wrap the resulting array in a
# DataFrame whose columns carry the target names.
Y_pred_regr = regr.predict(X_test)
Y_pred_regr_df = pd.DataFrame(Y_pred_regr, columns=label)
# Preview the first rows of the predictions.
Y_pred_regr_df.head()

Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
0,135.0,80.0,2.45,1.08,2.47
1,130.0,80.0,3.33,1.13,3.5
2,94.0,58.0,0.53,1.58,2.82
3,128.0,77.0,1.39,1.29,2.02
4,150.0,82.0,1.03,1.0,2.69


In [31]:
# Attach the `vid` identifier of each test row and make it the first column.
# `.values` assigns positionally: prediction row i was computed from test row i,
# so the ids stay correct even if test_df's index is not the default 0..n-1 range.
# Assigning the Series directly would align on index labels and could yield NaN
# or mismatched ids. assign() overwrites an existing `vid` column, so the cell
# can be re-run safely.
Y_pred_regr_df = Y_pred_regr_df.assign(vid=test_df['vid'].values)[['vid'] + label]

In [32]:
# Write the decision-tree predictions to CSV without the index column and without a header row.
Y_pred_regr_df.to_csv('regr_output.csv',index=False, header=False)

## 随机森林

In [33]:
# Fit a 100-tree random forest on all target columns at once.
# random_state pins the forest's random sampling so results are reproducible
# across notebook runs.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
# score() on a regressor returns R^2, measured here on the training data, so it
# shows fit rather than predictive accuracy. It is computed once; the earlier
# discarded score() call only repeated the prediction over the whole training set.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

88.92

In [45]:
# Predict the targets for the test inputs with the forest.
Y_pred_rf = random_forest.predict(X_test)
# Print how many predicted values are negative.
print(np.count_nonzero(Y_pred_rf < 0))
# Wrap the predictions in a DataFrame labelled with the target names and preview it.
Y_pred_rf_df = pd.DataFrame(data=Y_pred_rf, columns=label)
Y_pred_rf_df.head()

0


Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
0,145.17,82.43,2.6651,1.4743,2.9846
1,134.9,82.97,2.452,1.2814,2.8994
2,114.45,71.1,1.1621,1.4454,2.5157
3,126.79,77.18,1.819,1.3123,3.1022
4,127.94,76.59,1.5678,1.2746,3.0717


In [46]:
# Attach the `vid` identifier of each test row and make it the first column.
# `.values` assigns positionally: prediction row i was computed from test row i,
# so the ids stay correct even if test_df's index is not the default 0..n-1 range.
# Assigning the Series directly would align on index labels and could yield NaN
# or mismatched ids. assign() overwrites an existing `vid` column, so the cell
# can be re-run safely.
Y_pred_rf_df = Y_pred_rf_df.assign(vid=test_df['vid'].values)[['vid'] + label]

In [47]:
# Write the random-forest predictions to CSV without the index column and without a header row.
Y_pred_rf_df.to_csv('rf_output.csv',index=False, header=False)