## 数据探索
数据来自：https://www.kaggle.com/uciml/mushroom-classification ，其中输入x是一个22维的特征

In [30]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [3]:
data_path = "./data/"
data = pd.read_csv(data_path + "mushrooms.csv")

In [4]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
data.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
class                       8124 non-null object
cap-shape                   8124 non-null object
cap-surface                 8124 non-null object
cap-color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill-attachment             8124 non-null object
gill-spacing                8124 non-null object
gill-size                   8124 non-null object
gill-color                  8124 non-null object
stalk-shape                 8124 non-null object
stalk-root                  8124 non-null object
stalk-surface-above-ring    8124 non-null object
stalk-surface-below-ring    8124 non-null object
stalk-color-above-ring      8124 non-null object
stalk-color-below-ring      8124 non-null object
veil-type                   8124 non-null object
veil-color                  8124 non-null object
ring-number

观察数据后我们有一个认知，所有的特征现在都是类别型的，我们需要对其进行编码。同时，数据非常好，没有任何的缺失。

数据总共有 8124 个，类别两个，有些特征非常不平衡，例如 gill-attachment 只有2种值，但是其中一个占了7914个。

## 特征编码

In [11]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for col in data.columns:
    data[col] = labelencoder.fit_transform(data[col])
    
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [15]:
X = data.iloc[:,1:23]  
y = data.iloc[:, 0]  

## 数据划分
我们将数据分为 train 和 test，80%训练，20%测试

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42017)

In [20]:
y.sum() / y.shape[0]

0.48202855736090594

In [22]:
y_train.sum() / y_train.shape[0]

0.48192029543006615

In [23]:
y_test.sum() / y_test.shape[0]

0.48246153846153844

划分后，正负样本的比例还是一致的。

## 对数几率回归

In [24]:
from sklearn.linear_model import LogisticRegression
model_LR= LogisticRegression()

In [25]:
model_LR.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
y_prob = model_LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities  
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_LR.score(X_test, y_pred)

1.0

In [31]:
# 测试集上的准确率
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc

0.95325960348467575

### 参数调优 
采用 GridSearchCV 寻找最优参数

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
model_lr = LogisticRegression()

我们此处有几个参数进行挑选


- C: Inverse of regularization strength 正则化系数的导数,值越小，正则化越强
- penalty： 正则化方法

In [35]:
test_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] ,
              'penalty':['l1','l2']}

In [36]:
clf = GridSearchCV(model_lr,test_parameters,cv = 10)

In [37]:
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [38]:
clf.best_params_

{'C': 1000, 'penalty': 'l2'}

In [58]:
preds = clf.predict(X_test)
clf.score(X_test,preds)

1.0

In [60]:
metrics.roc_auc_score(y_test,preds)

0.971235804071926

## 决策树模型

In [61]:
from sklearn.tree import DecisionTreeClassifier

In [62]:
model_tree = DecisionTreeClassifier()

In [63]:
model_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [66]:
y_pred = model_tree.predict(X_test)
model_tree.score(X_test, y_pred)

1.0

In [67]:
metrics.roc_auc_score(y_test,y_pred)

1.0

可以说树模型非常好。下面演示下参数调参


max_features: int, float, string or None, optional (default=None)
    The number of features to consider when looking for the best split:

- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a percentage and
  `int(max_features * n_features)` features are considered at each
  split.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.


min_samples_leaf : int, float, optional (default=1)
    The minimum number of samples required to be at a leaf node:

- If int, then consider `min_samples_leaf` as the minimum number.
- If float, then `min_samples_leaf` is a percentage and
  `ceil(min_samples_leaf * n_samples)` are the minimum
  number of samples for each node.

In [68]:
model_tree = DecisionTreeClassifier()

test_parameters= { 'max_features': ["sqrt","log2"],
                  'min_samples_leaf': range(1,100,1) , 'max_depth': range(1,50,1)
                  }

In [69]:
tree = GridSearchCV(model_tree, test_parameters,cv=10)

In [70]:
tree.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': ['sqrt', 'log2'], 'min_samples_leaf': range(1, 100), 'max_depth': range(1, 50)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [71]:
tree.best_params_

{'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1}

In [72]:
tree.best_score_

1.0

In [74]:
y_pred = tree.predict(X_test)
tree.score(X_test, y_pred)

1.0

In [75]:
metrics.roc_auc_score(y_test,y_pred)

1.0

## 随机森林