In [1]:
from sklearn.datasets import load_iris
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()
iris.data.shape    # output: (150, 4)

(150, 4)

In [2]:
import pandas as pd

# Wrap the raw measurement matrix in a DataFrame with named feature columns,
# then append the numeric class label as the 'Species' column.
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
df_iris['Species'] = iris.target

# Preview seven randomly chosen rows.
df_iris.sample(7)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
121,5.6,2.8,4.9,2.0,2
20,5.4,3.4,1.7,0.2,0
77,6.7,3.0,5.0,1.7,1
108,6.7,2.5,5.8,1.8,2
115,6.4,3.2,5.3,2.3,2
63,6.1,2.9,4.7,1.4,1
35,5.0,3.2,1.2,0.2,0


In [3]:
# Inspect the missing-value rate of every column
def show_missing_data(df_iris):
    """Summarize missing values per column.

    Parameters
    ----------
    df_iris : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per input column with two columns: 'Total' (number of null
        entries) and 'Percent' (null entries divided by the row count, i.e. a
        fraction between 0 and 1, not a value scaled to 100). Both series are
        sorted in descending order before being joined.
    """
    null_mask = df_iris.isnull()
    null_counts = null_mask.sum()
    # The mask itself contains no nulls, so count() yields the row count for each column.
    null_ratio = null_counts / null_mask.count()
    ranked = [series.sort_values(ascending=False) for series in (null_counts, null_ratio)]
    return pd.concat(ranked, axis=1, keys=['Total', 'Percent'])

# Run the missing-value summary on the iris frame; the resulting table is the cell's output.
show_missing_data(df_iris)

Unnamed: 0,Total,Percent
Species,0,0.0
petal width (cm),0,0.0
petal length (cm),0,0.0
sepal width (cm),0,0.0
sepal length (cm),0,0.0


In [4]:
# A simpler way to check for missing values: info() prints the non-null count and dtype of each column.
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
Species              150 non-null int32
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [5]:
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split

target = df_iris['Species']                  # label column
features = df_iris.drop('Species', axis=1)   # every column except the label

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=33,
)


In [7]:
# Confirm the size of the training split and preview its first rows.
print(x_train.shape) # output: (105, 4)
x_train.head()

(105, 4)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
98,5.1,2.5,3.0,1.1
11,4.8,3.4,1.6,0.2
131,7.9,3.8,6.4,2.0
39,5.1,3.4,1.5,0.2
21,5.1,3.7,1.5,0.4


In [8]:
# Logistic regression on standardized features
from sklearn.linear_model import LogisticRegression  # logistic regression classifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler  # zero-mean / unit-variance scaling

# Fit the scaler on the training split only, then apply that same scaling to
# both splits. NOTE: x_train / x_test are overwritten with the scaled arrays,
# and every later model cell trains and predicts on these scaled versions.
ss = StandardScaler()
ss.fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

# Train the classifier and keep its test-set predictions in y_pred.
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)

print(classification_report(y_test, y_pred, target_names=iris.target_names))


             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        11
 versicolor       0.86      0.80      0.83        15
  virginica       0.85      0.89      0.87        19

avg / total       0.89      0.89      0.89        45



In [24]:
# Logistic regression: re-fit on the scaled data and record its accuracy
from sklearn.linear_model import LogisticRegression  # logistic regression classifier
from sklearn.metrics import classification_report, accuracy_score

lr = LogisticRegression().fit(x_train, y_train)  # initialize and train
y_pred = lr.predict(x_test)                      # predict on the test split

# acc_lr is read by the model-comparison table further down.
acc_lr = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred, target_names=iris.target_names))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        11
 versicolor       0.86      0.80      0.83        15
  virginica       0.85      0.89      0.87        19

avg / total       0.89      0.89      0.89        45



In [11]:
# Support vector machine classifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, accuracy_score

# C=1e5 is a very large penalty on margin violations (i.e. very weak regularization).
svc = SVC(C=1e5).fit(x_train, y_train)
y_pred = svc.predict(x_test)

# acc_svc is read by the model-comparison table further down.
acc_svc = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred, target_names=iris.target_names))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        11
 versicolor       0.88      1.00      0.94        15
  virginica       1.00      0.89      0.94        19

avg / total       0.96      0.96      0.96        45



In [13]:
# Gaussian naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

gaussian = GaussianNB().fit(x_train, y_train)
y_pred = gaussian.predict(x_test)

# acc_gaussian is read by the model-comparison table further down.
acc_gaussian = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred, target_names=iris.target_names))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        11
 versicolor       0.88      1.00      0.94        15
  virginica       1.00      0.89      0.94        19

avg / total       0.96      0.96      0.96        45



In [14]:
# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Tree construction involves randomness (feature order / tie-breaking between
# equally good splits), so an unseeded tree can score differently on every run.
# Seed it with the same value used for the train/test split so the reported
# accuracy is reproducible under Restart & Run All.
decision_tree = DecisionTreeClassifier(random_state=33)
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)

# acc_decision_tree is read by the model-comparison table further down.
acc_decision_tree = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred, target_names=iris.target_names))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        11
 versicolor       0.79      1.00      0.88        15
  virginica       1.00      0.79      0.88        19

avg / total       0.93      0.91      0.91        45



In [15]:
# K-nearest-neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Each prediction is decided by the 7 closest training samples.
knn = KNeighborsClassifier(n_neighbors=7).fit(x_train, y_train)
y_pred = knn.predict(x_test)

# acc_knn is read by the model-comparison table further down.
acc_knn = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred, target_names=iris.target_names))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        11
 versicolor       0.83      1.00      0.91        15
  virginica       1.00      0.84      0.91        19

avg / total       0.94      0.93      0.93        45



In [21]:
# Linear classifier trained with stochastic gradient descent
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score

# With the default random_state=None the training data is shuffled differently
# on every run, so the scores change from run to run. Fix the seed (same value
# as the train/test split) so the reported accuracy is reproducible.
sgd = SGDClassifier(tol=1e-3, random_state=33)
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_test)

# acc_sgd is read by the model-comparison table further down.
acc_sgd = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred, target_names=iris.target_names))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        11
 versicolor       0.82      0.93      0.87        15
  virginica       0.94      0.84      0.89        19

avg / total       0.92      0.91      0.91        45



In [22]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# A forest draws random bootstrap samples and random feature subsets, so an
# unseeded forest can score differently on every run. Seed it with the same
# value used for the train/test split so the reported accuracy is reproducible.
random_forest = RandomForestClassifier(n_estimators=100, random_state=33)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)

# acc_random_forest is read by the model-comparison table further down.
acc_random_forest = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred, target_names=iris.target_names))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        11
 versicolor       0.88      1.00      0.94        15
  virginica       1.00      0.89      0.94        19

avg / total       0.96      0.96      0.96        45



In [25]:
# Pair each model's display name with its test-set accuracy so the two
# columns cannot drift out of alignment.
model_scores = [
    ('Logistic Regression', acc_lr),
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Naive Bayes', acc_gaussian),
    ('Decision Tree', acc_decision_tree),
    ('RandomForestClassifier', acc_random_forest),
    ('SGDClassifier', acc_sgd),
]
model_names, model_accuracies = zip(*model_scores)

models = pd.DataFrame({
    'Model': list(model_names),
    'Accuracy': list(model_accuracies),
})
# Show the ranking, best model first.
models.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Model
1,0.955556,Support Vector Machines
3,0.955556,Naive Bayes
5,0.955556,RandomForestClassifier
2,0.933333,KNN
4,0.911111,Decision Tree
6,0.911111,SGDClassifier
0,0.888889,Logistic Regression


In [27]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc  ###计算roc和auc
%matplotlib inline

lr_y_proba=lr.predict_proba(x_test)
lr_fpr,lr_tpr,lr_threasholds=roc_curve(y_test,lr_y_proba[:,1])
lr_roc_auc = auc(lr_fpr,lr_tpr)

svc_y_proba=svc.predict_proba(x_test)
svc_fpr,svc_tpr,svc_threasholds=roc_curve(y_test,svc_y_proba[:,1])
svc_roc_auc = auc(svc_fpr,svc_tpr)

knn_y_proba=knn.predict_proba(x_test)
knn_fpr,knn_tpr,knn_threasholds=roc_curve(y_test,knn_y_proba[:,1])
knn_roc_auc = auc(knn_fpr,knn_tpr)

gnb_y_proba=gnb.predict_proba(x_test)
gnb_fpr,gnb_tpr,gnb_threasholds=roc_curve(y_test,gnb_y_proba[:,1])
gnb_roc_auc = auc(gnb_fpr,gnb_tpr)

dt_y_proba=decision_tree.predict_proba(x_test)
dt_fpr,dt_tpr,dt_threasholds=roc_curve(y_test,dt_y_proba[:,1])
dt_roc_auc = auc(dt_fpr,dt_tpr)

rfc_y_proba=rfc.predict_proba(x_test)
rfc_fpr,rfc_tpr,rfc_threasholds=roc_curve(y_test,rfc_y_proba[:,1])
rfc_roc_auc = auc(rfc_fpr,rfc_tpr)

sgd_y_proba=sgd.predict_proba(x_test)
sgd_fpr,sgd_tpr,sgd_threasholds=roc_curve(y_test,sgd_y_proba[:,1])
sgd_roc_auc = auc(sgd_fpr,sgd_tpr)

plt.figure()
lw = 2
plt.figure(figsize=(10,10))

plt.plot(lr_fpr, lr_tpr, color='red',
         lw=lw, label='ROC curve-lr (area = %0.2f)' % lr_roc_auc) ###假正率为横坐标，真正率为纵坐标做曲线

plt.plot(svc_fpr, svc_tpr, color='darkorange',
         lw=lw, label='ROC curve-svc (area = %0.2f)' % svc_roc_auc) 

plt.plot(knn_fpr, knn_tpr, color='aqua',
         lw=lw, label='ROC curve-knn (area = %0.2f)' % knn_roc_auc) 

plt.plot(gnb_fpr, gnb_tpr, color='burlywood',
         lw=lw, label='ROC curve-gnb (area = %0.2f)' % gnb_roc_auc) 

plt.plot(dt_fpr, dt_tpr, color='coral',
         lw=lw, label='ROC curve-dt (area = %0.2f)' % dt_roc_auc) 

plt.plot(rfc_fpr, rfc_tpr, color='darkgreen',
         lw=lw, label='ROC curve-rfc (area = %0.2f)' % rfc_roc_auc) 

plt.plot(sgd_fpr, sgd_tpr, color='black',
         lw=lw, label='ROC curve-sgd (area = %0.2f)' % sgd_roc_auc) 

plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()


ValueError: Data is not binary and pos_label is not specified