In [1]:
### 导入数据集
from sklearn.datasets import load_iris
iris_dataset = load_iris()
print(iris_dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [21]:
### 将数据分割为训练集和测试集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( iris_dataset['data'], iris_dataset['target'], test_size=0.2, random_state=0)


In [22]:
### 1.线性回归
# 从 sklean 中导入函数
from sklearn import linear_model
linear = linear_model.LinearRegression()

# 使用训练集拟合模型
linear.fit(X_train, y_train)
# 检查模型得分
linear.score(X_train, y_train)

# 方程系数和截距
print('Coefficient: \n', linear.coef_)
print('Intercept: \n', linear.intercept_)

# 预测测试集结果
y_pred = linear.predict(X_test)
print("Test set predictions:\n {}".format(y_pred))

# 打印得分
print("Train set score: {:.2f}".format(linear.score(X_train, y_train)))
print("Test set score: {:.2f}".format(linear.score(X_test, y_test)))

Coefficient: 
 [-0.10627533 -0.0397204   0.22894234  0.61123074]
Intercept: 
 0.16149541375178766
Test set predictions:
 [ 2.06844113  0.96345869 -0.14707913  1.81304847 -0.03927091  2.26261445
 -0.02790835  1.31919261  1.28225236  1.11236814  1.54774304  1.3013147
  1.21150471  1.32946956  1.32977882 -0.12060477  1.35574095  1.21453128
  0.03610115 -0.02518356  1.79941543  1.39427904  0.07356109  0.02146025
  1.59050985 -0.11917603  0.14803648  1.16645127  0.90550234  0.10586947]
Train set score: 0.93
Test set score: 0.91


In [5]:
### 2.逻辑回归
# 从 sklean 中导入函数
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()

# 进行模型拟合，训练集和测试集
logistic.fit(X_train, y_train)
logistic.score(X_train, y_train)

# 方程系数和截距
print('Coefficient: \n', logistic.coef_)
print('Intercept: \n', logistic.intercept_)

#输出预测值
y_pred = logistic.predict(X_test)
print("Test set predictions:\n {}".format(y_pred))

# 打印得分
print("Train set score: {:.2f}".format(logistic.score(X_train, y_train)))
print("Test set score: {:.2f}".format(logistic.score(X_test, y_test)))


Coefficient: 
 [[-0.41737227  0.85016051 -2.33197581 -0.98816372]
 [ 0.52060603 -0.29765862 -0.22056052 -0.71101065]
 [-0.10323376 -0.55250189  2.55253633  1.69917437]]
Intercept: 
 [  9.25389214   1.75982812 -11.01372026]
Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
Train set score: 0.98
Test set score: 0.97


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
### 3.决策树
# 从 sklean 中导入函数
from sklearn import tree
tree = tree.DecisionTreeClassifier(criterion='gini') 

# 对于分类，这里您可以将算法更改为 gini 或 entropy (information gain)，默认是 gini 。
tree.fit(X_train, y_train)

# 输出预测值
y_pred = tree.predict(X_test)
print("Test set predictions:\n {}".format(y_pred))

# 打印得分
print("Test set score: {:.2f}".format(tree.score(X_train, y_train)))
print("Test set score: {:.2f}".format(tree.score(X_test, y_test)))


Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
Test set score: 1.00
Test set score: 0.97


In [11]:
### 4.支持向量机(SVM)
# 从 sklean 中导入函数
from sklearn import svm
model = svm.SVC()

# 进行模型拟合，训练集和测试集
model.fit(X_train, y_train)

# 输出预测值
predicted= model.predict(X_test)
print("Test set predictions:\n {}".format(predicted))

# 打印得分
print("Test set score: {:.2f}".format(model.score(X_train, y_train)))
print("Test set score: {:.2f}".format(model.score(X_test, y_test)))


Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
Test set score: 0.96
Test set score: 0.97


In [12]:
### 5.朴素贝叶斯
# 从 sklean 中导入函数
from sklearn.naive_bayes import GaussianNB
model = GaussianNB() 
# 多项式类还有其他分布，例如 BernoulliNB（伯努利朴素贝叶斯）

# 进行模型拟合，训练集和测试集
model.fit(X_train, y_train)
model.score(X_train, y_train)

# 输出预测值
predicted= model.predict(X_test)
print("Test set predictions:\n {}".format(predicted))

# 打印得分
print("Test set score: {:.2f}".format(model.score(X_train, y_train)))
print("Test set score: {:.2f}".format(model.score(X_test, y_test)))



Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 1]
Test set score: 0.95
Test set score: 1.00


In [21]:
### 6.K-邻近算法（KNN）
# 从 sklean 中导入函数
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier( n_neighbors = 6 ) 
# n_neighbors的默认值为5

# 进行模型拟合，训练集和测试集
model.fit(X_train, y_train)
model.score(X_train, y_train)

# 输出预测值
predicted= model.predict(X_test)
print("Test set predictions:\n {}".format(predicted))

# 打印得分
print("Test set score: {:.2f}".format(model.score(X_train, y_train)))
print("Test set score: {:.2f}".format(model.score(X_test, y_test)))



Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
Test set score: 0.96
Test set score: 0.97


In [23]:
### 7. K均值算法（K-Means)
# 从 sklean 中导入函数
from sklearn.cluster import KMeans
model = KMeans(n_clusters=3, random_state=0)

# 进行模型拟合，训练集和测试集
model.fit(X_train, y_train)
model.score(X_train, y_train)

# 输出预测值
predicted= model.predict(X_test)
print("Test set predictions:\n {}".format(predicted))

# 打印得分
print("Test set score: {:.2f}".format(model.score(X_train, y_train)))
print("Test set score: {:.2f}".format(model.score(X_test, y_test)))



Test set predictions:
 [2 2 1 0 1 0 1 2 2 2 0 2 2 2 2 1 2 2 1 1 2 2 1 1 2 1 1 2 2 1]
Test set score: -63.20
Test set score: -16.04


In [24]:
### 8.随机森林（RandomForestClassifier）
# 从 sklean 中导入函数
from sklearn.ensemble import RandomForestClassifier
model= RandomForestClassifier()

# 进行模型拟合，训练集和测试集
model.fit(X_train, y_train)
model.score(X_train, y_train)

# 输出预测值
predicted= model.predict(X_test)
print("Test set predictions:\n {}".format(predicted))

# 打印得分
print("Test set score: {:.2f}".format(model.score(X_train, y_train)))
print("Test set score: {:.2f}".format(model.score(X_test, y_test)))


Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 1 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0]
Test set score: 1.00
Test set score: 0.97


In [27]:
### 9.降维算法（Dimensionality Reduction Algorithms）

#### 9.1 主成分分析
# 从 sklean 中导入函数
from sklearn import decomposition
pca = decomposition.PCA( n_components= 3 ) 
# n_components =min(n_sample, n_features)

# 进行模型拟合，训练集
train_reduced = pca.fit_transform(X_train)

# 输出预测值
test_reduced = pca.transform(X_test)
test_reduced

#### 9.2 因子分析（Factor analysis） 
# 从 sklean 中导入函数
from sklearn import decomposition
factor = decomposition.FactorAnalysis()

# 进行模型拟合，训练集
train_reduced = factor.fit_transform(X_train)

# 输出预测值
test_reduced = factor.transform(X_test)
test_reduced

array([[ 0.79742237, -0.41759564,  0.        ,  0.        ],
       [ 0.04059616, -0.64224522,  0.        ,  0.        ],
       [-1.18977861,  1.02820387,  0.        ,  0.        ],
       [ 1.20727472,  0.20537958,  0.        ,  0.        ],
       [-1.24002151,  0.18468748,  0.        ,  0.        ],
       [ 1.14076431,  0.1155713 ,  0.        ,  0.        ],
       [-1.24947866,  0.26824411,  0.        ,  0.        ],
       [ 0.53601941,  0.26528774,  0.        ,  0.        ],
       [ 0.56598398,  0.06065337,  0.        ,  0.        ],
       [ 0.15113329, -0.14304269,  0.        ,  0.        ],
       [ 0.57174634, -0.40514062,  0.        ,  0.        ],
       [ 0.40124114,  0.24774949,  0.        ,  0.        ],
       [ 0.26553189, -0.17276726,  0.        ,  0.        ],
       [ 0.48273639, -0.04795907,  0.        ,  0.        ],
       [ 0.34482921, -0.10971106,  0.        ,  0.        ],
       [-1.34928339,  0.32626566,  0.        ,  0.        ],
       [ 0.31605615, -0.

In [None]:
### 10.梯度提升分类器(Gradient Boosing) 和 Ada Boost
# 从 sklean 中导入函数
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

# 进行模型拟合，训练集
model.fit(X_train, y_train)

# 输出预测值
predicted= model.predict(X_test)
predicted

# 打印得分
print("Test set score: {:.2f}".format(model.score(X_train, y_train)))
print("Test set score: {:.2f}".format(model.score(X_test, y_test)))