In [1]:
import time
from sklearn.datasets import load_iris, fetch_20newsgroups, fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.neighbors import KNeighborsClassifier # KNN分类器
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

load直接加载的内存的，数据集比较小，并不会保存到本地磁盘
fetch数据集比较大，下载下来后会存在本地磁盘，下一次就不会再连接sklearn的服务器

In [2]:
from sklearn.datasets import load_iris

# 加载鸢尾花数据集
li = load_iris()

# 打印特征数据的类型
print("获取特征值")
print(type(li.data))  # 输出特征数据的类型，应该是 numpy.ndarray 类型
print('-' * 50)

# 打印数据集的形状，展示样本量和特征数量
print(li.data.shape)  # 输出数据的形状：150个样本，4个特征
# 150行，每行代表一个样本，每列代表一个特征

获取特征值
<class 'numpy.ndarray'>
--------------------------------------------------
(150, 4)


In [3]:
print("目标值：")
print(li.target) # 输出目标值
print('-' * 50)
print(li.DESCR)  # 数据集的描述
print('-' * 50)
print(li.feature_names)  # 重点,特征名字
print('-' * 50)
print(li.target_names) # 目标名字

目标值：
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
--------------------------------------------------
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0 

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# 加载鸢尾花数据集
li = load_iris()

# 使用train_test_split将数据集拆分为训练集和测试集
# test_size=0.25 表示将25%的数据用作测试集，剩余的75%作为训练集
# random_state=1 确保每次分割时得到相同的随机数据（便于复现）
x_train, x_test, y_train, y_test = train_test_split(li.data, li.target, test_size=0.25, random_state=1)

# 输出训练集特征数据的形状
print("训练集特征值shape：", x_train.shape)
print('-' * 50)

# 输出测试集特征数据的形状
print("测试集特征值shape：", x_test.shape)

训练集特征值shape： (112, 4)
--------------------------------------------------
测试集特征值shape： (38, 4)


In [5]:
# 下面是比较大的数据，需要下载一会，20类新闻
#subset代表下载的数据集类型，默认是train，只有训练集
news = fetch_20newsgroups(subset='all', data_home='G:\\wangdao_python_short_class\\python_ml\\data')
# print(news.feature_names)  #这个数据集是没有的，因为没有特征，只有文本数据
# print(news.DESCR)
print('第一个样本')
print(news.data[0])

第一个样本
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [6]:
# 打印特征数据的类型
print('特征类型')
print(type(news.data))  # 输出新闻数据类型，应该是一个列表（list）类型
print('-' * 50)

print(len(news.target_names))
print('-' * 50)

# 打印前15个目标标签
print(news.target[0:15])  # 打印前15个样本的目标标签（类别）
print('-' * 50)

# 使用pprint打印所有目标类别的名称
print(list(news.target_names))  # 输出20类新闻的类别名称

特征类型
<class 'list'>
--------------------------------------------------
20
--------------------------------------------------
[10  3 17  3  4 12  4 10 10 19 19 11 19 13  0]
--------------------------------------------------
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [7]:
print('-' * 50)
print(len(news.data))
print('新闻所有的标签')
print(news.target)
print('-' * 50)
print(min(news.target), max(news.target))

--------------------------------------------------
18846
新闻所有的标签
[10  3 17 ...  3  1  7]
--------------------------------------------------
0 19


In [8]:
# california housing数据集
house = fetch_california_housing(data_home='G:\\wangdao_python_short_class\\python_ml\\data')
print("获取特征值")
print(house.data[0])  #第一个样本特征值
print('样本的形状')
print(house.data.shape)
print('-' * 50)

获取特征值
[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
样本的形状
(20640, 8)
--------------------------------------------------


In [9]:
print("目标值")
print(house.target[0:10])
print('-' * 50)
print(house.DESCR)
print('-' * 50)
print(house.feature_names)
print('-' * 50)

目标值
[4.526 3.585 3.521 3.413 3.422 2.697 2.992 2.414 2.267 2.611]
--------------------------------------------------
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in h

## 分类器和预估器

### K近邻算法
- 欧式距离：勾股定理 
- 脱敏： 去除敏感信息，如手机号，身份证号等
- 当axis=1时，数组的变化是横向的，体现出列的增加或者减少。反之，当axis=0时，数组的变化是纵向的，体现出行的增加或减少。

In [10]:
# K近邻
"""
K-近邻预测用户签到位置
:return:None
"""
# 读取数据
data = pd.read_csv("G:/wangdao_python_short_class/python_ml/data/FBlocation/train.csv")
print(data.head(10))
print(data.shape)
print(data.info())

# 处理数据
# 缩小数据,查询数据,为了减少计算时间
data = data.query("x > 1.0 &  x < 1.25 & y > 2.5 & y < 2.75")

   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949
5       5  3.8099  1.9586        75  178065  6289802927
6       6  6.3336  4.3720        13  666829  9931249544
7       7  5.7409  6.7697        85  369002  5662813655
8       8  4.3114  6.9410         3  166384  8471780938
9       9  6.3414  0.0758        65  400060  1253803156
(29118021, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29118021 entries, 0 to 29118020
Data columns (total 6 columns):
 #   Column    Dtype  
---  ------    -----  
 0   row_id    int64  
 1   x         float64
 2   y         float64
 3   accuracy  int64  
 4   time      int64  
 5   place_id  int64  
dtypes: float64(2), int64(4)
memory usage: 1.3 GB
None


In [11]:
print(data.shape)
data.describe()

(17710, 6)


Unnamed: 0,row_id,x,y,accuracy,time,place_id
count,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0
mean,14505690.0,1.122538,2.632309,82.482101,397551.263128,5129895000.0
std,8353805.0,0.077086,0.070144,113.613227,234601.097883,2357399000.0
min,600.0,1.0001,2.5001,1.0,119.0,1012024000.0
25%,7327816.0,1.0492,2.5738,25.0,174069.75,3312464000.0
50%,14430710.0,1.1233,2.6423,62.0,403387.5,5261906000.0
75%,21634630.0,1.1905,2.6878,75.0,602111.75,6766325000.0
max,29112150.0,1.2499,2.7499,1004.0,786218.0,9980711000.0


In [12]:
# 处理时间的数据
time_value = pd.to_datetime(data['time'], unit='s') # uint是单位，s是秒
print(time_value.head(10))  # 最大时间是1月10号

600    1970-01-01 18:09:40
957    1970-01-10 02:11:10
4345   1970-01-05 15:08:02
4735   1970-01-06 23:03:03
5580   1970-01-09 11:26:50
6090   1970-01-02 16:25:07
6234   1970-01-04 15:52:57
6350   1970-01-01 10:13:36
7468   1970-01-09 15:26:06
8478   1970-01-08 23:52:02
Name: time, dtype: datetime64[ns]


In [13]:
# 把日期格式转换成 字典格式，把年，月，日，时，分，秒转换为字典格式
time_value = pd.DatetimeIndex(time_value)
print(time_value[0:10])

DatetimeIndex(['1970-01-01 18:09:40', '1970-01-10 02:11:10',
               '1970-01-05 15:08:02', '1970-01-06 23:03:03',
               '1970-01-09 11:26:50', '1970-01-02 16:25:07',
               '1970-01-04 15:52:57', '1970-01-01 10:13:36',
               '1970-01-09 15:26:06', '1970-01-08 23:52:02'],
              dtype='datetime64[ns]', name='time', freq=None)


In [14]:
data.shape

(17710, 6)

In [15]:
print('-' * 50)
# 构造一些特征，执行的警告是因为我们的操作是复制，loc是直接放入
print(type(data))
# data['day'] = time_value.day
# data['hour'] = time_value.hour
# data['weekday'] = time_value.weekday
# 日期，是否是周末，小时对于个人行为的影响是较大的(例如吃饭时间去饭店，看电影时间去电影院等),所以才做下面的处理
data.insert(data.shape[1], 'day', time_value.day) # data.shape[1]是代表插入到最后的意思,一个月的哪一天
data.insert(data.shape[1], 'hour', time_value.hour) # 是否去一个地方打卡，早上，中午，晚上是有影响的
data.insert(data.shape[1], 'weekday', time_value.weekday) # 0代表周一，6代表周日，星期几

# 把时间戳特征删除
data = data.drop(['time'], axis=1)
print('-' * 50)
data.head()

--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
--------------------------------------------------


Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
600,600,1.2214,2.7023,17,6683426742,1,18,3
957,957,1.1832,2.6891,58,6683426742,10,2,5
4345,4345,1.1935,2.655,11,6889790653,5,15,0
4735,4735,1.1452,2.6074,49,6822359752,6,23,1
5580,5580,1.0089,2.7287,19,1527921905,9,11,4


In [16]:
#星期天，实际weekday的值是6
per = pd.Period('1970-01-01 18:00', 'h')
per.weekday

3

In [17]:
#观察数据，看下是否有空值，异常值
data.describe()

Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
count,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0
mean,14505690.0,1.122538,2.632309,82.482101,5129895000.0,5.101863,11.485545,3.092377
std,8353805.0,0.077086,0.070144,113.613227,2357399000.0,2.709287,6.932195,1.680218
min,600.0,1.0001,2.5001,1.0,1012024000.0,1.0,0.0,0.0
25%,7327816.0,1.0492,2.5738,25.0,3312464000.0,3.0,6.0,2.0
50%,14430710.0,1.1233,2.6423,62.0,5261906000.0,5.0,12.0,3.0
75%,21634630.0,1.1905,2.6878,75.0,6766325000.0,7.0,17.0,4.0
max,29112150.0,1.2499,2.7499,1004.0,9980711000.0,10.0,23.0,6.0


In [18]:
# # 把签到数量少于n个目标位置删除，place_id是标签，即目标值
place_count = data.groupby('place_id').count()
place_count

Unnamed: 0_level_0,row_id,x,y,accuracy,day,hour,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1012023972,1,1,1,1,1,1,1
1057182134,1,1,1,1,1,1,1
1059958036,3,3,3,3,3,3,3
1085266789,1,1,1,1,1,1,1
1097200869,1044,1044,1044,1044,1044,1044,1044
...,...,...,...,...,...,...,...
9904182060,1,1,1,1,1,1,1
9915093501,1,1,1,1,1,1,1
9946198589,1,1,1,1,1,1,1
9950190890,1,1,1,1,1,1,1


In [19]:
place_count['x'].describe() #打卡地点总计805个，50%打卡小于2次

count     805.000000
mean       22.000000
std        88.955632
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max      1044.000000
Name: x, dtype: float64

In [20]:
# 把index变为0,1,2，3,4,5,6这种效果，从零开始排，原来的index是row_id
# 只选择去的人大于3的数据，认为1,2,3的是噪音，这个地方去的人很少，不用推荐给其他人
tf = place_count[place_count.row_id > 3].reset_index()
tf  #剩余的签到地点

Unnamed: 0,place_id,row_id,x,y,accuracy,day,hour,weekday
0,1097200869,1044,1044,1044,1044,1044,1044,1044
1,1228935308,120,120,120,120,120,120,120
2,1267801529,58,58,58,58,58,58,58
3,1278040507,15,15,15,15,15,15,15
4,1285051622,21,21,21,21,21,21,21
...,...,...,...,...,...,...,...,...
234,9741307878,5,5,5,5,5,5,5
235,9753855529,21,21,21,21,21,21,21
236,9806043737,6,6,6,6,6,6,6
237,9809476069,23,23,23,23,23,23,23


In [21]:
# 根据设定的地点目标值，对原本的样本进行过滤
#isin可以过滤某一列要在一组值
data = data[data['place_id'].isin(tf.place_id)]
data.shape

(16918, 8)

In [22]:
# 取出数据当中的特征值和目标值
y = data['place_id']
# 删除目标值，保留特征值
x = data.drop(['place_id'], axis=1)
# 删除无用的特征值，row_id是索引，这就是噪音
x = x.drop(['row_id'], axis=1)
print(x.shape)
print(x.columns)
# 预处理完成

(16918, 6)
Index(['x', 'y', 'accuracy', 'day', 'hour', 'weekday'], dtype='object')


In [23]:
# 进行数据的分割训练集合测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1) # y为目标值

# 特征工程（标准化）,下面3行注释，一开始我们不进行标准化，看下效果，目标值要不要标准化？
std = StandardScaler()

# 对训练集的特征值进行标准化,服务于knn fit
x_train = std.fit_transform(x_train)
# transform返回的是copy，不在原有的输入对象中去修改
# print(id(x_test))
print(std.mean_)
print(std.var_)

# 对测试集的特征值进行标准化
x_test = std.transform(x_test)  #transfrom不再进行均值和方差的计算，是在原有的基础上去标准化
print('-' * 50)
# print(id(x_test))
print(std.mean_)
print(std.var_)

[ 1.12295735  2.63237278 81.34938525  5.10064628 11.44293821  3.10135561]
[5.98489138e-03 4.86857391e-03 1.19597480e+04 7.32837915e+00
 4.83742660e+01 2.81838404e+00]
--------------------------------------------------
[ 1.12295735  2.63237278 81.34938525  5.10064628 11.44293821  3.10135561]
[5.98489138e-03 4.86857391e-03 1.19597480e+04 7.32837915e+00
 4.83742660e+01 2.81838404e+00]


In [24]:
x_train.shape

(12688, 6)

In [25]:
# 进行算法流程 
# 超参数，可以通过设置n_neighbors=5，来调整结果好坏
knn = KNeighborsClassifier(n_neighbors=5) 

# fit， predict,score，训练，knn的fit是不训练的，只是把训练集的特征值和目标值放入到内存中
knn.fit(x_train, y_train)

# 得出预测结果
y_predict = knn.predict(x_test)
print("预测的目标签到位置为：", y_predict[0:10]) 

# 得出准确率,是评估指标
print("预测的准确率:", knn.score(x_test, y_test))
# print(y_predict)
# y_test

预测的目标签到位置为： [5689129232 1097200869 2355236719 9632980559 6424972551 4022692381
 8048985799 3533177779 1435128522 3312463746]
预测的准确率: 0.4806146572104019


In [26]:
print(max(time_value))

1970-01-10 02:23:38


近似误差：对现有训练集的训练误差，对训练输入的数据进行预测，也会有误差
<br>
估计误差：对测试集的测试误差
<br>
过拟合： 训练集的误差很小，但是测试集的误差很大，模型过于复杂，无法泛化到新的数据
<br>
k值的影响（调超参）：
- 过小：容易受到异常点的影响
- 过大：容易受最近数据的影响，导致比例变化
<br>
优点：训练时间复杂度为0
<br>
缺点：预测时间复杂度高，内存占用大

### 网格搜索

In [27]:
# 网格搜索时讲解
# 构造一些参数（超参）的值进行搜索
param = {"n_neighbors": [3, 5, 10, 12, 15],'weights':['uniform', 'distance']} # weights是权重的选择，uniform是均匀权重，distance是距离权重

# 进行网格搜索，cv=3是3折交叉验证（用其中2折训练，1折验证）
gc = GridSearchCV(knn, param_grid=param, cv=3)

gc.fit(x_train, y_train)  

# 预测准确率，为了给大家看看
print("在测试集上准确率：", gc.score(x_test, y_test))

print("在交叉验证当中最好的结果：", gc.best_score_) #最好的结果

print("选择最好的模型是：", gc.best_estimator_) #最好的模型,告诉你用了哪些参数

print("每个超参数每次交叉验证的结果：", gc.cv_results_) #每个超参数每次交叉验证的结果



在测试集上准确率： 0.49763593380614657
在交叉验证当中最好的结果： 0.4816362349278435
选择最好的模型是： KNeighborsClassifier(n_neighbors=12, weights='distance')
每个超参数每次交叉验证的结果： {'mean_fit_time': array([0.00699862, 0.00849517, 0.00835848, 0.0091575 , 0.00785613,
       0.00666674, 0.0063351 , 0.00633399, 0.00666936, 0.00700013]), 'std_fit_time': array([2.97996732e-06, 1.46926450e-03, 5.84729654e-04, 1.62229212e-03,
       1.96489956e-03, 4.71595499e-04, 4.69973935e-04, 4.71089867e-04,
       4.70419466e-04, 1.12391596e-07]), 'mean_score_time': array([0.12224325, 0.05468035, 0.1473    , 0.09738692, 0.14430706,
       0.07766612, 0.13986429, 0.081333  , 0.14520264, 0.10575755]), 'std_score_time': array([0.0061324 , 0.00608711, 0.01891237, 0.0190368 , 0.01630402,
       0.00205409, 0.00146997, 0.00046778, 0.00246448, 0.01080078]), 'param_n_neighbors': masked_array(data=[3, 3, 5, 5, 10, 10, 12, 12, 15, 15],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
      

### 准确率、精确率和召回率
准确率（accuracy）：所有预测正确的样本数除以总样本数((TP+TN)/(TP+TN+FP+FN))
<br>
精确率（precision）：预测为正的样本中真正为正的样本数除以所有预测为正的样本数(TP/(TP+FP))
<br>
召回率（recall）：真正为正的样本中预测为正的样本数除以所有真正为正的样本数(TP/(TP+FN))
<br>
F1-score：精确率和召回率的调和平均数，F1-score= 2TP/(2TP+FP+FN) 既考虑精确率又考虑召回率

|       | 正例       | 假例       |
|-------|------------|------------|
| 正例  | 真正例 TP  | 伪反例 FN |
| 假例  | 伪正例 FP  | 真反例 TN |

### 朴素贝叶斯

In [28]:
"""
朴素贝叶斯进行文本分类
:return: None
"""
# 二十类新闻数据集
news = fetch_20newsgroups(subset='all', data_home='G:/wangdao_python_short_class/python_ml/data')

print(len(news.data))  #样本数，包含的特征
print('-'*50)
print(news.data[0]) #第一个样本 特征
print('-'*50)
print(news.target) #标签
print(np.unique(news.target)) #标签的类别
print(news.target_names) #标签的名字

18846
--------------------------------------------------
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


----------------------------------------

In [29]:
print('-'*50)
# 进行数据分割
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=1)

# 对数据集进行特征抽取
tf = TfidfVectorizer()

# 以训练集当中的词的列表进行每篇文章重要性统计['a','b','c','d']
x_train = tf.fit_transform(x_train)

#针对特征内容，可以自行打印，下面的打印可以得到特征数目，总计有15万特征
print(len(tf.get_feature_names_out()))

--------------------------------------------------
153196


In [30]:
print(tf.get_feature_names_out()[100000])
print(tf.get_feature_names_out()[0:10])
print(tf.get_feature_names_out()[100000:100000+10])

murky
['00' '000' '0000' '00000' '0000000004' '0000000005' '0000000667'
 '0000001200' '000003' '000005102000']
['murky' 'murmurs' 'murnane' 'murph' 'murphey' 'murphy' 'murr11' 'murray'
 'murray_craven' 'murrayfield']


In [31]:
import time
# 进行朴素贝叶斯算法的预测,alpha是拉普拉斯平滑系数，分子和分母加上一个系数，分母加alpha*特征词数目
mlt = MultinomialNB(alpha=1.0)

# print(x_train.toarray())
#统计训练时间
start=time.time()
mlt.fit(x_train, y_train) # 训练模型
end=time.time()
end-start 

0.0889890193939209

In [32]:
x_transform_test = tf.transform(x_test)  #特征数目不发生改变
print(len(tf.get_feature_names_out())) #查看特征数目

153196


In [33]:
start=time.time()
y_predict = mlt.predict(x_transform_test)

print("预测的前面10篇文章类别为：", y_predict[0:10])

# 得出准确率,这个是很难提高准确率，为什么呢？
print("准确率为：", mlt.score(x_transform_test, y_test))
end=time.time()
print("预测时间：", end-start)

预测的前面10篇文章类别为： [16 19 18  1  9 15  1  2 16 13]
准确率为： 0.8518675721561969
预测时间： 0.0379946231842041


In [34]:
#预测的文章数目
len(y_predict)

4712

In [35]:
# 目前这个场景我们不需要召回率，support是真实的为那个类别的有多少个样本
print(classification_report(y_test, y_predict,target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.91      0.77      0.83       199
           comp.graphics       0.83      0.79      0.81       242
 comp.os.ms-windows.misc       0.89      0.83      0.86       263
comp.sys.ibm.pc.hardware       0.80      0.83      0.81       262
   comp.sys.mac.hardware       0.90      0.88      0.89       234
          comp.windows.x       0.92      0.85      0.88       230
            misc.forsale       0.96      0.67      0.79       257
               rec.autos       0.90      0.87      0.88       265
         rec.motorcycles       0.90      0.95      0.92       251
      rec.sport.baseball       0.89      0.96      0.93       226
        rec.sport.hockey       0.95      0.98      0.96       262
               sci.crypt       0.76      0.97      0.85       257
         sci.electronics       0.84      0.80      0.82       229
                 sci.med       0.97      0.86      0.91       249
         

In [36]:
print(y_test.shape) # 测试集中有多少 样本
y_test1 = np.where(y_test == 0, 1, 0)
print(y_test1.sum()) # label为0的样本数，即真实结果为正例(TP+FN)
y_predict1 = np.where(y_predict == 0, 1, 0)
print(y_predict1.sum()) # 预测结果为正例的样本数(TP+FP)

(4712,)
199
168


In [37]:
print((y_test1*y_predict1).sum()) # 真实结果为正例，预测结果为正例的样本数,即真正例数(TP)

153


In [38]:
#                            precision  recall  f1-score   support
#            alt.atheism       0.91      0.77      0.83       199
# 计算精确率
print(153/168) 
# 计算召回率
print(153/199)
# 计算F1-score
print(2*153/(199+168))

0.9107142857142857
0.7688442211055276
0.8337874659400545


In [39]:
max(y_test),min(y_test) # 目标值最大最小值
# 把0-19总计20个分类，变为0和1
# 第一个参数：类别是以改为0到19的
y_test1 = np.where(y_test == 0, 1, 0)
print(y_test1.sum()) #label为0的样本数
y_predict1 = np.where(y_predict == 0, 1, 0)
print(y_predict1.sum())
# roc_auc_score的y_test只能是二分类,针对多分类如何计算AUC
print("AUC指标：", roc_auc_score(y_test1, y_predict1))

199
168
AUC指标： 0.8827602448315142


* 优点：
- 计算简单，速度快
- 适用于文本分类
<br>
* 缺点：
- 需要知道先验概率，如果先验概率不准确，会影响预测结果
- 对缺失值不敏感

### 决策树

In [40]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [41]:
# 熵越小，确定性越大，把信息增益大的特征放到靠上的叶子节点
# 信息增益：信息增益=熵-条件熵
print(np.log2(1/32))
print(1 / 2 * np.log2(1 /2) + 1 / 2 * np.log2(1 /2))
print(1 / 3 * np.log2(1 / 3) + 2 / 3 * np.log2(2 / 3))
print(0.01 * np.log2(0.01) + 0.99 * np.log2(0.99))

-5.0
-1.0
-0.9182958340544896
-0.08079313589591118


In [42]:
"""
决策树对泰坦尼克号进行预测生死
:return: None
"""
# 获取数据
titan = pd.read_csv("G:/wangdao_python_short_class/python_ml/data/titanic.txt")
titan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   row.names  1313 non-null   int64  
 1   pclass     1313 non-null   object 
 2   survived   1313 non-null   int64  
 3   name       1313 non-null   object 
 4   age        633 non-null    float64
 5   embarked   821 non-null    object 
 6   home.dest  754 non-null    object 
 7   room       77 non-null     object 
 8   ticket     69 non-null     object 
 9   boat       347 non-null    object 
 10  sex        1313 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 113.0+ KB


In [43]:
# 处理数据，找出特征值和目标值
x = titan[['pclass', 'age', 'sex']]
y = titan['survived']
print(x.info())  # 用来判断是否有空值
print(x.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  1313 non-null   object 
 1   age     633 non-null    float64
 2   sex     1313 non-null   object 
dtypes: float64(1), object(2)
memory usage: 30.9+ KB
None
       pclass         age   sex
count    1313  633.000000  1313
unique      3         NaN     2
top       3rd         NaN  male
freq      711         NaN   850
mean      NaN   31.194181   NaN
std       NaN   14.747525   NaN
min       NaN    0.166700   NaN
25%       NaN   21.000000   NaN
50%       NaN   30.000000   NaN
75%       NaN   41.000000   NaN
max       NaN   71.000000   NaN


In [44]:
# 一定要进行缺失值处理,填为均值
mean=x['age'].mean()
x.loc[:,'age']=x.loc[:,'age'].fillna(mean)

In [45]:
print(x.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  1313 non-null   object 
 1   age     1313 non-null   float64
 2   sex     1313 non-null   object 
dtypes: float64(1), object(2)
memory usage: 30.9+ KB
None


In [46]:
# 分割数据集到训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=4)
print(x_train.head())

    pclass        age     sex
598    2nd  30.000000    male
246    1st  62.000000    male
905    3rd  31.194181  female
300    1st  31.194181  female
509    2nd  64.000000    male


In [47]:
#性别是女性的数量
x_train[x_train['sex'] == 'female'].count()

pclass    341
age       341
sex       341
dtype: int64

In [48]:
#女性中存活的情况对比
z=x_train.copy() #z是为了把特征和目标存储到一起
z['survived'] = y_train #把目标值存储到z中
z[z['sex'] == 'female']['survived'].value_counts() #女性中存活的情况

survived
1    230
0    111
Name: count, dtype: int64

In [49]:
z[z['sex'] == 'male']['survived'].value_counts() #男性中存活的情况

survived
0    539
1    104
Name: count, dtype: int64

In [50]:
y_train.value_counts() #没存活的是650，存活的是334

survived
0    650
1    334
Name: count, dtype: int64

In [51]:
x_train.to_dict(orient="records") #把df变为字典，样本变为一个一个的字典，字典中列名变为键， record是按行存储

[{'pclass': '2nd', 'age': 30.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 62.0, 'sex': 'male'},
 {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '2nd', 'age': 64.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '3rd', 'age': 24.0, 'sex': 'female'},
 {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '2nd', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '3rd', 'age': 21.0, 'sex': 'male'},
 {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '2nd', 'age': 23.0, 'sex': 'female'},
 {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '1st', 'age': 4

In [52]:
# 进行处理（特征工程）特征-》类别-》one_hot编码
dict = DictVectorizer(sparse=False) # sparse=False表示不用稀疏矩阵

# 这一步是对字典进行特征抽取,to_dict可以把df变为字典，records代表列名变为键
x_train = dict.fit_transform(x_train.to_dict(orient="records"))
print(type(x_train)) 
print(dict.get_feature_names_out())
print('-' * 50)
x_test = dict.transform(x_test.to_dict(orient="records"))
print(x_train)

<class 'numpy.ndarray'>
['age' 'pclass=1st' 'pclass=2nd' 'pclass=3rd' 'sex=female' 'sex=male']
--------------------------------------------------
[[30.          0.          1.          0.          0.          1.        ]
 [62.          1.          0.          0.          0.          1.        ]
 [31.19418104  0.          0.          1.          1.          0.        ]
 ...
 [34.          0.          1.          0.          0.          1.        ]
 [46.          1.          0.          0.          0.          1.        ]
 [31.19418104  0.          0.          1.          0.          1.        ]]


In [53]:
# 用决策树进行预测，修改max_depth试试,修改criterion为entropy
#树过于复杂，就会产生过拟合
dec = DecisionTreeClassifier()

#训练
dec.fit(x_train, y_train)

# 预测准确率
print("预测的准确率：", dec.score(x_test, y_test))

# 导出决策树的结构
export_graphviz(dec, out_file="tree.dot",feature_names=['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'female', 'male'])

预测的准确率： 0.8085106382978723


### sklearn.tree.DecisionTreeClassifier

```
python
class sklearn.tree.DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=None)
```

### 参数说明

#### `criterion` （特征选择标准）
- 可选值：`"gini"` 或 `"entropy"`
- 默认值：`"gini"`，即基于基尼系数（CART算法）
- 说明：
  - `"gini"`：基尼系数
  - `"entropy"`：信息增益

#### `min_samples_split` （内部节点再划分所需最小样本数）
- 默认值：`2`
- 说明：
  - 限制了子树继续划分的条件。如果某节点的样本数少于 `min_samples_split`，则不会继续选择最优特征进行划分。
  - 样本量不大时可以使用默认值；若样本量非常大，则推荐增大此值。
  - 示例：对于10万样本的数据，可以设置 `min_samples_split=10` 作为参考。

#### `min_samples_leaf` （叶子节点最少样本数）
- 默认值：`1`
- 说明：
  - 限制了叶子节点的最小样本数。如果某叶子节点的样本数小于此值，则会与其兄弟节点一起被剪枝。
  - 可输入一个整数表示样本数，或小数表示占样本总数的百分比。
  - 示例：对于10万样本的数据，可以设置 `min_samples_leaf=5` 作为参考。

#### `max_depth` （决策树最大深度）
- 默认值：`None` （不限制深度）
- 说明：
  - 限制决策树的最大深度。如果数据量或特征数量较大，建议设置此值以防止过拟合。
  - 常用值范围：`10-100` 之间，根据具体数据分布调整。

#### `random_state` （随机数种子）
- 说明：
  - 用于控制算法运行时的随机性，确保结果可复现。

---

In [54]:
#调整决策树的参数
# 分割数据集到训练集合测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=4)
# 进行处理（特征工程）特征-》类别-》one_hot编码
dict = DictVectorizer(sparse=False)

# 这一步是对字典进行特征抽取
x_train = dict.fit_transform(x_train.to_dict(orient="records"))
x_test = dict.transform(x_test.to_dict(orient="records"))

# print(x_train)
# 用决策树进行预测，修改max_depth为10，发现提升了,min_impurity_decrease带来的增益要大于0.01才会进行划分
dec = DecisionTreeClassifier(max_depth=7,min_impurity_decrease=0.01,min_samples_split=20)

dec.fit(x_train, y_train)

# 预测准确率
print("预测的准确率：", dec.score(x_test, y_test))

# 导出决策树的结构
export_graphviz(dec, out_file="tree1.dot", feature_names=dict.get_feature_names_out())

预测的准确率： 0.8206686930091185


* 优点：
- 简单的理解和解释，树木可视化。
- 需要很少的数据准备，其他技术通常需要数据归一化，标准化（决策树不需要进行归一化和标准化）
* 缺点：
- 决策树学习者可以创建不能很好地推广数据的过于复杂的树，这被称为过拟合。
- 决策树可能不稳定，因为数据的小变化可能会导致完全不同的树被生成（弱分类器）

### 随机森林
在机器学习中，随机森林是一个包含多个决策树的分类器，并且其输出的类别是由个别树输出的类别的众数而定

In [55]:
# 随机森林进行预测 （超参数调优），n_jobs充分利用多核的一个参数
rf = RandomForestClassifier(n_jobs=-1)
# 120, 200, 300, 500, 800, 1200,n_estimators森林中决策树的数目，也就是分类器的数目
# max_samples  是最大样本数
#bagging类型
param = {"n_estimators": [1500,2000, 5000], "max_depth": [2, 3, 5, 8, 15, 25]}

# 网格搜索与交叉验证
gc = GridSearchCV(rf, param_grid=param, cv=3)

gc.fit(x_train, y_train)

print("准确率：", gc.score(x_test, y_test))

print("查看选择的参数模型：", gc.best_params_)

print("选择最好的模型是：", gc.best_estimator_)

准确率： 0.8328267477203647
查看选择的参数模型： {'max_depth': 3, 'n_estimators': 1500}
选择最好的模型是： RandomForestClassifier(max_depth=3, n_estimators=1500, n_jobs=-1)


In [56]:
print("每个超参数每次交叉验证的结果：", gc.cv_results_)

每个超参数每次交叉验证的结果： {'mean_fit_time': array([1.53045042, 2.02830839, 5.07205017, 1.53012013, 2.04756331,
       5.11021352, 1.54488961, 2.04744474, 5.11406994, 1.54032962,
       2.06378675, 5.17004418, 1.55411267, 2.2410535 , 5.20420702,
       1.52999679, 2.09405859, 5.16317932]), 'std_fit_time': array([0.01688135, 0.00866311, 0.05436627, 0.00933741, 0.00853653,
       0.03394131, 0.01324001, 0.01162367, 0.01661884, 0.00306726,
       0.00549089, 0.04053249, 0.00093875, 0.14319464, 0.05249656,
       0.01168311, 0.06614526, 0.0487279 ]), 'mean_score_time': array([0.16769179, 0.21574179, 0.52160756, 0.16409111, 0.21601836,
       0.52855094, 0.16393654, 0.23636556, 0.5215981 , 0.16609478,
       0.21871098, 0.54130363, 0.17016832, 0.23139437, 0.52047968,
       0.16340605, 0.2339433 , 0.53352563]), 'std_score_time': array([0.00370046, 0.00106293, 0.00059051, 0.00052359, 0.00061155,
       0.00464492, 0.00063767, 0.02216718, 0.00177232, 0.00404266,
       0.0046698 , 0.01486683, 0.00489894