In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn.datasets as sd
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.tree as st
import sklearn.ensemble as se
import sklearn.model_selection as ms
import sklearn.metrics as sm



In [71]:
# Load car.txt with header=None and assign the seven column names explicitly.
cars = pd.read_csv(
    "car.txt",
    header=None,
    names=["price","service","door","passenger","trunk","safe","level"],
)

# Print the category frequencies of every column, each followed by a blank line.
for column_name in cars:
    print(cars[column_name].value_counts(), end="\n\n")


vhigh    432
high     432
med      432
low      432
Name: price, dtype: int64

vhigh    432
high     432
med      432
low      432
Name: service, dtype: int64

2        432
3        432
4        432
5more    432
Name: door, dtype: int64

2       576
4       576
more    576
Name: passenger, dtype: int64

small    576
med      576
big      576
Name: trunk, dtype: int64

low     576
med     576
high    576
Name: safe, dtype: int64

unacc    1210
acc       384
good       69
vgood      65
Name: level, dtype: int64



## 确定：是分类问题，还是回归问题？
答：分类，输出是离散值

## 选哪一个分类模型：逻辑回归、决策树、随机森林、AdaBoost、GBDT
答：随机森林（当然也可以尝试其他模型）

# 针对数据完成标签编码预处理

In [72]:
# Label-encode every column with scikit-learn's built-in LabelEncoder.

data = cars.copy()

# Keep each fitted encoder under its column name so the integer codes
# can be related back to the original categories later.
encoder_dict = {}
for column_name in cars.columns:
    column_encoder = sp.LabelEncoder()
    data[column_name] = column_encoder.fit_transform(cars[column_name])
    encoder_dict[column_name] = column_encoder

encoder_dict


{'price': LabelEncoder(),
 'service': LabelEncoder(),
 'door': LabelEncoder(),
 'passenger': LabelEncoder(),
 'trunk': LabelEncoder(),
 'safe': LabelEncoder(),
 'level': LabelEncoder()}

In [73]:
# Custom label encoding with caller-chosen integer codes

def my_label_encoder(data,col_name,trans_list):
    """Replace the categories of ``data[col_name]`` with caller-supplied codes, in place.

    The distinct categories are taken in ``value_counts()`` order (most
    frequent first) and the i-th category is replaced by ``trans_list[i]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; ``data[col_name]`` is overwritten.
    col_name : str
        Name of the column to encode.
    trans_list : sequence
        Codes to assign, positionally matched to the ``value_counts()`` order.
        Categories beyond ``len(trans_list)`` become NaN; surplus codes are
        ignored.

    Returns
    -------
    None
        The frame is mutated in place.
    """
    # Read the categories from the index of value_counts(): this does not
    # depend on the column names produced by reset_index(), which changed
    # between pandas versions.
    categories = data[col_name].value_counts().index
    code_by_category = dict(zip(categories, trans_list))

    # map() keeps the frame's own index, so the result lines up row-for-row
    # even when the index is not the default 0..n-1 range.
    data[col_name] = data[col_name].map(code_by_category)
    



data = cars.copy()

# One code list per column, in column order. Within a list the codes are
# positional: my_label_encoder assigns them following the value_counts()
# order of that column's categories.
column_codes = [
    [4,3,2,1],
    [4,3,2,1],
    [2,3,4,5],
    [2,4,5],
    [1,2,3],
    [1,2,3],
    [1,2,3,4],
]
for column_position, codes in enumerate(column_codes):
    my_label_encoder(data,data.columns[column_position],codes)

data
    

Unnamed: 0,price,service,door,passenger,trunk,safe,level
0,4,4,2,2,1,1,1
1,4,4,2,2,1,2,1
2,4,4,2,2,1,3,1
3,4,4,2,2,2,1,1
4,4,4,2,2,2,2,1
...,...,...,...,...,...,...,...
1723,1,1,5,5,2,2,3
1724,1,1,5,5,2,3,4
1725,1,1,5,5,3,1,1
1726,1,1,5,5,3,2,3


In [74]:
# Separate the inputs (every column except the last) from the target (the last column).
x, y = data.iloc[:,:-1], data.iloc[:,-1]

# stratify=y samples the training and test sets with the same class
# proportions, so their label distributions do not drift apart.
train_x,test_x,train_y,test_y = ms.train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
    stratify=y,
)
print(train_x.shape, test_x.shape, sep="\n")
print(test_y.value_counts())




(1382, 6)
(346, 6)
1    242
2     77
3     14
4     13
Name: level, dtype: int64


In [75]:
# Build the classifier and run 5-fold cross-validation as a sanity check;
# only if the scores look reasonable is the model trained for real.

# Alternative model to try:
# model_RF=se.GradientBoostingClassifier(max_depth=5,min_samples_split=3,n_estimators=200,random_state=7)
model_RF=se.RandomForestClassifier(max_depth=5,min_samples_split=3,n_estimators=200,random_state=7)

# Score all four metrics in ONE cross-validation pass. Calling
# cross_val_score once per metric refits the same model on the same folds
# four times; cross_validate fits each fold once and applies every scorer.
# The folds (cv=5) and the seeded estimator are unchanged, so the scores are
# the same as with four separate calls.
# Note: cross-validation runs on the full (x, y), which includes the rows
# that were also placed in the held-out test split.
# NOTE(review): cv=5 uses unshuffled stratified folds; if the rows of the
# input file are ordered, consider StratifiedKFold(shuffle=True) - confirm.
cv_metrics = ["accuracy","precision_weighted","recall_weighted","f1_weighted"]
cv_results = ms.cross_validate(model_RF,x,y,cv=5,scoring=cv_metrics)

# `scores` is rebound for each metric so it ends up holding the f1 scores.
scores = cv_results["test_accuracy"]
print("5次交叉验证精确度分数：",scores.mean())
scores = cv_results["test_precision_weighted"]
print("5次交叉验证查准率分数：",scores.mean())
scores = cv_results["test_recall_weighted"]
print("5次交叉验证召回率分数：",scores.mean())
scores = cv_results["test_f1_weighted"]
print("5次交叉验证f1分数：",scores.mean())


5次交叉验证精确度分数： 0.7761078998073218


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


5次交叉验证查准率分数： 0.750848901925902
5次交叉验证召回率分数： 0.7761078998073218
5次交叉验证f1分数： 0.755538699605307


In [76]:
# Model evaluation on the held-out split.
# To train and evaluate on the full dataset instead, rebind the splits first:
# train_x=x
# train_y=y
# test_x=x
# test_y=y

# fit() returns the estimator itself, so training and prediction can be chained.
pred_test_y = model_RF.fit(train_x,train_y).predict(test_x)

# Confusion matrix and per-class precision / recall / f1 for the test split.
m = sm.confusion_matrix(test_y,pred_test_y)
cr = sm.classification_report(test_y,pred_test_y)
print(m, cr, sep="\n")

[[235   7   0   0]
 [  4  73   0   0]
 [  0  14   0   0]
 [  0  10   0   3]]
              precision    recall  f1-score   support

           1       0.98      0.97      0.98       242
           2       0.70      0.95      0.81        77
           3       0.00      0.00      0.00        14
           4       1.00      0.23      0.38        13

    accuracy                           0.90       346
   macro avg       0.67      0.54      0.54       346
weighted avg       0.88      0.90      0.88       346



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
