In [1]:
import pandas as pd

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.metrics import classification_report

In [6]:
# Load the data set from a local CSV file.
# NOTE(review): this absolute Windows path only resolves on a machine that has
# D:/input/car.csv; the recorded run of this cell failed with FileNotFoundError.
car = pd.read_csv("D:/input/car.csv")
# Display the first five rows of the data set.
car.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'D:/input/car.csv'

In [None]:
# Inspect the size of the data set as (rows, columns).
car.shape

In [None]:
# Print how many rows fall into each distinct value of the "acceptance" column.
print(car["acceptance"].value_counts())

In [None]:
# Print the share of rows for each "acceptance" value:
# the per-value count divided by the total number of rows.
print(car["acceptance"].value_counts().div(car.shape[0]))

In [None]:
# Work on a copy so the raw frame `car` is left untouched.
car_num_encoded = car.copy()
# Integer code for each "buying" label, in the order low < med < high < vhigh.
buying_dict = dict(zip(["low", "med", "high", "vhigh"], range(4)))
# Swap every label for its code; labels absent from the dictionary become NaN.
car_num_encoded["buying"] = car_num_encoded["buying"].map(buying_dict)

In [None]:
# Show the first five values of the encoded "buying" column.
car_num_encoded["buying"].head(5)

In [None]:
# Numeric (ordinal) encoding of the "maint" feature: low < med < high < vhigh.
maint_dict = {"low": 0, "med": 1, "high": 2, "vhigh": 3}
# Map from the raw frame `car` rather than from car_num_encoded so the cell can
# be re-run safely: mapping the already-encoded integers through this
# dictionary would silently turn the whole column into NaN.
car_num_encoded["maint"] = car["maint"].map(maint_dict)

In [None]:
# Numeric (ordinal) encoding of the "lug_boot" feature: small < med < big.
lug_boot_dict = {"small": 0, "med": 1, "big": 2}
# Map from the raw frame `car` rather than from car_num_encoded so the cell can
# be re-run safely: mapping the already-encoded integers through this
# dictionary would silently turn the whole column into NaN.
car_num_encoded["lug_boot"] = car["lug_boot"].map(lug_boot_dict)

In [None]:
# Numeric (ordinal) encoding of the "safety" feature: low < med < high.
safety_dict = {"low": 0, "med": 1, "high": 2}
# Map from the raw frame `car` rather than from car_num_encoded so the cell can
# be re-run safely: mapping the already-encoded integers through this
# dictionary would silently turn the whole column into NaN.
car_num_encoded["safety"] = car["safety"].map(safety_dict)

In [None]:
# Show the first five rows of the data set after encoding.
car_num_encoded.head(5)

In [None]:
# In the door-count column, replace the label "5more" with "6".
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `car_num_encoded.doors`: that form mutates a
# temporary Series (chained assignment), which pandas warns about and which no
# longer updates the DataFrame once Copy-on-Write is enabled.
car_num_encoded["doors"] = car_num_encoded["doors"].replace("5more", "6")
# In the passenger-capacity column, replace the label "more" with "6".
car_num_encoded["persons"] = car_num_encoded["persons"].replace("more", "6")

In [None]:
# Print the number of rows per distinct value of the "doors" column.
print(car_num_encoded["doors"].value_counts())

In [None]:
# Print the number of rows per distinct value of the "persons" column.
print(car_num_encoded["persons"].value_counts())

In [None]:
# Print the current dtypes of the "doors" and "persons" columns.
print(car_num_encoded.doors.dtype, car_num_encoded.persons.dtype)

In [None]:
# Convert the door-count and passenger-capacity columns to integer type
# in a single astype call; all other columns keep their dtype.
car_num_encoded = car_num_encoded.astype({"doors": "int", "persons": "int"})

In [None]:
# Print the dtypes of the "doors" and "persons" columns again to check the cast.
print(car_num_encoded.doors.dtype, car_num_encoded.persons.dtype)

In [None]:
# Show the first five rows of the numerically encoded data set.
car_num_encoded.head(5)

In [None]:
# Build a label encoder and let it learn the distinct "buying" labels.
buying_encoder = LabelEncoder().fit(car["buying"])
# Turn the raw "buying" labels into their integer codes.
encoded_buying = buying_encoder.transform(car["buying"])

In [None]:
# Translate the integer codes 0-3 back into the labels the encoder assigned them to.
buying_encoder.inverse_transform([0, 1, 2, 3])

In [None]:
# Preview the one-hot (dummy) columns generated for "buying" (first five rows).
# The prefix "buying_" is joined to each label with get_dummies' default "_"
# separator, so the resulting column names contain a double underscore.
pd.get_dummies(car["buying"], prefix="buying_").head(5)

In [None]:
# Start with an empty frame for the one-hot encoded data set.
car_onehot_encoded = pd.DataFrame()

In [None]:
# One-hot encode every feature column of the raw frame and join the pieces
# side by side with a single concat. Building the frame in one step (instead
# of appending to an existing frame inside a loop) avoids repeated copying and
# keeps the cell idempotent: re-running it no longer duplicates columns.
feature_cols = ["buying", "maint", "doors", "persons", "lug_boot", "safety"]
car_onehot_encoded = pd.concat(
    # Prefix is "<column>_", matching the column names produced previously.
    [pd.get_dummies(car[col], prefix=col + "_") for col in feature_cols],
    axis=1,
)

In [None]:
# Attach the target column to the one-hot encoded features. Plain column
# assignment (aligned on the row index) is used so that re-running the cell
# overwrites the column instead of appending a second "acceptance" column.
car_onehot_encoded["acceptance"] = car["acceptance"]

In [None]:
# Show the first five rows of the one-hot encoded data set.
car_onehot_encoded.head(5)

In [None]:
# Split the numerically encoded data set.
# Features are the first six columns; the target is the "acceptance" column.
x_num = car_num_encoded.iloc[:, :6]
y_num = car_num_encoded["acceptance"]
# Keep 70% of the rows for training and the remaining 30% for testing.
# Stratifying on the target keeps the acceptance distribution the same in
# both parts; the fixed random_state makes the split reproducible.
x_train_num, x_test_num, y_train_num, y_test_num = train_test_split(
    x_num,
    y_num,
    test_size=0.3,
    stratify=y_num,
    random_state=42,
)

In [None]:
# Share of each acceptance value in the training labels.
print(y_train_num.value_counts().div(len(y_train_num)))
# Share of each acceptance value in the test labels.
print(y_test_num.value_counts().div(len(y_test_num)))

In [None]:
# Split the one-hot encoded data set.
# Features are every column except the target. Dropping the label by name
# replaces the hard-coded `iloc[:, :21]`, which silently selects the wrong
# columns whenever the number of dummy columns is not exactly 21.
x_onehot = car_onehot_encoded.drop(columns="acceptance")
y_onehot = car_onehot_encoded["acceptance"]
# Keep 70% of the rows for training and the remaining 30% for testing.
# Stratifying on the target keeps the acceptance distribution the same in
# both parts; the fixed random_state makes the split reproducible.
x_train_onehot, x_test_onehot, y_train_onehot, y_test_onehot = train_test_split(
    x_onehot,
    y_onehot,
    test_size=0.3,
    stratify=y_onehot,
    random_state=42,
)

In [None]:
# Share of each acceptance value in the training labels.
print(y_train_onehot.value_counts().div(len(y_train_onehot)))
# Share of each acceptance value in the test labels.
print(y_test_onehot.value_counts().div(len(y_test_onehot)))

In [None]:
# Create one logistic regression model per encoding.
# max_iter is raised to 5000 so the solver is allowed up to that many iterations.
num_model = LogisticRegression(max_iter=5000)
onehot_model = LogisticRegression(max_iter=5000)

In [None]:
# Train one model on the numerically encoded training set and the other on
# the one-hot encoded training set.
num_model.fit(x_train_num, y_train_num)
onehot_model.fit(x_train_onehot, y_train_onehot)

In [None]:
# Predictions of the model trained on the numeric encoding, on its test set.
y_pred_num = num_model.predict(x_test_num)
# Predictions of the model trained on the one-hot encoding, on its test set.
y_pred_onehot = onehot_model.predict(x_test_onehot)

In [None]:
# Print the classification report for the model trained on the numeric encoding.
print(classification_report(y_test_num, y_pred_num))

In [None]:
# Print the classification report for the model trained on the one-hot encoding.
print(classification_report(y_test_onehot, y_pred_onehot))

In [None]:
# Save the one-hot encoded data set (features plus "acceptance") as CSV.
# The row index is written as well because `index` is left at its default.
# NOTE(review): absolute Windows path; the D:/output directory must already exist.
car_onehot_encoded.to_csv("D:/output/one-hot.csv")