In [1]:
pip install ucimlrepo

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting ucimlrepo
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/22/47/9350b2eeeaef8c0fd3ec3505c8a0481b576845b3df0d71c76f989c23d3c6/ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Load the Wine Quality dataset through the ucimlrepo client
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 186
wine_quality = fetch_ucirepo(id=186)

# Pull the feature table and the target table out of the fetched dataset
x, y = wine_quality.data.features, wine_quality.data.targets

# Show the per-variable metadata (name, role, type, ...)
print(wine_quality.variables)

                    name     role         type demographic  \
0          fixed_acidity  Feature   Continuous        None   
1       volatile_acidity  Feature   Continuous        None   
2            citric_acid  Feature   Continuous        None   
3         residual_sugar  Feature   Continuous        None   
4              chlorides  Feature   Continuous        None   
5    free_sulfur_dioxide  Feature   Continuous        None   
6   total_sulfur_dioxide  Feature   Continuous        None   
7                density  Feature   Continuous        None   
8                     pH  Feature   Continuous        None   
9              sulphates  Feature   Continuous        None   
10               alcohol  Feature   Continuous        None   
11               quality   Target      Integer        None   
12                 color    Other  Categorical        None   

               description units missing_values  
0                     None  None             no  
1                     None  Non

In [2]:
import pandas as pd
import sklearn

# Combine features and target column-wise into a single DataFrame for easier processing
df = pd.concat([x, y], axis=1)

# Preview the first 10 rows
print(df.head(10))

# Summarize the frame (columns, dtypes, non-null counts).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line after the report.
df.info()

   fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   
5            7.4              0.66         0.00             1.8      0.075   
6            7.9              0.60         0.06             1.6      0.069   
7            7.3              0.65         0.00             1.2      0.065   
8            7.8              0.58         0.02             2.0      0.073   
9            7.5              0.50         0.36             6.1      0.071   

   free_sulfur_dioxide  total_sulfur_dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0

In [3]:
# 1.Data Cleaning
## Missing Value Handling
# Number of missing values per column
counts = df.isna().sum()
# Names of the columns that contain at least one missing value
columns = [name for name, n_missing in counts.items() if n_missing > 0]
# Drop every row that has a missing value in any of those columns
df.dropna(subset=columns, inplace=True)

## Duplicate Data Handling
# Remove rows that are exact duplicates of an earlier row
df.drop_duplicates(inplace=True)

In [4]:
# 2.Data Integration
# Derive "total acidity" as fixed acidity plus volatile acidity and store it as a new column
df['total_acidity'] = df['fixed_acidity'].add(df['volatile_acidity'])

In [5]:
# 3.Data Transformation
## Normalization
from sklearn.preprocessing import MinMaxScaler

# Rescale the "quality" column to the [0, 1] range with MinMaxScaler
scaler = MinMaxScaler()
df['quality_normalized'] = scaler.fit(df[['quality']]).transform(df[['quality']])

## Discretization
# Split the continuous "fixed_acidity" values into three equal-width bins
# labelled low / medium / high
df['fixed_acidity_level'] = pd.cut(
    df['fixed_acidity'],
    bins=3,
    labels=['low', 'medium', 'high'],
)

In [6]:
# 4.Data Reduction
## Feature Selection
from sklearn.feature_selection import f_classif

# Features: every column except the target and the columns derived in earlier steps
x = df.drop(columns=['quality', 'quality_normalized', 'total_acidity', 'fixed_acidity_level'])
# Target
y = df['quality']

# Run the ANOVA F-test of each feature against the target
f_values, p_values = f_classif(x, y)

# Collect the per-feature statistics in a DataFrame
anova_results = pd.DataFrame({
    'Feature': x.columns,
    'F-value': f_values,
    'P-value': p_values,
})

# Keep the names of the three features with the highest F-value
top_features = (
    anova_results
    .sort_values(by='F-value', ascending=False)['Feature']
    .head(3)
    .tolist()
)

In [9]:
# Print a status report for each preprocessing step so the results can be checked

# Number of missing values per column
missing_value_counts = df.isnull().sum()

# Number of duplicated rows
duplicate_row_counts = df.duplicated().sum()

# Cleaning succeeded only if there are no missing values in ANY column and no
# duplicated rows. The total is compared with zero because Series.all() is True
# only when every count is non-zero, so `counts.all() == 0` would report success
# as soon as a single column happened to be complete.
if missing_value_counts.sum() == 0 and duplicate_row_counts == 0:
    print("数据清洗成功：数据中已经没有缺失值和重复值。")
else:
    print("数据清洗失败：数据中仍有缺失值或重复值。")

# Integration succeeded if the derived column exists
if "total_acidity" in df.columns:
    print("数据集成成功：'total_acidity'列已添加到DataFrame中。")
else:
    print("数据集成失败：'total_acidity'列未找到。")

# Smallest and largest value of the normalized quality column
quality_min = df["quality_normalized"].min()
quality_max = df["quality_normalized"].max()

# Normalization succeeded if all values lie inside [0, 1]
if quality_min >= 0 and quality_max <= 1:
    print("数据转换成功：'quality'已成功归一化到[0,1]范围内。")
else:
    print("数据转换失败：'quality'归一化可能未正确执行。")

# Distinct values present in the discretized column
unique_level = df["fixed_acidity_level"].unique().tolist()
unique_level_str = ", ".join(str(level) for level in unique_level)

# Discretization succeeded if exactly three levels are present
if len(unique_level) == 3:
    print(f"数据转换成功：'fixed acidity'已成功离散化为三个等级：{unique_level_str}。")
else:
    print("数据转换失败：'fixed acidity'离散化可能未正确执行。")

# Reduction succeeded if exactly three features were selected
if len(top_features) == 3:
    print("数据降维成功：已成功选择对葡萄酒质量影响最大的三个特征。")
    # List the three selected features
    print("对葡萄酒质量影响最大的三个特征为：")
    for feature in top_features:
        print(feature)
else:
    print("数据降维失败：特征选择可能未正确执行。")

# Show the first 10 rows of the processed DataFrame
df.head(10)

数据清洗成功：数据中已经没有缺失值和重复值。
数据集成成功：'total_acidity'列已添加到DataFrame中。
数据转换成功：'quality'已成功归一化到[0,1]范围内。
数据转换成功：'fixed acidity'已成功离散化为三个等级：low, medium, high。
数据降维成功：已成功选择对葡萄酒质量影响最大的三个特征。
对葡萄酒质量影响最大的三个特征为：
alcohol
density
volatile_acidity


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,total_acidity,quality_normalized,fixed_acidity_level
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,8.1,0.333333,low
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,8.68,0.333333,low
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,8.56,0.333333,low
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,11.48,0.5,medium
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,8.06,0.333333,low
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5,8.5,0.333333,medium
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7,7.95,0.666667,low
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7,8.38,0.666667,low
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,8.0,0.333333,low
10,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,5,7.28,0.333333,low
