In [None]:
!pip install pandas==1.5.3 scikit-learn==1.2.0 numpy==1.24.0 matplotlib==3.6.0

Collecting pandas==1.5.3
  Downloading pandas-1.5.3.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m207.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:02[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# 示例数据集，包含类别特征
data = {'Department': ['Sales', 'Engineering', 'HR', 'Marketing', 'Sales'],
        'Education_Level': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master']}
df = pd.DataFrame(data)

# 标签编码（展示潜在问题）
label_enc = LabelEncoder()
df['Dept_LabelEnc'] = label_enc.fit_transform(df['Department'])
print("标签编码结果:\n", df[['Department', 'Dept_LabelEnc']])

# pandas实现独热编码
df_onehot = pd.get_dummies(df['Department'], prefix='Dept')
df = pd.concat([df, df_onehot], axis=1)
print("\npandas独热编码结果:\n", df)

# sklearn实现独热编码示例
ohe = OneHotEncoder(sparse_output=False)  # 修正了参数名称
dept_encoded = ohe.fit_transform(df[['Department']])
dept_encoded_df = pd.DataFrame(dept_encoded, columns=ohe.get_feature_names_out())
print("\nsklearn独热编码结果:\n", dept_encoded_df)


标签编码结果:
     Department  Dept_LabelEnc
0        Sales              3
1  Engineering              0
2           HR              1
3    Marketing              2
4        Sales              3

pandas独热编码结果:
     Department Education_Level  Dept_LabelEnc  Dept_Engineering  Dept_HR  \
0        Sales        Bachelor              3             False    False   
1  Engineering          Master              0              True    False   
2           HR             PhD              1             False     True   
3    Marketing        Bachelor              2             False    False   
4        Sales          Master              3             False    False   

   Dept_Marketing  Dept_Sales  
0           False        True  
1           False       False  
2           False       False  
3            True       False  
4           False        True  

sklearn独热编码结果:
    Department_Engineering  Department_HR  Department_Marketing  \
0                     0.0            0.0                   0.0 

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# 不同量级的样本数据
data = {'Income': [35000, 48000, 60000, 58000, 52000],  # 修正了数据格式
        'Age': [25, 30, 35, 40, 45]}  # 补充了完整的年龄数据
df = pd.DataFrame(data)
print("原始数据:\n", df)

# 最小-最大归一化
scaler_minmax = MinMaxScaler()
df_minmax = pd.DataFrame(scaler_minmax.fit_transform(df), columns=df.columns)
print("\n归一化后数据:\n", df_minmax)

# 标准化
scaler_std = StandardScaler()
df_standardized = pd.DataFrame(scaler_std.fit_transform(df), columns=df.columns)
print("\n标准化后数据:\n", df_standardized)



原始数据:
    Income  Age
0   35000   25
1   48000   30
2   60000   35
3   58000   40
4   52000   45

归一化后数据:
    Income   Age
0    0.00  0.00
1    0.52  0.25
2    1.00  0.50
3    0.92  0.75
4    0.68  1.00

标准化后数据:
      Income       Age
0 -1.754693 -1.414214
1 -0.292449 -0.707107
2  1.057315  0.000000
3  0.832354  0.707107
4  0.157472  1.414214


In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# 示例数据
data = {'Feature1': [0.1, 0.2, 0.4, 0.5],
        'Feature2': [1.0, 0.9, 0.7, 0.6],
        'Feature3': [0.4, 0.5, 0.6, 0.7]}
df = pd.DataFrame(data)
print("原始数据:\n", df)

# 执行PCA
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df)
df_pca_df = pd.DataFrame(df_pca, columns=['Principal Component 1', 'Principal Component 2'])
print("\nPCA降维后的数据:\n", df_pca_df)


原始数据:
    Feature1  Feature2  Feature3
0       0.1       1.0       0.4
1       0.2       0.9       0.5
2       0.4       0.7       0.6
3       0.5       0.6       0.7

PCA降维后的数据:
    Principal Component 1  Principal Component 2
0               0.320045              -0.008450
1               0.148896               0.018163
2              -0.148896              -0.018163
3              -0.320045               0.008450


In [4]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

# 示例数据
data = {'Feature1': [1, 2, 3, 4, 5],
        'Feature2': [2, 3, 4, 5, 6],
        'Feature3': [5, 4, 3, 2, 1],
        'Target': [0, 1, 0, 1, 0]}
df = pd.DataFrame(data)

# 特征选择
X = df.drop('Target', axis=1)
y = df['Target']
selector = SelectKBest(score_func=f_classif, k=2)
X_new = selector.fit_transform(X, y)

# 显示选择后的特征
print("\n选择的特征:\n", X_new)



选择的特征:
 [[2 5]
 [3 4]
 [4 3]
 [5 2]
 [6 1]]


In [5]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# 示例数据
data = {'X1': [1, 2, 3, 4],
        'X2': [2, 3, 4, 5]}
df = pd.DataFrame(data)

# 多项式特征生成
poly = PolynomialFeatures(degree=2)
df_poly = poly.fit_transform(df)

# 显示多项式特征
df_poly_df = pd.DataFrame(df_poly, columns=['1', 'X1', 'X2', 'X1^2', 'X1 X2', 'X2^2'])
print("\n多项式特征生成后的数据:\n", df_poly_df)



多项式特征生成后的数据:
      1   X1   X2  X1^2  X1 X2  X2^2
0  1.0  1.0  2.0   1.0    2.0   4.0
1  1.0  2.0  3.0   4.0    6.0   9.0
2  1.0  3.0  4.0   9.0   12.0  16.0
3  1.0  4.0  5.0  16.0   20.0  25.0


In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# 不同量级的样本数据
data = {'Income': [35000, 48000, 60000, 58000, 52000],  # 修正了数据格式
        'Age': [25, 30, 35, 40, 45]}  # 补充了完整的年龄数据
df = pd.DataFrame(data)
print("原始数据:\n", df)

# 最小-最大归一化
scaler_minmax = MinMaxScaler()
df_minmax = pd.DataFrame(scaler_minmax.fit_transform(df), columns=df.columns)
print("\n归一化后数据:\n", df_minmax)

# 标准化
scaler_std = StandardScaler()
df_standardized = pd.DataFrame(scaler_std.fit_transform(df), columns=df.columns)
print("\n标准化后数据:\n", df_standardized)


原始数据:
    Income  Age
0   35000   25
1   48000   30
2   60000   35
3   58000   40
4   52000   45

归一化后数据:
    Income   Age
0    0.00  0.00
1    0.52  0.25
2    1.00  0.50
3    0.92  0.75
4    0.68  1.00

标准化后数据:
      Income       Age
0 -1.754693 -1.414214
1 -0.292449 -0.707107
2  1.057315  0.000000
3  0.832354  0.707107
4  0.157472  1.414214
