## Abalone dataset
### 1

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.rcParams['font.size'] = 12
sns.set_style("whitegrid")
print("Environment setted.")

columns = ['Sex', 'length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings']
df = pd.read_csv("abalone.data", names=columns)
print("Data inputted.")
df.head()

# OHE
df = pd.get_dummies(df, columns=['Sex'], prefix='Sex')

Environment setted.
Data inputted.


In [16]:
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 'Rings' is the regression target; every other column is a predictor.
y = df['Rings']
X = df.drop(columns='Rings')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Forest of 100 trees; no other hyperparameter is set explicitly.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on each split and round to the nearest integer before scoring.
y_train_pred = np.round(rf.predict(X_train)).astype(int)
y_test_pred = np.round(rf.predict(X_test)).astype(int)

# Mean squared error of the rounded predictions on each split.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print('mse on training set',mse_train)
print('mse on test set',mse_test)

mse on training set 0.7027835977252319
mse on test set 5.148325358851674


### 2

In [17]:
# Refit the forest with explicit limits on tree depth, split size, leaf size
# and the number of features considered per split.
rf = RandomForestRegressor(
    n_estimators=100,        # number of trees in the forest
    max_depth=5,             # maximum depth of each tree
    min_samples_split=10,    # minimum samples needed to split an internal node
    min_samples_leaf=5,      # minimum samples required at a leaf node
    max_features='sqrt',     # maximum number of features considered per split
    random_state=42,
).fit(X_train, y_train)

# Predict on the training split first, then the test split, rounding each
# set of predictions to the nearest integer.
y_train_pred, y_test_pred = (
    np.round(rf.predict(split)).astype(int) for split in (X_train, X_test)
)

# Mean squared error of the rounded predictions on each split.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print('mse on training set',mse_train)
print('mse on test set',mse_test)

mse on training set 4.829392397485782
mse on test set 5.526315789473684


### 3
no change

### 4
用模型的特征重要性（feature importances）来筛选、精简特征，也就是只保留最有用的几个变量（此处保留重要性大于 5% 的特征）。

In [24]:
# Read the per-feature importances from the fitted forest; the feature names
# are taken from the columns of X_train.
importances = rf.feature_importances_
feature_names = X_train.columns

# Keep only the features whose importance is above the threshold (5%).
threshold = 0.05
important_mask = importances > threshold
important_features = feature_names[important_mask]
# The label is formatted from `threshold` so the message cannot drift from the
# value actually applied; for 0.05 it prints exactly "5%".
print(f"Importance values are above {threshold:.0%}：", important_features)

# Sum the importances of the retained features, reusing the mask computed
# above instead of evaluating the comparison a second time.
selected_importances = importances[important_mask]
total_retained_importance = selected_importances.sum()
print("total retained feature importance value：", total_retained_importance)



Importance values are above 5%： Index(['Diameter', 'Height', 'Whole_weight', 'Shucked_weight',
       'Viscera_weight', 'Shell_weight'],
      dtype='object')
total retained feature importance value： 0.9249544352499031
