In [1236]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,MinMaxScaler


# from mlxtend.feature_selection import SequentialFeatureSelector 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score,mean_squared_error

import matplotlib.pyplot as plt

import statsmodels.api as sm

import seaborn as sns

from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error

In [1237]:
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [1238]:
# `warnings` is not imported by any of the import cells above, so the bare call
# raised a NameError on a fresh kernel; import it where it is used.
import warnings

# Silence pandas' chained-assignment warning for the rest of the notebook.
# The category is looked up defensively because it may not be exposed under
# `pd.errors` in every pandas release -- TODO confirm against the pinned version.
_setting_with_copy = getattr(pd.errors, "SettingWithCopyWarning", None)
if _setting_with_copy is not None:
    warnings.simplefilter(action='ignore', category=_setting_with_copy)

In [None]:
# Load the raw table and report the number of missing values per column
# before any cleaning; the summary statistics are the cell output.
data = pd.read_csv("BodyFat.csv")
missing_per_column = data.isnull().sum()
print(missing_per_column)
data.describe()

In [None]:

# Every BODYFAT value in ascending order, displayed to eyeball the extremes.
ls = sorted(data.loc[:, "BODYFAT"])
ls

In [None]:
# `df` is the working name for the raw table from here on.
df = pd.DataFrame(data)

# One box per column, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
df.boxplot(ax=ax)
ax.set_title('Boxplot of Original Data')
plt.xticks(rotation=75)
plt.show()



### Remove outliers

In [None]:
# Columns screened for outliers: everything except the identifier, the target,
# DENSITY and AGE.
not_screened = ["IDNO","BODYFAT","DENSITY","AGE"]
filtered_columns = [col for col in data.columns if col not in not_screened]

df_removed = df.copy()
for column in filtered_columns:
    values = df[column]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Keep values inside the 1.5*IQR fences; anything outside becomes NaN
    # (rows are not dropped, only the offending cell is blanked).
    inside_fences = (values >= lower_bound) & (values <= upper_bound)
    df_removed[column] = values.where(inside_fences)


fig, ax = plt.subplots(figsize=(10, 5))
df_removed.boxplot(ax=ax)
plt.xticks(rotation=90)
ax.set_title('Boxplot of Filtered Data (After Removing Outliers)')
plt.show()

In [None]:
# Predictors and target used to tune the imputer below. Outlier cells were
# blanked above, so X is expected to contain NaN; the counts are printed.
X = df_removed.drop(columns=["IDNO","BODYFAT","DENSITY"])
y = df_removed["BODYFAT"]
print("X missing data:\n",X.isnull().sum())
print("y missing data:\n",y.isnull().sum())

# Summary statistics go last: only the final expression of a cell is rendered,
# so the original leading describe() call was computed and thrown away.
df_removed.describe()

### K-fold searching for k

In [None]:
# Candidate neighbour counts for the KNN imputer: odd values 3..21, plus 25.
param_grid = {
    'imputer__n_neighbors': [*range(3, 22, 2), 25]
}

# Scale -> impute -> KNN regression; only the imputer's k is searched, and it
# is scored through the downstream regressor's cross-validated MSE.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('imputer', KNNImputer()),
    ('model', KNeighborsRegressor()),
])

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
).fit(X, y)

print("Best n_neighbors:", grid_search.best_params_)

## Imputation

In [1245]:
# Fill the cells blanked by the outlier filter with a KNN imputer.
#
# The neighbour search runs on the min-max scaled predictors only, mirroring
# the scaler -> imputer pipeline that was cross-validated above. Imputing the
# raw frame directly lets IDNO and large-valued columns dominate the distance,
# and lets BODYFAT/DENSITY (the quantities being modelled) leak into the
# imputed predictors.
# NOTE(review): n_neighbors=3 is presumably the value reported by the grid
# search cell -- confirm it still matches `grid_search.best_params_`.
impute_columns = [col for col in df_removed.columns if col not in ("IDNO", "BODYFAT", "DENSITY")]

impute_scaler = MinMaxScaler()
imputer = KNNImputer(n_neighbors=3)

scaled_features = impute_scaler.fit_transform(df_removed[impute_columns])
imputed_features = impute_scaler.inverse_transform(imputer.fit_transform(scaled_features))

# Same columns as df_removed with a fresh 0..n-1 index; IDNO, BODYFAT and
# DENSITY are carried over unchanged, the predictors are back in original units.
df_imputed = df_removed.reset_index(drop=True).copy()
df_imputed[impute_columns] = imputed_features

In [None]:
# Same per-column boxplot as before, now on the imputed table.
fig, ax = plt.subplots(figsize=(10, 5))
df_imputed.boxplot(ax=ax)
ax.set_title('Boxplot of Imputed Data')
plt.xticks(rotation=75)
plt.show()

### Dealing with the BMI calculation problem (not solved)

In [None]:
# Consistency check between WEIGHT, HEIGHT and ADIPOSITY: the notebook treats
# ADIPOSITY as BMI, so (WEIGHT / HEIGHT**2) / ADIPOSITY should be close to one
# constant (the unit-conversion factor) for every row.
ratio = (df_imputed["WEIGHT"]/df_imputed["HEIGHT"]**2)/df_imputed["ADIPOSITY"]

plt.figure(figsize=(10, 5))
pd.DataFrame(ratio).boxplot()
# The title now names the plotted quantity; the copy-pasted
# 'Boxplot of Imputed Data' mislabelled this figure.
plt.title('Boxplot of (WEIGHT / HEIGHT^2) / ADIPOSITY Ratio')
plt.xticks(rotation=75)
plt.show()

# Last expression so the summary is actually rendered (it was discarded when
# it sat at the top of the cell).
pd.DataFrame(ratio).describe()

$$ratio = \frac{\frac{Weight}{Height^2}}{bmi}$$

$$1kg = 2.2046lb $$
$$1m = 39.3701in$$
$$\frac{1kg}{1m^2}=\frac{2.2046lb}{(39.3701in)^2}=0.001422\frac{lb}{in^2}$$


In [None]:
ratio_df = pd.DataFrame(ratio)

# Row labels whose ratio falls outside the 1.5*IQR fences of any column of
# ratio_df, gathered in a set so each label appears once.
flagged_labels = set()

for col in ratio_df.columns:
    column_values = ratio_df[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    beyond_fences = (column_values < lower_bound) | (column_values > upper_bound)
    outliers = ratio_df.index[beyond_fences.to_numpy()]
    flagged_labels.update(outliers.tolist())


outliers_index = sorted(flagged_labels)
print(f"All Outliers Index: {outliers_index}")

In [None]:
# Rows flagged by the ratio check. `outliers_index` holds index *labels*, so
# select with .loc rather than positional .iloc, and take an explicit copy so
# later edits to `handle` never write into a slice of `df_imputed`.
handle = df_imputed.loc[outliers_index, :].copy()
handle

In [None]:
# Population statistics used to decide which of HEIGHT / WEIGHT looks wrong.
mean_height = df_imputed['HEIGHT'].mean()
std_height = df_imputed['HEIGHT'].std()
mean_weight = df_imputed['WEIGHT'].mean()
std_weight = df_imputed['WEIGHT'].std()

# 1 kg/m^2 expressed in lb/in^2 (see the unit derivation in the markdown above).
transform = 0.001422


def recalculate_values(row):
    """Return corrected ``(height, weight)`` for one flagged row.

    Whichever of HEIGHT / WEIGHT has the larger absolute z-score is treated as
    the faulty measurement and is recomputed from ADIPOSITY and the other one
    via ``WEIGHT = ADIPOSITY * HEIGHT**2 * transform``. On a tie WEIGHT is
    recomputed.

    Parameters
    ----------
    row : mapping-like
        Must provide 'HEIGHT', 'WEIGHT', 'ADIPOSITY', 'zscore_height' and
        'zscore_weight'.
    """
    if abs(row['zscore_height']) > abs(row['zscore_weight']):
        new_height = np.sqrt(row['WEIGHT'] / (row['ADIPOSITY']*transform))
        return new_height, row['WEIGHT']
    new_weight = row['ADIPOSITY'] * (row['HEIGHT'] ** 2)*transform
    return row['HEIGHT'], new_weight


# Detach from df_imputed before adding columns: `handle` was taken as a slice,
# and writing into it directly is the chained assignment pandas warns about.
handle = pd.DataFrame(handle).copy()
handle['zscore_height'] = (handle['HEIGHT'] - mean_height) / std_height
handle['zscore_weight'] = (handle['WEIGHT'] - mean_weight) / std_weight

# With no flagged rows there is nothing to correct, and apply(...) would not
# produce the two expected result columns, so skip the step.
if not handle.empty:
    corrected = handle.apply(recalculate_values, axis=1, result_type='expand')
    handle['HEIGHT'] = corrected[0]
    handle['WEIGHT'] = corrected[1]

handle = handle.drop(columns=['zscore_height', 'zscore_weight'])
handle

In [None]:
# Write the corrected HEIGHT / WEIGHT rows back into df_imputed, matching on IDNO.
# `handle` itself is left untouched (no in-place set_index), so re-running this
# cell no longer fails with a KeyError on a missing 'IDNO' column.
corrections_by_id = handle.set_index('IDNO')

df_imputed = df_imputed.set_index('IDNO')
df_imputed.update(corrections_by_id)
df_imputed = df_imputed.reset_index()

df_imputed.describe()

In [None]:
# Recompute the WEIGHT / HEIGHT**2 to ADIPOSITY ratio on the current df_imputed
# to see whether the flagged rows still stand out.
ratio = (df_imputed["WEIGHT"]/df_imputed["HEIGHT"]**2)/df_imputed["ADIPOSITY"]

plt.figure(figsize=(10, 5))
pd.DataFrame(ratio).boxplot()
# Title names the plotted quantity; the copy-pasted 'Boxplot of Imputed Data'
# mislabelled this figure.
plt.title('Boxplot of (WEIGHT / HEIGHT^2) / ADIPOSITY Ratio (Current Data)')
plt.xticks(rotation=75)
plt.show()

In [None]:
# Preview the first rows only instead of dumping the whole table into the output.
df_imputed.head()

In [None]:
# Circumference-based body-fat estimate (BFP), used as a plausibility check
# for the recorded BODYFAT.
# NOTE(review): the coefficients match the published U.S. Navy circumference
# formula for men, which expects centimetres; the `* 2.54` suggests HEIGHT is
# in inches while ABDOMEN and NECK are already in cm -- confirm against the
# data dictionary. log10 is undefined for rows where ABDOMEN <= NECK.
df_imputed["BFP"] =495/(1.0324 - 0.19077*np.log10(df_imputed["ABDOMEN"]-df_imputed["NECK"]) + 0.15456*np.log10(df_imputed["HEIGHT"]*2.54))-450


# Explicit copy: columns are added to `df_selected` further down, and doing
# that on a bare column selection is chained assignment on a slice.
df_selected = df_imputed[['BFP', 'BODYFAT']].copy()

print(df_selected)

In [None]:
# Recorded BODYFAT (x) against the formula-based BFP (y); points on the dashed
# diagonal are rows where the two agree exactly.
bodyfat_lo = df_imputed['BODYFAT'].min()
bodyfat_hi = df_imputed['BODYFAT'].max()

fig, ax = plt.subplots()
ax.scatter(df_imputed['BODYFAT'], df_imputed['BFP'], color='blue', label='BFP vs BODYFAT')
ax.plot([bodyfat_lo, bodyfat_hi],
        [bodyfat_lo, bodyfat_hi],
        color='red', linestyle='--', label='Ideal line (y=x)')

ax.set_title('Comparison of BFP and BODYFAT')
ax.set_xlabel('BODYFAT')
ax.set_ylabel('BFP')
ax.legend()

plt.show()

In [None]:

# Disagreement between the formula estimate and the recorded value.
# 'Abs_Residual' is the residual *relative to BFP* in absolute value, not a
# plain absolute difference.
# assign() builds a new frame instead of inserting columns into `df_selected`
# in place, which avoids chained assignment on a slice of df_imputed.
residual = df_selected['BFP'] - df_selected['BODYFAT']
df_selected = df_selected.assign(
    Residual=residual,
    Abs_Residual=np.abs(residual / df_selected['BFP']),
)

# Largest relative disagreements first.
df_sorted_by_residual = df_selected.sort_values(by='Abs_Residual', ascending=False)

print(df_sorted_by_residual.head(20))




In [None]:
# Index labels of the rows whose relative disagreement exceeds 0.5.
exceeds_limit = df_sorted_by_residual['Abs_Residual'] > 0.5
high_residual_ids = df_sorted_by_residual.index[exceeds_limit.to_numpy()]

# For those rows the recorded BODYFAT is overwritten with the formula estimate.
bfp_for_flagged = df_imputed.loc[high_residual_ids, 'BFP']
df_imputed.loc[high_residual_ids, 'BODYFAT'] = bfp_for_flagged

# BFP was only a helper column; drop it before modelling.
df_imputed = df_imputed.drop(columns=["BFP"])

df_imputed.describe()

In [1259]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression



## SCALED

In [1260]:
# Min-max scale every column of df_imputed (fit and transform on the same
# table), keeping the original column names.
scaler = MinMaxScaler()
df_imputed_scaled = pd.DataFrame(
    scaler.fit_transform(df_imputed),
    columns=df_imputed.columns,
)

## Step-wise Selection

In [1261]:
def forward_selection(X, y, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from an empty model, each round fits one OLS model (with
    intercept) per remaining column, added to the columns chosen so far, and
    records that column's p-value. The column with the smallest p-value is
    kept if it is below ``significance_level``; otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors.
    y : array-like
        Response passed to ``sm.OLS``.
    significance_level : float
        Entry threshold for a column's p-value.

    Returns
    -------
    list
        Selected column names in the order they were added.
    """
    chosen = []
    candidates = list(X.columns)
    while candidates:
        # p-value of each candidate when added to the current model.
        p_values = pd.Series(
            {
                name: sm.OLS(y, sm.add_constant(X[chosen + [name]])).fit().pvalues[name]
                for name in candidates
            },
            dtype=float,
        )
        if not p_values.min() < significance_level:
            break
        winner = p_values.idxmin()
        chosen.append(winner)
        candidates.remove(winner)
    return chosen

In [1262]:
def backward_selection(X, y, significance_level=0.05):
    """Backward elimination driven by OLS p-values.

    Starting from all columns of ``X``, repeatedly fits an OLS model and drops
    the column with the largest p-value until every remaining column is at or
    below ``significance_level``.

    The model is fitted *with* an intercept, consistent with the forward and
    stepwise selectors in this notebook. Fitting through the origin makes the
    p-values answer a different question whenever the response does not have
    zero mean. The intercept itself is never a removal candidate.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors.
    y : array-like
        Response passed to ``sm.OLS``.
    significance_level : float
        Columns whose p-value exceeds this are eliminated, worst first.

    Returns
    -------
    list
        Names of the surviving columns, in their original order.
    """
    features = list(X.columns)
    while features:
        model = sm.OLS(y, sm.add_constant(X[features])).fit()
        # Look up by label so only real predictors (not the intercept) compete.
        p_values = model.pvalues[features]
        max_p_value = p_values.max()
        if not max_p_value > significance_level:
            break
        excluded_feature = p_values.idxmax()
        print(f"Removing {excluded_feature} with p-value {max_p_value}")
        features.remove(excluded_feature)
    return features

In [1263]:
from sklearn.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Predictors in original units and the BODYFAT target.
X_imputed = df_imputed.drop(columns=["IDNO","BODYFAT","DENSITY"])
y = df_imputed["BODYFAT"]

# scikit-learn's sequential selector, forward direction, wrapped around a
# plain linear regression.
model = LinearRegression()
sfs = SFS(
    model,
    n_features_to_select='auto',
    direction='forward',
    cv=None,
).fit(X_imputed, y)

# Boolean mask over the columns of X_imputed.
selected_features = sfs.get_support()
X_selected = X_imputed.loc[:, selected_features]

print("Selected features matrix:\n", X_selected.columns)

In [None]:
# Min-max scaled predictors; the target is taken unscaled from df_imputed.
X_imputed_scaled = df_imputed_scaled.drop(columns=["IDNO","BODYFAT","DENSITY"])
y = df_imputed["BODYFAT"]

# Same selector as the forward run, this time eliminating backwards with
# 5-fold cross-validation.
model = LinearRegression()
sfs = SFS(
    model,
    n_features_to_select='auto',
    direction='backward',
    cv=5,
).fit(X_imputed_scaled, y)

# Boolean mask over the columns of X_imputed_scaled.
selected_features = sfs.get_support()
X_selected = X_imputed_scaled.loc[:, selected_features]

print("Selected features matrix:\n", X_selected.columns)

In [1266]:
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.05, 
                       threshold_out = 0.05, 
                       verbose=True):
    """Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS.

    Each round first tries to add the excluded column with the smallest
    p-value (if below ``threshold_in``), then tries to drop the included
    column with the largest p-value (if above ``threshold_out``). The search
    ends on the first round in which neither step changes the model.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors.
    y : array-like
        Response passed to ``sm.OLS``.
    initial_list : list
        Columns to start from. It is copied, never mutated, so the shared
        default list is harmless.
    threshold_in : float
        A column is added when its p-value is below this.
    threshold_out : float
        A column is dropped when its p-value is above this.
        NOTE(review): stepwise procedures are usually run with
        threshold_in <= threshold_out to avoid add/drop cycles -- confirm the
        defaults are intended.
    verbose : bool
        Print every add / drop decision.

    Returns
    -------
    list
        Names of the selected columns.
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        # Preserve X's column order (a set difference has no stable order, so
        # the evaluation order could differ between kernel sessions).
        excluded = [column for column in X.columns if column not in included]
        # Explicit float dtype: without it the empty Series is object-typed,
        # which breaks or warns on min()/idxmin() depending on the pandas version.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step (nothing to drop while the model is still empty)
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # Select the predictors by label instead of assuming the intercept
            # sits in the first position.
            pvalues = model.pvalues[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed=True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [None]:
# p-value based forward selection on the min-max scaled predictors; the bare
# name on the last line displays the chosen column names.
result_forward = forward_selection(X_imputed_scaled , y)
result_forward

In [None]:
# p-value based backward elimination on the min-max scaled predictors; each
# removal is printed by the function, and the survivors are displayed below.
result_backward = backward_selection(X_imputed_scaled , y)
result_backward

In [None]:
# Combined forward/backward selection on the scaled predictors; both names are
# bound to the same returned list.
result_stepwise = output_stepwise = stepwise_selection(X_imputed_scaled , y)
result_stepwise

## Visualization

In [None]:
# Design matrices restricted to the columns picked by each selection strategy.
X_forward, X_backward, X_stepwise = (
    X_imputed_scaled.loc[:, chosen_columns]
    for chosen_columns in (result_forward, result_backward, result_stepwise)
)

# In-sample fit on the forward-selected columns: predictions (and the MSE
# printed below) are computed on the same rows the model was fitted on.
model = LinearRegression().fit(X_forward, y)
y_pred = model.predict(X_forward)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y, y_pred, color='blue', label='Predicted vs Actual')
ax.plot([y.min(), y.max()], [y.min(), y.max()], color='red', lw=2, label='Ideal fit')
ax.set_xlabel('Actual(%)')
ax.set_ylabel('Predicted(%)')
ax.set_title('Actual vs Predicted(MLR Forward Selection)')
ax.legend()

plt.show()


print("Mean Squared Error:", mean_squared_error(y, y_pred))

In [None]:
# 5-fold cross-validated MSE for the forward-selected linear model.
# The scorer returns negated MSE, so the sign is flipped before reporting.
cv_scores = cross_val_score(
    model, X_forward, y,
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_mse_scores = np.negative(cv_scores)

print("Cross-Validation MSE for each fold:", cv_mse_scores)
print("Average MSE from Cross-Validation:", cv_mse_scores.mean())

In [None]:
# In-sample fit on the backward-selected columns; the MSE printed below is a
# training error.
model = LinearRegression().fit(X_backward, y)
y_pred = model.predict(X_backward)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y, y_pred, color='blue', label='Predicted vs Actual')
ax.plot([y.min(), y.max()], [y.min(), y.max()], color='red', lw=2, label='Ideal fit')
ax.set_xlabel('Actual(%)')
ax.set_ylabel('Predicted(%)')
ax.set_title('Actual vs Predicted(MLR Backward Selection)')
ax.legend()

plt.show()


print("Mean Squared Error:", mean_squared_error(y, y_pred))

In [None]:
# 5-fold cross-validated MSE for the backward-selected linear model.
# The scorer returns negated MSE, so the sign is flipped before reporting.
cv_scores = cross_val_score(
    model, X_backward, y,
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_mse_scores = np.negative(cv_scores)

print("Cross-Validation MSE for each fold:", cv_mse_scores)
print("Average MSE from Cross-Validation:", cv_mse_scores.mean())

In [None]:
# In-sample fit on the stepwise-selected columns; the MSE printed below is a
# training error.
model = LinearRegression().fit(X_stepwise, y)
y_pred = model.predict(X_stepwise)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y, y_pred, color='blue', label='Predicted vs Actual')
ax.plot([y.min(), y.max()], [y.min(), y.max()], color='red', lw=2, label='Ideal fit')
ax.set_xlabel('Actual(%)')
ax.set_ylabel('Predicted(%)')
ax.set_title('Actual vs Predicted(MLR Stepwise Selection)')
ax.legend()

plt.show()


print("Mean Squared Error:", mean_squared_error(y, y_pred))

In [None]:
# 5-fold cross-validated MSE for the stepwise-selected linear model.
# The scorer returns negated MSE, so the sign is flipped before reporting.
cv_scores = cross_val_score(
    model, X_stepwise, y,
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_mse_scores = np.negative(cv_scores)

print("Cross-Validation MSE for each fold:", cv_mse_scores)
print("Average MSE from Cross-Validation:", cv_mse_scores.mean())

In [None]:
# statsmodels OLS (with intercept) on the forward-selected columns, to get the
# full coefficient / p-value table.
X_with_const = sm.add_constant(X_forward)
ols_model = sm.OLS(endog=y, exog=X_with_const)
results = ols_model.fit()

print("forward:",results.summary())

In [None]:
# statsmodels OLS (with intercept) on the backward-selected columns.
X_with_const = sm.add_constant(X_backward)  
ols_model = sm.OLS(y, X_with_const)
results = ols_model.fit()


# The model above is fitted on X_backward, so label the table accordingly; it
# was previously printed under "stepwise:".
# NOTE(review): if the stepwise table was the one intended, fit on X_stepwise
# instead and restore the label.
print("backward:",results.summary())

## Decision Tree

In [1278]:
from sklearn.tree import DecisionTreeRegressor, plot_tree ,export_text

# Scaled predictors and the unscaled BODYFAT target.
X = df_imputed_scaled.drop(columns=["IDNO","BODYFAT","DENSITY"])
y = df_imputed["BODYFAT"]

# Unrestricted tree, fitted only to rank the predictors by impurity-based
# importance. random_state is pinned (same seed as the KFold above) so the
# ranking, and therefore the feature subset derived from it below, is
# reproducible across kernel restarts.
tree_model = DecisionTreeRegressor(max_depth=None, random_state=42)
tree_model.fit(X, y)

importances = tree_model.feature_importances_

In [1279]:
# Column labels for the importances: real names when X is a DataFrame,
# otherwise generated "Feature i" placeholders.
if isinstance(X, pd.DataFrame):
    feature_names = X.columns
else:
    feature_names = [f"Feature {i}" for i in range(X.shape[1])]


In [None]:
# Map each feature name to its importance, then order the mapping from most
# to least important.
feature_importance_dict = dict(zip(feature_names, importances))

ranked_pairs = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_feature_importance = dict(ranked_pairs)

print("Sorted feature importance:")
for feature, importance in ranked_pairs:
    print(f"{feature}: {importance}")

In [None]:
# Names of the six most important features (dict keys are already ranked).
sorted_feature_names = list(sorted_feature_importance)[:6]

sorted_feature_names

In [None]:
# Restrict the scaled predictors to the top-ranked columns.
X_tree_selected = X.loc[:, sorted_feature_names]
X_tree_selected

In [None]:
# Depth-limited tree on the selected features. random_state is pinned so the
# fitted tree (and the rules / CV scores derived from it) is reproducible.
tree_model = DecisionTreeRegressor(max_depth=5, random_state=42)
tree_model.fit(X_tree_selected, y)

# In-sample predictions: the plot shows training fit, not generalisation.
y_pred = tree_model.predict(X_tree_selected)


plt.figure(figsize=(10, 6))
plt.scatter(y, y_pred, color='blue', label='Predicted vs Actual')

plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', lw=2, label='Ideal fit')

plt.xlabel('Actual(%)')
plt.ylabel('Predicted(%)')
plt.title('Actual vs Predicted (Decision Tree Regressor)')
plt.legend()

plt.show()


In [None]:
# 5-fold cross-validated MSE for the depth-limited tree.
# The scorer returns negated MSE, so the sign is flipped before reporting.
cv_scores = cross_val_score(
    tree_model, X_tree_selected, y,
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_mse_scores = np.negative(cv_scores)

print("Cross-Validation MSE for each fold:", cv_mse_scores)
print("Average MSE from Cross-Validation:", cv_mse_scores.mean())

In [None]:
# Plain-text dump of the fitted tree's split rules, using the real column names.
rule_feature_names = X_tree_selected.columns.tolist()
tree_rules = export_text(tree_model, feature_names=rule_feature_names)
print(tree_rules)