In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Dealing with the data

In [None]:
# Load the body-fat measurements; the path is relative to the notebook's working directory.
data = pd.read_csv('bodyfat.csv')
# Only the last expression of a cell is displayed, so the shape is printed explicitly
# (a bare `data.shape` here would be evaluated and thrown away).
print(data.shape)
data.head()

In [None]:
#Standardize header names.
def rename_coloumn(data):
    """Rename the 'Abdomen' column to 'Waist', modifying `data` in place.

    Nothing is returned; callers rely on the in-place mutation.
    """
    header_map = {'Abdomen': 'Waist'}
    data.rename(columns=header_map, inplace=True)
    
def coloumn_names_lwr(data):
    """Lower-case every string column label of `data` in place.

    Non-string labels (e.g. integer column names) are left untouched instead of
    raising. Nothing is returned; the frame is modified in place.
    """
    # A concrete list (rather than a one-shot `map` iterator) keeps the assignment explicit.
    data.columns = [
        label.lower() if isinstance(label, str) else label
        for label in data.columns
    ]
    
def coloumn_names_captl(data):
    """Capitalize every string column label of `data` in place.

    `str.capitalize` upper-cases the first character and lower-cases the rest,
    so 'bodyFat' becomes 'Bodyfat'. Non-string labels are left untouched instead
    of raising. Nothing is returned; the frame is modified in place.
    """
    data.columns = [
        label.capitalize() if isinstance(label, str) else label
        for label in data.columns
    ]
# Disabled helper: the definition below sits inside a bare triple-quoted string, so
# replace_whitespaces is never actually defined. As written, it would replace the spaces
# in every column name with underscores and return the frame.
# NOTE(review): as the last expression of this cell the string may be echoed as the
# cell's output -- confirm, and consider `#` comments or re-enabling the function.
'''
def replace_whitespaces(data):
    cols = []
    for col in data.columns:
        cols.append(col.replace(' ', '_'))
    data.columns = cols
    return data
'''    


In [None]:
def check_missing_data(data):
    """Print the per-column percentage of missing values and ask whether to clean.

    The function only reports and prompts; it does not modify `data` and returns
    None. The answer is read with `input()`, so the call blocks until the user
    responds.
    """
    # NOTE(review): DataFrame.isnull() is an alias of DataFrame.isna(), so the two
    # reports below print the same numbers; neither counts literal zeros, despite the
    # first label. Left unchanged pending a decision on what should be reported.
    print(f"zero values in the dataset\n: {data.isnull().sum() * 100 / len(data)}")
    print(f"unknown values in the dataset:{data.isna().sum() * 100 / len(data)}")
    print("\n")
    print("do you wish to clean your data set ?")
    # input() always returns a string, so the answer must be compared with "1",
    # not the integer 1 (that comparison was never true).
    userinput = input("press 1 to clean, press 0 to skip").strip()
    if userinput == "1":
        print("cleaning your data set ... ")
    else:
        print("Cleaning skipped..")


In [None]:
# Numerical and categorical data: empty placeholder frames, filled later from the
# three results of check_datatypes().
numerical_values, continuous_values, categorical_values = (
    pd.DataFrame() for _ in range(3)
)

def check_datatypes(data):
    """Split `data` into column subsets by dtype.

    Returns a 3-tuple of DataFrames, in this order:
      1. all numeric columns (`np.number`),
      2. the float64 columns only,
      3. the object-dtype columns.
    """
    dtype_filters = (np.number, ['float64'], object)
    numeric_part, float_part, object_part = (
        data.select_dtypes(include=wanted) for wanted in dtype_filters
    )
    return numeric_part, float_part, object_part


The density column is omitted on purpose: the aim is to predict a person's body fat percentage from the circumference measurements already available, which should be good enough given how much data we have.

I will build the model using only these data, and then validate it by predicting on records taken at random from my personal data set.

$\text{Fat (\%)} = \left(\dfrac{4.95}{\text{density}} - 4.50\right) \times 100$ (Siri's equation)

source:https://www.cambridge.org/core/services/aop-cambridge-core/content/view/DA80501B784742B9B2F4F454BDEE923B/S0007114567000728a.pdf/the-assessment-of-the-amount-of-fat-in-the-human-body-from-measurements-of-skinfold-thickness.pdf


# Exploring the data

In [None]:
#converting the dataset to metric system
def convert_weight_kg(data, lb_to_kg=0.45359237):
    """Convert the 'weight' column from pounds to kilograms, in place.

    Parameters
    ----------
    data : DataFrame with a numeric 'weight' column holding pounds.
    lb_to_kg : conversion factor; defaults to the exact definition of the
        pound (0.45359237 kg) rather than the rounded 0.453.

    The converted values are rounded to 2 decimals. Returns None.
    Note that calling this twice on the same frame converts twice.
    """
    data['weight'] = (data['weight'] * lb_to_kg).round(2)
    
def convert_inch_to_cm(data, non_length_cols=('density', 'bodyfat', 'age', 'weight'),
                       inch_to_cm=2.54):
    """Return a copy of `data` with every length column converted from inches to cm.

    Parameters
    ----------
    data : DataFrame whose columns, apart from `non_length_cols`, are numeric
        lengths in inches.
    non_length_cols : columns that are passed through unchanged (density, body fat
        percentage, age and weight by default). Names that are not present in
        `data` are ignored instead of raising KeyError.
    inch_to_cm : conversion factor; 1 inch is exactly 2.54 cm (the 30/12 = 2.5
        approximation under-states every length by about 1.6 %).

    Returns
    -------
    A new DataFrame with the untouched columns first, followed by the converted
    ones. `data` itself is not modified, so the result must be assigned by the caller.
    """
    kept_cols = [col for col in non_length_cols if col in data.columns]
    untouched = data[kept_cols]
    lengths_cm = data.drop(columns=kept_cols) * inch_to_cm
    return pd.concat([untouched, lengths_cm], axis=1, join="outer")
    

Finding the Relationships:

I am interested in the correlation between the label 'Bodyfat%' and the features [weight, chest, abdomen, hip, biceps, thigh],

so I am going to find the correlation between them by dropping the remaining columns, as follows:

In [None]:
#def find_correlation(data):
#    df_corr = data.drop(['density','age'], axis=1)
#    sns.heatmap(df_corr.corr())



I will try to find the features most highly correlated with the label bodyfat and fit a model on them:
* In our case we can see that the features weight, chest, abdomen, hip and thigh are all closely correlated.
* We will compute the correlation again after dropping the other features from consideration.

In [None]:
# add new coloumn waist to hip ratio
def waist_to_hip(data):
    """Add (or overwrite) a 'waist_to_hip' column holding waist divided by hip.

    The frame is modified in place and nothing is returned.
    """
    ratio = data['waist'].div(data['hip'])
    data["waist_to_hip"] = ratio

In [None]:
def finalise_correlation(data, cols_to_drop):
    """Draw an annotated correlation heatmap of `data` without `cols_to_drop`.

    Returns the reduced DataFrame (the columns that were kept), not the
    correlation matrix. `data` itself is left unchanged.
    """
    remaining = data.drop(columns=cols_to_drop)
    correlation_matrix = remaining.corr()
    sns.heatmap(correlation_matrix, annot=True)
    return remaining

NOTE: we see here that abdomen circumference is fairly highly correlated with, and a key contributor to, the body fat percentage. However, according to my domain knowledge, the waist (abdomen) to hip ratio is a significant contributor when calculating body fat percentage.
So, with this in mind, I will do some "feature engineering" and add WHR (waist-to-hip ratio) as another feature in the table.

sources:
1. https://www.bhf.org.uk/informationsupport/heart-matters-magazine/nutrition/weight/best-way-to-measure-body-fat
    
2. https://www.medicalnewstoday.com/articles/319439#how-does-waist-to-hip-ratio-affect-health
        

Conclusion: Since the features Bodyfat and Waist are highly correlated with the WHR, we will construct a linear regression model around Bodyfat and WHR.

# Collinearity, Transformation

Checking the Linear Hypothesis

In [None]:
# Check the linear hypothesis using a scatter plot
def plot_scatter(X,y):
    """Scatter-plot `y` against `X` and overlay a degree-1 least-squares fit line.

    Parameters
    ----------
    X, y : array-likes of equal length (Series, single-column DataFrame,
        ndarray or list).

    A single-column 2-D input such as `data[["waist"]]` is flattened to 1-D first,
    because `np.polyfit` only accepts a 1-D `x`. Inputs with more than one column
    are passed on unchanged and rejected by `np.polyfit`, as before.

    Draws on the current matplotlib axes; returns None.
    """
    x_np = np.asarray(X)
    y_np = np.asarray(y)
    # (n, 1) -> (n,): lets a one-column frame be used like a plain Series.
    if x_np.ndim == 2 and x_np.shape[1] == 1:
        x_np = x_np[:, 0]
    if y_np.ndim == 2 and y_np.shape[1] == 1:
        y_np = y_np[:, 0]
    sns.set()
    plt.plot(x_np, y_np, 'o')
    slope, intercept = np.polyfit(x_np, y_np, 1)
    plt.plot(x_np, slope * x_np + intercept)
    
    

Normalising the Distribution:
Since we do not have multiple features to predict the label, we don't use any transformation methods like StandardScaler or MinMaxScaler.

We go on to: 1. create a model after the train/test split; 2. check the error metrics to assess the accuracy; 3. save the model; 4. use external data on this model to check its predictions.

Separate the features from the labels

In [None]:
#def separate_label_features(data):
#    y = data['TARGET_D']
#    X = data.drop(['TARGET_D'], axis=1)
    
    

In [None]:
#pd.set_option('display.max_columns', None)

In [None]:
# Function calls and declarations
# NOTE: this cell mutates `data`; run it once per fresh load of the CSV
# (re-running it converts the units a second time).
cols_to_drop = ['density','age','neck','thigh','height','knee','ankle','forearm','wrist']

# Standardize the headers: 'Abdomen' -> 'Waist', then lower-case everything.
rename_coloumn(data)
coloumn_names_lwr(data)
check_missing_data(data)
numerical_values,continuous_values,categorical_values = check_datatypes(data)

# Metric conversion. convert_weight_kg works in place, whereas convert_inch_to_cm
# returns a NEW frame: its result has to be assigned, otherwise every length
# silently stays in inches.
assert numerical_values.shape[1] == data.shape[1], \
    "convert_inch_to_cm expects numeric columns only"
convert_weight_kg(data)
data = convert_inch_to_cm(data)

# Feature engineering: waist-to-hip ratio (unit-free, so unaffected by the conversion).
waist_to_hip(data)

# Keep only the highly correlated features used to predict the label.
df_corr = finalise_correlation(data, cols_to_drop)
df_corr

#numerical_values.head(20)


Plot the feature distributions

In [None]:
# Distribution of the waist measurements.
sns.displot(data, x="waist")
plt.show()

In [None]:
# Distribution of the engineered waist-to-hip ratio.
sns.displot(data, x="waist_to_hip")
plt.show()

In [None]:
# Distribution of the label (body fat percentage).
sns.displot(data, x="bodyfat")
plt.show()

Splitting the data

In [None]:
# Splitting the data
# After confirming the highest correlations, fill X and y.
# To model on the waist-to-hip ratio instead, use: X = data[["waist_to_hip"]]
X = data[["waist"]]
y = data["bodyfat"]

# The scatter plot must come after X and y exist (calling it first raises NameError on
# a fresh kernel); the single feature column is passed as a 1-D Series.
plot_scatter(X.iloc[:, 0], y)

X_train,X_test,y_train,y_test = train_test_split( X, y, test_size=0.35, random_state=4)

# Split outputs
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Waist vs. body fat with the fitted straight line.
plot_scatter(X=data["waist"], y=data["bodyfat"])

Transform the feature(s) before training

In [None]:
# Standardize the feature and plot it against the label.
transformer = StandardScaler()
x_standardized = transformer.fit_transform(X)
# Keep X's index so the standardized frame stays aligned with `data`.
x_standardized_df = pd.DataFrame(x_standardized, columns=X.columns, index=X.index)
# plot_scatter fits a line with np.polyfit, which needs a 1-D x: pass the single
# feature column as a Series rather than the whole one-column frame.
plot_scatter(x_standardized_df.iloc[:, 0], data['bodyfat'])

Regression on Training set

In [None]:
# Fit a linear regression on the training split only; the test split is held out.
lm = LinearRegression()
lm.fit(X_train,y_train)

Both on training and test set

In [None]:
# Predictions of the fitted model on both the training and the test split.
y_pred_train, y_pred_test = lm.predict(X_train), lm.predict(X_test)

# Processing Data

X-y splitting
Normalizing (numericals)
Concating DataFrames ...