In [271]:
"""
Helper functions for the final project, which aims to better
understand how different variables relate to obesity levels.
"""
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import accuracy_score


# Label -> numeric code tables applied by import_and_clean.
_GENDER_CODES = {"Male": 0, "Female": 1}
_YES_NO_CODES = {"no": 0, "yes": 1}
# NOTE(review): "Always" is coded below "Frequently", so these codes are not
# monotonic in frequency. The coding is kept unchanged so that existing
# results stay reproducible; confirm whether this ordering is intentional.
_FREQUENCY_CODES = {"no": 0, "Sometimes": 1, "Always": 2, "Frequently": 3}
_OBESITY_LEVEL_CODES = {
    "Insufficient_Weight": 0,
    "Normal_Weight": 1,
    "Overweight_Level_I": 2,
    "Overweight_Level_II": 3,
    "Obesity_Type_I": 4,
    "Obesity_Type_II": 5,
    "Obesity_Type_III": 6,
}
# Which code table applies to which column.
_COLUMN_CODES = {
    "Gender": _GENDER_CODES,
    "family_history_with_overweight": _YES_NO_CODES,
    "FAVC": _YES_NO_CODES,
    "SCC": _YES_NO_CODES,
    "SMOKE": _YES_NO_CODES,
    "NObeyesdad": _OBESITY_LEVEL_CODES,
    "CAEC": _FREQUENCY_CODES,
    "CALC": _FREQUENCY_CODES,
}


def _encode_column(column, codes):
    """Return `column` with every label found in `codes` swapped for its code.

    Values that have no entry in `codes` are passed through unchanged.
    """
    return column.map(lambda label: codes.get(label, label))


def import_and_clean(dataset: str):
    """
    This function imports and cleans an obesity dataset such as
    "obesity.csv". Rows with missing values are dropped and the
    categorical columns are recoded as numbers:

    * "Gender": Male=0, Female=1
    * "family_history_with_overweight", "FAVC", "SCC", "SMOKE": no=0, yes=1
    * "CAEC", "CALC": no=0, Sometimes=1, Always=2, Frequently=3
    * "NObeyesdad": 0-6, from insufficient weight to obesity type III

    Parameters
    ==========
    dataset: string
        name of the csv dataset file
    Returns
    =======
    "cleaned" DataFrame with missing values removed and the columns
    listed above replaced with numerical values for easier analysis.
    All other columns are returned as read from the file.
    Raises
    ======
    KeyError
        if one of the columns listed above is missing from the file.
    """
    raw = pd.read_csv(dataset).dropna()
    # Build the recoded columns and assign them back explicitly. Calling
    # `raw[col].replace(..., inplace=True)` only mutates a temporary Series
    # and leaves the DataFrame untouched under pandas Copy-on-Write.
    encoded = {
        name: _encode_column(raw[name], codes)
        for name, codes in _COLUMN_CODES.items()
    }
    # `assign` returns a new frame and keeps the original column order.
    return raw.assign(**encoded)


def data_visualization(dataset: str):
    """
    This function generates graphs to show the relationships
    between selected variables and obesity level. This gives us
    a better understanding of what factors can make someone more
    susceptible to obesity.

    Four separate figures are produced:

    * regression plot of obesity level against alcohol consumption ("CALC")
    * regression plot of obesity level against frequency of physical
      activity ("FAF")
    * bar plot of obesity level by gender
    * bar plot of obesity level by family history of overweight

    Parameters
    ==========
    dataset: string
        name of the csv dataset file
    """
    obesity = import_and_clean(dataset)
    obesity_label = "Obesity level (NObeyesdad code)"

    # lmplot is figure-level: each call opens its own figure.
    # Relationship between alcohol consumption and obesity level.
    alcohol_grid = sns.lmplot(x="CALC", y="NObeyesdad", data=obesity)
    alcohol_grid.set_axis_labels("Alcohol consumption (CALC code)", obesity_label)
    # Relationship between frequency of physical activity and obesity level.
    activity_grid = sns.lmplot(x="FAF", y="NObeyesdad", data=obesity)
    activity_grid.set_axis_labels("Physical activity frequency (FAF)", obesity_label)

    # sns.barplot draws on the *current* axes, which at this point belong to
    # the last lmplot figure, so the bars would be painted over that plot and
    # over each other. catplot(kind="bar") draws the same bars in a new figure.
    # Difference in obesity level between males and females.
    gender_grid = sns.catplot(x="Gender", y="NObeyesdad", data=obesity,
                              kind="bar")
    gender_grid.set_axis_labels("Gender (0 = male, 1 = female)", obesity_label)
    # Difference in obesity level between people with and without a family
    # history of overweight.
    family_grid = sns.catplot(x="family_history_with_overweight",
                              y="NObeyesdad", data=obesity, kind="bar")
    family_grid.set_axis_labels(
        "Family history of overweight (0 = no, 1 = yes)", obesity_label)


def descriptive_statistics(dataset: str):
    """
    This function summarises the main numeric columns of the cleaned
    obesity data so that we can better understand the dataset.
    Parameters
    ==========
    dataset: string
        name of the csv dataset file
    Returns
    =======
    DataFrame produced by `describe()` (count, mean, standard deviation,
    minimum, quartiles and maximum) for the columns "Age", "Height",
    "Weight", "NCP", "FAF", "CALC" and "NObeyesdad".
    """
    # Columns included in the overview of the dataset.
    columns_of_interest = ["Age", "Height", "Weight", "NCP", "FAF", "CALC",
                           "NObeyesdad"]
    cleaned = import_and_clean(dataset)
    summary = cleaned.loc[:, columns_of_interest].describe()
    return summary


def data_model(dataset: str):
    """
    This function fits an OLS regression of obesity level
    ("NObeyesdad") on the numerically coded predictors and prints
    the OLS regression table followed by the in-sample accuracy of
    the rounded predictions.
    Parameters
    ==========
    dataset: string
        name of the csv dataset file
    """
    obesity = import_and_clean(dataset)
    lhs = obesity["NObeyesdad"]
    # NOTE(review): "SCC" is recoded by import_and_clean but is not used as a
    # regressor here -- confirm that this omission is intentional.
    ind_vars = ["Gender","Age","Weight","Height","family_history_with_overweight",
                "FAVC","FCVC","NCP","CAEC","SMOKE","CH2O","FAF","TUE","CALC"]
    rhs = sm.add_constant(obesity.loc[:, ind_vars])
    res = sm.OLS(lhs, rhs).fit()
    print(res.summary())
    # Turn the continuous fitted values into class labels: round to the
    # nearest integer and clip to the observed label range, because a linear
    # fit can fall below the lowest or above the highest class and such a
    # value could never match any label.
    yhat = res.predict(rhs)
    prediction = yhat.round().clip(lhs.min(), lhs.max()).astype(int)
    # The model is scored on the same rows it was fitted on, so this is an
    # in-sample (training) accuracy, not a held-out test accuracy.
    accuracy = accuracy_score(lhs, prediction)
    print('In-sample accuracy = ', accuracy*100, '%')
    

In [264]:
# Summary statistics (count, mean, std, quartiles, min/max) for the selected columns.
descriptive_statistics("obesity.csv")

Unnamed: 0,Age,Height,Weight,NCP,FAF,CALC,NObeyesdad
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.685628,1.010298,0.764093,3.112269
std,6.345968,0.093305,26.191172,0.778039,0.850592,0.616717,1.985062
min,14.0,1.45,39.0,1.0,0.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.658738,0.124505,0.0,1.0
50%,22.77789,1.700499,83.0,3.0,1.0,1.0,3.0
75%,26.0,1.768464,107.430682,3.0,1.666678,1.0,5.0
max,61.0,1.98,173.0,4.0,3.0,3.0,6.0


In [265]:
# Fit the OLS model on obesity.csv and print its regression table and accuracy.
data_model("obesity.csv")

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,NObeyesdad,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.951
Method:,Least Squares,F-statistic:,2899.0
Date:,"Fri, 17 Dec 2021",Prob (F-statistic):,0.0
Time:,23:51:05,Log-Likelihood:,-1261.2
No. Observations:,2111,AIC:,2552.0
Df Residuals:,2096,BIC:,2637.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.0185,0.269,33.585,0.000,8.492,9.545
Gender,-0.0653,0.027,-2.460,0.014,-0.117,-0.013
Age,0.0137,0.002,8.166,0.000,0.010,0.017
Weight,0.0771,0.001,143.382,0.000,0.076,0.078
Height,-7.5931,0.166,-45.816,0.000,-7.918,-7.268
family_history_with_overweight,0.3233,0.030,10.889,0.000,0.265,0.382
FAVC,0.0386,0.032,1.204,0.229,-0.024,0.102
FCVC,-0.0008,0.020,-0.039,0.969,-0.040,0.038
NCP,0.0266,0.013,2.054,0.040,0.001,0.052

0,1,2,3
Omnibus:,124.101,Durbin-Watson:,1.244
Prob(Omnibus):,0.0,Jarque-Bera (JB):,294.634
Skew:,-0.349,Prob(JB):,1.05e-64
Kurtosis:,4.692,Cond. No.,3050.0


In [266]:
# Same summary-statistics call as in an earlier cell; the table below is identical.
descriptive_statistics("obesity.csv")

Unnamed: 0,Age,Height,Weight,NCP,FAF,CALC,NObeyesdad
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.685628,1.010298,0.764093,3.112269
std,6.345968,0.093305,26.191172,0.778039,0.850592,0.616717,1.985062
min,14.0,1.45,39.0,1.0,0.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.658738,0.124505,0.0,1.0
50%,22.77789,1.700499,83.0,3.0,1.0,1.0,3.0
75%,26.0,1.768464,107.430682,3.0,1.666678,1.0,5.0
max,61.0,1.98,173.0,4.0,3.0,3.0,6.0
