In [1]:
import pandas as pd                                                       # data manipulation and analysis
import numpy as np                                                        # numerical computations and array operations
import seaborn as sns                                                     # statistical data visualization
import matplotlib.pyplot as plt                                           # creating plots and visualizations
from sklearn.model_selection import train_test_split, cross_val_score     # splitting data and cross-validation
from sklearn.preprocessing import StandardScaler, OneHotEncoder           # preprocessing tools for scaling and encoding
from sklearn.impute import SimpleImputer                                  # preprossessing tools for handling missing values
from sklearn.linear_model import LinearRegression                         # regression analysis
import tensorflow as tf                                                   # neural network
from sklearn.neighbors import KNeighborsRegressor                         # knn model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error        

In [2]:
fitness = pd.read_csv('flexfield_fitness.csv')
fitness.head(3)

Unnamed: 0,Customer ID,Gender,Age,Hours at Gym (per week),Fitness Goal,Gym Membership Length (years),Calorie Intake
0,221958,Female,27,7.3,Build muscle,4,
1,771155,Female,24,6.2,Endurance training,4,2173.0
2,231932,Male,22,7.4,Fat loss,2,1752.0


In [3]:
coreboost = pd.read_csv('coreboost.csv')
coreboost.head(3)

Unnamed: 0,Customer ID,Gender,Age,Hours at Gym (per week),Fitness Goal,Preferred Sports Drink Type,Average Weekly Consumption (Bottles)
0,221958,Female,27,7.3,Build muscle,Electrolyte,10
1,771155,Female,24,6.2,Endurance training,Electrolyte,11
2,231932,Male,22,7.4,Fat loss,Electrolyte,12


In [4]:
fitness['Hours at Gym (per week)'].fillna(0, inplace = True)
coreboost['Hours at Gym (per week)'].fillna(0, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fitness['Hours at Gym (per week)'].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  coreboost['Hours at Gym (per week)'].fillna(0, inplace = True)


In [5]:
fitness = fitness.dropna(how='any',axis=0)
coreboost = coreboost.dropna(how='any', axis=0)

In [6]:
merged_data = pd.merge(fitness, coreboost, on = ['Customer ID', 'Gender', 'Age', 'Hours at Gym (per week)', 'Fitness Goal'], how = 'inner')
merged_data.head(3)

Unnamed: 0,Customer ID,Gender,Age,Hours at Gym (per week),Fitness Goal,Gym Membership Length (years),Calorie Intake,Preferred Sports Drink Type,Average Weekly Consumption (Bottles)
0,771155,Female,24,6.2,Endurance training,4,2173.0,Electrolyte,11
1,231932,Male,22,7.4,Fat loss,2,1752.0,Electrolyte,12
2,465838,Female,23,6.0,Build muscle,0,2884.0,Protein shakes,10


In [11]:
import statsmodels.formula.api as smf
# includes interaction terms between hours at gym, dietary preferences, calorie intake, and gym membership length
formula = 'Q("Average Weekly Consumption (Bottles)") ~ Q("Hours at Gym (per week)")  * Q("Fitness Goal") * Q("Gym Membership Length (years)") * Q("Preferred Sports Drink Type")'

# fit the linear regression model
model = smf.ols(formula=formula, data=merged_data).fit()

model_summary = model.summary()
model_summary

0,1,2,3
Dep. Variable:,"Q(""Average Weekly Consumption (Bottles)"")",R-squared:,0.198
Model:,OLS,Adj. R-squared:,0.122
Method:,Least Squares,F-statistic:,2.595
Date:,"Sat, 26 Oct 2024",Prob (F-statistic):,2.5e-11
Time:,21:32:27,Log-Likelihood:,-2029.2
No. Observations:,911,AIC:,4218.0
Df Residuals:,831,BIC:,4604.0
Df Model:,79,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,12.6762,1.575,8.049,0.000,9.585,15.768
"Q(""Fitness Goal"")[T.Endurance training]",-2.3180,2.565,-0.904,0.366,-7.353,2.717
"Q(""Fitness Goal"")[T.Fat loss]",-2.6631,2.183,-1.220,0.223,-6.948,1.621
"Q(""Fitness Goal"")[T.Flexibility]",-11.9515,6.739,-1.773,0.077,-25.180,1.277
"Q(""Fitness Goal"")[T.General health]",-5.5578,2.099,-2.648,0.008,-9.677,-1.438
"Q(""Preferred Sports Drink Type"")[T.Energy drinks]",-4.2705,3.581,-1.192,0.233,-11.300,2.759
"Q(""Preferred Sports Drink Type"")[T.Fresh juice]",-9.5830,5.247,-1.826,0.068,-19.882,0.716
"Q(""Preferred Sports Drink Type"")[T.Protein shakes]",-4.5568,1.857,-2.454,0.014,-8.202,-0.912
"Q(""Fitness Goal"")[T.Endurance training]:Q(""Preferred Sports Drink Type"")[T.Energy drinks]",14.0121,10.887,1.287,0.198,-7.358,35.382

0,1,2,3
Omnibus:,23.312,Durbin-Watson:,2.042
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.419
Skew:,0.388,Prob(JB):,4.98e-06
Kurtosis:,3.204,Cond. No.,42000.0
