In [17]:
#importing libraries 
import pandas as pd
#from fancyimpute import IterativeImputer
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [18]:
# Load the CSV file into a DataFrame
df_imputed = pd.read_csv('df_imputed.csv')

In [20]:
df_imputed.dtypes

# List of continuous columns
continuous_cols = ['Frequency_Racket', 'Duration_Racket','Mins_Badminton_Perweek',
                   'Badminton_Duration','Badminton_12_Momths','Age','Time','IMD']

# Convert variables not in continuous_cols to object data type
for col in df_imputed.columns:
    if col not in continuous_cols:
        df_imputed[col] = df_imputed[col].astype('object')




# Convert variables to numeric data type
df_imputed['Plays_badminton'] = pd.to_numeric(df_imputed['Plays_badminton'], errors='coerce')
df_imputed['Plays_racket'] = pd.to_numeric(df_imputed['Plays_racket'], errors='coerce')
df_imputed['Serial'] = df_imputed['Serial'].astype("string")


# Check the data types
print(df_imputed.dtypes)

Frequency_Racket          float64
Duration_Racket           float64
Mins_Badminton_Perweek    float64
Badminton_Duration        float64
Badminton_12_Momths       float64
Time                        int64
IMD                       float64
Serial                     string
LA                         object
Age                       float64
Child                      object
Diability                  object
Education                  object
Ethnicity                  object
Gender                     object
Workstatus                 object
BMI                        object
Badminton_Frequency        object
Plays_badminton           float64
Plays_racket              float64
dtype: object


## Data Encoding

In [30]:
df_imputed

Unnamed: 0,Frequency_Racket,Duration_Racket,Mins_Badminton_Perweek,Badminton_Duration,Badminton_12_Momths,Time,IMD,Serial,LA,Age,Child,Diability,Education,Ethnicity,Gender,Workstatus,BMI,Badminton_Frequency,Plays_badminton,Plays_racket
0,0.0,0.0,0.0,0.0,0.0,1,10.0,160480126774181.0,155.0,8.0,1.0,2.0,1.0,7.0,2.0,5.0,2.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1,6.0,151160011004881.0,123.0,6.0,1.0,2.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1,10.0,151160011005981.0,123.0,4.0,3.0,2.0,3.0,1.0,2.0,6.0,2.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1,9.0,151160011007481.0,123.0,7.0,1.0,2.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1,9.0,151160011007482.0,123.0,7.0,1.0,2.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117750,0.0,0.0,0.0,0.0,0.0,6,6.0,211090336822241.0,174.0,2.0,1.0,1.0,2.0,1.0,2.0,8.0,2.0,0.0,0.0,0.0
1117751,0.0,0.0,0.0,0.0,0.0,6,9.0,211090336822991.0,78.0,5.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0
1117752,0.0,0.0,0.0,0.0,0.0,6,5.0,211090336823721.0,78.0,6.0,1.0,2.0,3.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0
1117753,0.0,0.0,0.0,0.0,0.0,6,5.0,211090336823722.0,78.0,6.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0


In [31]:
#Encoding the data 
X = pd.get_dummies(df_imputed[[ 'Child','Diability','Education', 
                               'Ethnicity', 'Gender', 'Workstatus', 'BMI']],drop_first=True)



X["Time"] = df_imputed['Time']
X["IMD"] = df_imputed['IMD']
X['Age'] = df_imputed['Age']
#Standardising the data 
# Scale the data using StandardScaler
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

  X = pd.get_dummies(df_imputed[[ 'Child','Diability','Education',


In [32]:
X_scaled

Unnamed: 0,Child_2.0,Child_3.0,Child_4.0,Diability_2.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,Education_6.0,Ethnicity_2.0,Ethnicity_3.0,Ethnicity_4.0,Ethnicity_5.0,Ethnicity_6.0,Ethnicity_7.0,Gender_2.0,Gender_3.0,Workstatus_2.0,Workstatus_3.0,Workstatus_4.0,Workstatus_5.0,Workstatus_6.0,Workstatus_7.0,Workstatus_8.0,Workstatus_9.0,Workstatus_10.0,BMI_2.0,BMI_3.0,BMI_4.0,BMI_5.0,Time,IMD,Age
0,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,-0.436941,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,11.434535,0.89269,-0.037767,-0.431869,-0.118028,-0.120707,1.583657,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,-1.406496,1.543002,1.522406
1,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,-0.436941,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,-1.068711,1.541757,-0.399065,-0.12427,-1.406496,0.146220,0.407251
2,-0.360761,2.957257,-0.176585,0.454672,-0.401493,2.288636,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,0.89269,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,5.507812,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,-1.406496,1.543002,-0.707905
3,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,2.288636,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,-1.406496,1.193806,0.964828
4,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,2.288636,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,-1.406496,1.193806,0.964828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117750,-0.360761,-0.338151,-0.176585,-2.199386,2.490704,-0.436941,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,0.89269,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,5.326256,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,1.503443,0.146220,-1.823060
1117751,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,-0.436941,-0.142971,-0.222499,-0.294742,4.226031,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,1.503443,1.193806,-0.150327
1117752,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,2.288636,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,-1.068711,1.541757,-0.399065,-0.12427,1.503443,-0.202976,0.407251
1117753,-0.360761,-0.338151,-0.176585,0.454672,2.490704,-0.436941,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,0.89269,-0.037767,2.315516,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,1.503443,-0.202976,0.407251


## Logistic Regression

In [33]:
#performing logistic regression to explore the more simple patterns. 
# Separate the features and target variable

import statsmodels.api as sm

# Separate the features and target variable
X = X_scaled
y = df_imputed['Plays_badminton']

# Add constant to the features
X = sm.add_constant(X)

# Count the number of samples in each class
class_counts = df_imputed['Plays_badminton'].value_counts()

# Calculate inverse probability weights
weights = class_counts.sum() / (class_counts.shape[0] * class_counts)

# Create and fit logistic regression model with inverse probability weights
logit = sm.Logit(y, X)
result = logit.fit(method='bfgs', maxiter=100, disp=False, w=weights[y])

# Print summary of the model
print(result.summary())



                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:              1117755
Model:                          Logit   Df Residuals:                  1117721
Method:                           MLE   Df Model:                           33
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.05452
Time:                        06:08:34   Log-Likelihood:            -1.0451e+05
converged:                       True   LL-Null:                   -1.1054e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.1592      0.009   -481.574      0.000      -4.176      -4.142
Child_2.0           0.0199      0.007      3.067      0.002       0.007       0.033
Child_3.0           0.0451      

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


#Time Based Regression 

# replace 'Time', 'X_scaled', and 'Y' with your actual data
time_col = X_scaled['Time']
feature_col = X_scaled.drop(['Time'], axis=1)
target_col = df_imputed['Plays_badminton']

# Get the unique times
unique_times = time_col.unique()

# DataFrame to hold results
results = pd.DataFrame(columns=["Time", "Coefficients", "Intercept", "MSE", "R2 score"])

# Time based regression
for time in unique_times:
    time_indices = time_col[time_col == time].index

    X_time = feature_col.loc[time_indices]
    y_time = target_col.loc[time_indices]

    X_train, X_test, y_train, y_test = train_test_split(X_time, y_time, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results = results.append({
        "Time": time if pd.notnull(time) else "Combined",
        "Coefficients": model.coef_[0],
        "Intercept": model.intercept_,
        "MSE": mse,
        "R2 score": r2}, ignore_index=True)

# Combined times regression
X = feature_col
y = target_col

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

results = results.append({
    "Time": "Combined",
    "Coefficients": model.coef_[0],
    "Intercept": model.intercept_,
    "MSE": mse,
    "R2 score": r2}, ignore_index=True)

print(results)

  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({


       Time  Coefficients  Intercept       MSE  R2 score
0 -1.406496     -0.000672   0.027173  0.024318  0.011053
1 -0.824508     -0.001790   0.025839  0.024222  0.013556
2  -0.24252      0.000672   0.022174  0.021318  0.009327
3  0.339467      0.000168   0.021478  0.021179  0.006219
4  0.921455      0.002445   0.013089  0.013540  0.008014
5  1.503443      0.000956   0.008966  0.008286  0.005860
6  Combined      0.000087   0.020182  0.019808  0.009971


  results = results.append({


In [35]:
feature_names = feature_col.columns

# Create a new DataFrame to hold coefficients
coef_df = pd.DataFrame(columns=['Time'] + list(feature_names))

# Then inside your loop, instead of the results.append do:
coef_df = coef_df.append(pd.Series([time]+list(model.coef_), index=coef_df.columns), ignore_index=True)

# And for the combined model
coef_df = coef_df.append(pd.Series(['Combined']+list(model.coef_), index=coef_df.columns), ignore_index=True)

# Print the coefficients
print(coef_df)

       Time  Child_2.0  Child_3.0  Child_4.0  Diability_2.0  Education_2.0  \
0  1.503443   0.000087     0.0009    -0.0001       0.003069      -0.001984   
1  Combined   0.000087     0.0009    -0.0001       0.003069      -0.001984   

   Education_3.0  Education_4.0  Education_5.0  Education_6.0  Ethnicity_2.0  \
0      -0.001568      -0.000932       -0.00106      -0.001736      -0.001415   
1      -0.001568      -0.000932       -0.00106      -0.001736      -0.001415   

   Ethnicity_3.0  Ethnicity_4.0  Ethnicity_5.0  Ethnicity_6.0  Ethnicity_7.0  \
0       0.004203      -0.001003        0.00273       0.000118      -0.000116   
1       0.004203      -0.001003        0.00273       0.000118      -0.000116   

   Gender_2.0  Gender_3.0  Workstatus_2.0  Workstatus_3.0  Workstatus_4.0  \
0   -0.003054   -0.000689        -0.00066       -0.000822       -0.001173   
1   -0.003054   -0.000689        -0.00066       -0.000822       -0.001173   

   Workstatus_5.0  Workstatus_6.0  Workstatus_7.0  

  coef_df = coef_df.append(pd.Series([time]+list(model.coef_), index=coef_df.columns), ignore_index=True)
  coef_df = coef_df.append(pd.Series(['Combined']+list(model.coef_), index=coef_df.columns), ignore_index=True)


## Time Based and Time Combined Regression Analysis

## With class-Weights Applied 

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set pandas display options
pd.set_option('display.max_columns', None)  # Print all columns

# replace 'Time', 'X_scaled', and 'Y' with your actual data
time_col = X_scaled['Time']
feature_col = X_scaled.drop(['Time'], axis=1)
target_col = df_imputed['Plays_badminton']

# Get the unique times
unique_times = time_col.unique()

# Define column names for results dataframe
column_names = ["Time", "Intercept", "Precision", "Recall", "F1-score"] + list(feature_col.columns)

# DataFrame to hold results
results = pd.DataFrame(columns=column_names)

# Time-based regression
for time in unique_times:
    time_indices = time_col[time_col == time].index

    X_time = feature_col.loc[time_indices]
    y_time = target_col.loc[time_indices]

    X_train, X_test, y_train, y_test = train_test_split(X_time, y_time, test_size=0.2, random_state=42)

    # Calculate class weights
    class_weights = len(y_train) / (2 * np.bincount(y_train))

    model = LogisticRegression(class_weight={0: class_weights[0], 1: class_weights[1]})
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Create a dictionary of feature names and coefficients
    coef_dict = dict(zip(feature_col.columns, model.coef_.flatten()))
    coef_dict.update({"Time": time if pd.notnull(time) else "Combined", "Intercept": model.intercept_[0], "Precision": precision, "Recall": recall, "F1-score": f1})

    results = results.append(coef_dict, ignore_index=True)

# Combined times regression
X = feature_col
y = target_col

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Replacing periods in column names with an underscore
X_train.columns = X_train.columns.str.replace('.', '_')
X_test.columns = X_test.columns.str.replace('.', '_')

# Calculate class weights
class_weights = len(y_train) / (2 * np.bincount(y_train))

model = LogisticRegression(class_weight={0: class_weights[0], 1: class_weights[1]})
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Create a dictionary of feature names and coefficients
coef_dict = dict(zip(feature_col.columns, model.coef_.flatten()))
coef_dict.update({"Time": "Combined", "Intercept": model.intercept_[0], "Precision": precision, "Recall": recall, "F1-score": f1})

results = results.append(coef_dict, ignore_index=True)

results

  results = results.append(coef_dict, ignore_index=True)
  results = results.append(coef_dict, ignore_index=True)
  results = results.append(coef_dict, ignore_index=True)
  results = results.append(coef_dict, ignore_index=True)
  results = results.append(coef_dict, ignore_index=True)
  results = results.append(coef_dict, ignore_index=True)
  X_train.columns = X_train.columns.str.replace('.', '_')
  X_test.columns = X_test.columns.str.replace('.', '_')
  results = results.append(coef_dict, ignore_index=True)


Unnamed: 0,Time,Intercept,Precision,Recall,F1-score,Child_2.0,Child_3.0,Child_4.0,Diability_2.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,Education_6.0,Ethnicity_2.0,Ethnicity_3.0,Ethnicity_4.0,Ethnicity_5.0,Ethnicity_6.0,Ethnicity_7.0,Gender_2.0,Gender_3.0,Workstatus_2.0,Workstatus_3.0,Workstatus_4.0,Workstatus_5.0,Workstatus_6.0,Workstatus_7.0,Workstatus_8.0,Workstatus_9.0,Workstatus_10.0,BMI_2.0,BMI_3.0,BMI_4.0,BMI_5.0,IMD,Age
0,-1.406496,-0.281341,0.041484,0.613569,0.077713,0.005641,0.043943,0.005421,0.275261,-0.064313,-0.079681,-0.057508,-0.058145,-0.209877,-0.094158,0.143013,-0.03766,0.076995,0.01511,0.009095,-0.132019,0.01061,-0.009536,-0.018607,-0.093374,0.002975,-0.077997,-0.091958,0.08146,-0.00845,-0.025184,0.047491,-0.005305,-0.083453,-0.035091,-0.211051,-0.404033
1,-0.824508,-0.283943,0.040957,0.625621,0.076881,-0.026548,-0.016775,-0.000424,0.306814,-0.081721,-0.058063,-0.090467,-0.06824,-0.185542,-0.070107,0.141139,-0.010398,0.07179,0.00568,-0.005415,-0.127807,0.010708,-0.039563,-0.030953,-0.030541,0.057518,-0.084662,-0.120333,0.04326,0.003204,-0.062338,-0.024014,-0.078946,-0.087481,-0.077639,-0.218384,-0.449212
2,-0.24252,-0.217636,0.034187,0.686473,0.065131,0.028008,0.037399,-0.029355,0.339772,-0.062816,-0.07668,-0.016494,-0.042722,-0.187671,-0.022499,0.133247,-0.057348,0.077682,0.000169,0.002742,-0.159578,0.013358,-0.014204,-0.032873,-0.060818,-0.027412,-0.030267,-0.158206,0.109037,0.011161,-0.019656,-0.069762,-0.063744,-0.178429,-0.157449,-0.176235,-0.220897
3,0.339467,-0.195027,0.03223,0.605563,0.061202,0.010438,0.028916,0.003089,0.260512,-0.083174,-0.14853,-0.07501,-0.086183,-0.195395,-0.055756,0.162009,-0.079742,0.072385,0.019313,0.020006,-0.163161,-0.020514,-0.028006,-0.004083,-0.078177,0.060449,-0.09796,-0.102253,0.088617,0.008747,-0.012706,0.047224,0.043508,-0.075206,-0.073515,-0.201614,-0.24997
4,0.921455,-0.36402,0.024453,0.621951,0.047055,0.137799,0.162885,0.060167,0.31492,-0.166016,-0.163961,-0.14583,-0.082443,-0.167981,-0.044604,0.137438,-0.040083,0.05283,0.011548,-0.003099,-0.084121,-0.059365,0.021105,-0.020311,-0.056565,-0.047751,-0.059903,-0.09784,0.145003,0.038569,-0.006344,-0.147689,-0.175107,-0.157472,-0.18918,-0.229039,-0.26833
5,1.503443,-0.334317,0.015116,0.630872,0.029525,0.081267,0.131756,0.048425,0.217154,-0.072586,-0.11692,-0.047812,-0.054317,-0.156006,0.03486,0.189231,-0.114383,0.091857,0.023512,-0.009935,-0.203961,-0.032613,-0.00749,-0.017108,-0.041243,-0.045679,-0.064472,-0.164667,0.13236,0.048202,-0.053716,0.037528,-0.029078,-0.205464,-0.037423,0.272444,-0.265678
6,Combined,-0.235301,0.033216,0.648927,0.063197,0.020635,0.049534,0.009409,0.293337,-0.082065,-0.082839,-0.053011,-0.065702,-0.183925,-0.057441,0.13198,-0.04631,0.069251,0.007862,0.001645,-0.140047,-0.040349,-0.01276,-0.032662,-0.076163,0.034865,-0.067862,-0.129739,0.083166,0.011351,-0.03141,-0.031399,-0.053176,-0.138719,-0.082701,-0.18943,-0.352395


In [37]:
time_col = X_scaled['Time']
feature_col = X_scaled.drop(['Time'], axis=1)
target_col = df_imputed['Plays_badminton']

# Get the unique times
unique_times = time_col.unique()

# DataFrame to hold results
results = []

# Time-based regression
for time in unique_times:
    time_indices = time_col[time_col == time].index

    X_time = feature_col.loc[time_indices]
    y_time = target_col.loc[time_indices]

    # Calculate class weights
    class_weights = len(y_time) / (2 * np.bincount(y_time))

    model = sm.Logit(y_time, sm.add_constant(X_time))
    results_time = model.fit(class_weight={0: class_weights[0], 1: class_weights[1]})

    # Store the results in a list
    results.append((time, results_time))

# Combined times regression
X = feature_col
y = target_col

# Calculate class weights
class_weights_combined = len(y) / (2 * np.bincount(y))

model = sm.Logit(y, sm.add_constant(X))
results_combined = model.fit(class_weight={0: class_weights_combined[0], 1: class_weights_combined[1]})

# Store the results for the combined model
results.append(("Combined", results_combined))

# Print results
for time, result in results:
    print(f"\nRegression Model for {time}:\n")
    print(result.summary())



Optimization terminated successfully.
         Current function value: 0.117230
         Iterations 9




Optimization terminated successfully.
         Current function value: 0.114214
         Iterations 9




Optimization terminated successfully.
         Current function value: 0.101374
         Iterations 10




Optimization terminated successfully.
         Current function value: 0.099687
         Iterations 9




Optimization terminated successfully.
         Current function value: 0.070074
         Iterations 10




Optimization terminated successfully.
         Current function value: 0.048867
         Iterations 10




Optimization terminated successfully.
         Current function value: 0.094405
         Iterations 9

Regression Model for -1.4064956221653189:

                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               201579
Model:                          Logit   Df Residuals:                   201547
Method:                           MLE   Df Model:                           31
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.05609
Time:                        06:08:45   Log-Likelihood:                -23631.
converged:                       True   LL-Null:                       -25035.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Child_2.0          -0.0051      0.014     -0.380      

                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               181535
Model:                          Logit   Df Residuals:                   181502
Method:                           MLE   Df Model:                           32
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.03708
Time:                        06:08:46   Log-Likelihood:                -18097.
converged:                       True   LL-Null:                       -18794.
Covariance Type:            nonrobust   LLR p-value:                7.536e-273
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.0049      0.020   -205.162      0.000      -4.043      -3.967
Child_2.0           0.0266      0.016      1.672      0.095      -0.005       0.058
Child_3.0           0.0320      

                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:              1117755
Model:                          Logit   Df Residuals:                  1117722
Method:                           MLE   Df Model:                           32
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.04540
Time:                        06:08:48   Log-Likelihood:            -1.0552e+05
converged:                       True   LL-Null:                   -1.1054e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.1159      0.008   -489.109      0.000      -4.132      -4.099
Child_2.0           0.0152      0.006      2.335      0.020       0.002       0.028
Child_3.0           0.0411      

In [38]:
time_col = X_scaled['Time']
feature_col = X_scaled.drop(['Time'], axis=1)
target_col = df_imputed['Plays_badminton']

# Get the unique times
unique_times = time_col.unique()

# DataFrame to hold results
results = []

# Train-test split parameters
test_size = 0.2
random_state = 42

# Time-based regression
for time in unique_times:
    time_indices = time_col[time_col == time].index

    X_time = feature_col.loc[time_indices]
    y_time = target_col.loc[time_indices]

    # Calculate class weights
    class_weights = len(y_time) / (2 * np.bincount(y_time))

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_time, y_time, test_size=test_size, random_state=random_state
    )

    # Fit the model
    model = sm.Logit(y_train, sm.add_constant(X_train))
    results_time = model.fit(class_weight={0: class_weights[0], 1: class_weights[1]})

    # Store the results in a list
    results.append((time, results_time, X_test, y_test))

# Combined times regression
X = feature_col
y = target_col

# Calculate class weights
class_weights_combined = len(y) / (2 * np.bincount(y))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Fit the model
model = sm.Logit(y_train, sm.add_constant(X_train))
results_combined = model.fit(class_weight={0: class_weights_combined[0], 1: class_weights_combined[1]})

# Store the results for the combined model
results.append(("Combined", results_combined, X_test, y_test))

# Print results
for time, result, X_test, y_test in results:
    print(f"\nRegression Model for {time}:\n")
    print(result.summary())

    # Additional evaluation on the test set
    y_pred = result.predict(sm.add_constant(X_test))
    y_pred_binary = np.where(y_pred > 0.5, 1, 0)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)

    print("Test Set Evaluation:")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")



Optimization terminated successfully.
         Current function value: 0.118551
         Iterations 9




Optimization terminated successfully.
         Current function value: 0.115013
         Iterations 9




Optimization terminated successfully.
         Current function value: 0.101429
         Iterations 10




Optimization terminated successfully.
         Current function value: 0.099195
         Iterations 9




Optimization terminated successfully.
         Current function value: 0.070238
         Iterations 10




Optimization terminated successfully.
         Current function value: 0.049552
         Iterations 10




Optimization terminated successfully.
         Current function value: 0.094284
         Iterations 9

Regression Model for -1.4064956221653189:

                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               161263
Model:                          Logit   Df Residuals:                   161231
Method:                           MLE   Df Model:                           31
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.05757
Time:                        06:08:53   Log-Likelihood:                -19118.
converged:                       True   LL-Null:                       -20286.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Child_2.0          -0.0047      0.015     -0.316      

  _warn_prf(average, modifier, msg_start, len(result))


                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               159908
Model:                          Logit   Df Residuals:                   159876
Method:                           MLE   Df Model:                           31
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.05592
Time:                        06:08:53   Log-Likelihood:                -18391.
converged:                       True   LL-Null:                       -19481.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Child_2.0          -0.0322      0.016     -2.057      0.040      -0.063      -0.002
Child_3.0          -0.0203      0.015     -1.313      0.189      -0.051       0.010
Child_4.0          -0.0044      

  _warn_prf(average, modifier, msg_start, len(result))


                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               143797
Model:                          Logit   Df Residuals:                   143764
Method:                           MLE   Df Model:                           32
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.04002
Time:                        06:08:54   Log-Likelihood:                -14585.
converged:                       True   LL-Null:                       -15193.
Covariance Type:            nonrobust   LLR p-value:                4.130e-235
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.0076      0.023   -175.999      0.000      -4.052      -3.963
Child_2.0           0.0241      0.018      1.378      0.168      -0.010       0.058
Child_3.0           0.0434      

  _warn_prf(average, modifier, msg_start, len(result))


                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               145228
Model:                          Logit   Df Residuals:                   145195
Method:                           MLE   Df Model:                           32
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.03861
Time:                        06:08:54   Log-Likelihood:                -14406.
converged:                       True   LL-Null:                       -14984.
Covariance Type:            nonrobust   LLR p-value:                1.154e-222
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.0156      0.022   -183.306      0.000      -4.058      -3.973
Child_2.0           0.0084      0.018      0.463      0.643      -0.027       0.044
Child_3.0           0.0266      

  _warn_prf(average, modifier, msg_start, len(result))


                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               142188
Model:                          Logit   Df Residuals:                   142155
Method:                           MLE   Df Model:                           32
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.04973
Time:                        06:08:54   Log-Likelihood:                -9987.0
converged:                       True   LL-Null:                       -10510.
Covariance Type:            nonrobust   LLR p-value:                4.836e-199
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.5999      0.032   -142.428      0.000      -4.663      -4.537
Child_2.0           0.1407      0.020      6.926      0.000       0.101       0.181
Child_3.0           0.1649      

  _warn_prf(average, modifier, msg_start, len(result))


                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               141818
Model:                          Logit   Df Residuals:                   141785
Method:                           MLE   Df Model:                           32
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.05257
Time:                        06:08:54   Log-Likelihood:                -7027.3
converged:                       True   LL-Null:                       -7417.3
Covariance Type:            nonrobust   LLR p-value:                2.673e-143
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.9906      0.037   -135.814      0.000      -5.063      -4.919
Child_2.0           0.0876      0.026      3.390      0.001       0.037       0.138
Child_3.0           0.1194      

  _warn_prf(average, modifier, msg_start, len(result))


                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:               894204
Model:                          Logit   Df Residuals:                   894171
Method:                           MLE   Df Model:                           32
Date:                Sat, 15 Jul 2023   Pseudo R-squ.:                 0.04470
Time:                        06:08:55   Log-Likelihood:                -84309.
converged:                       True   LL-Null:                       -88254.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.1143      0.009   -438.410      0.000      -4.133      -4.096
Child_2.0           0.0123      0.007      1.689      0.091      -0.002       0.027
Child_3.0           0.0431      

  _warn_prf(average, modifier, msg_start, len(result))
