In [15]:
#importing libraries 
import pandas as pd
#from fancyimpute import IterativeImputer
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [16]:
# Load the CSV file into a DataFrame
df_imputed = pd.read_csv('df_imputed.csv')

In [17]:
df_imputed.dtypes

# List of continuous columns
continuous_cols = ['Frequency_Racket', 'Duration_Racket','Mins_Badminton_Perweek',
                   'Badminton_Duration','Badminton_12_Momths','Age','Time','IMD']

# Convert variables not in continuous_cols to object data type
for col in df_imputed.columns:
    if col not in continuous_cols:
        df_imputed[col] = df_imputed[col].astype('object')




# Convert variables to numeric data type
df_imputed['Plays_badminton'] = pd.to_numeric(df_imputed['Plays_badminton'], errors='coerce')
df_imputed['Plays_racket'] = pd.to_numeric(df_imputed['Plays_racket'], errors='coerce')
df_imputed['Serial'] = df_imputed['Serial'].astype("string")


# Check the data types
print(df_imputed.dtypes)

Frequency_Racket          float64
Duration_Racket           float64
Mins_Badminton_Perweek    float64
Badminton_Duration        float64
Badminton_12_Momths       float64
Time                        int64
IMD                       float64
Serial                     string
LA                         object
Age                       float64
Child                      object
Diability                  object
Education                  object
Ethnicity                  object
Gender                     object
Workstatus                 object
BMI                        object
Badminton_Frequency        object
Plays_badminton           float64
Plays_racket              float64
dtype: object


## Data Encoding

In [18]:
df_imputed

Unnamed: 0,Frequency_Racket,Duration_Racket,Mins_Badminton_Perweek,Badminton_Duration,Badminton_12_Momths,Time,IMD,Serial,LA,Age,Child,Diability,Education,Ethnicity,Gender,Workstatus,BMI,Badminton_Frequency,Plays_badminton,Plays_racket
0,0.0,0.0,0.0,0.0,0.0,1,10.0,160480126774181.0,155.0,8.0,1.0,2.0,1.0,7.0,2.0,5.0,2.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1,6.0,151160011004881.0,123.0,6.0,1.0,2.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1,10.0,151160011005981.0,123.0,4.0,3.0,2.0,3.0,1.0,2.0,6.0,2.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1,9.0,151160011007481.0,123.0,7.0,1.0,2.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1,9.0,151160011007482.0,123.0,7.0,1.0,2.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117750,0.0,0.0,0.0,0.0,0.0,6,6.0,211090336822241.0,174.0,2.0,1.0,1.0,2.0,1.0,2.0,8.0,2.0,0.0,0.0,0.0
1117751,0.0,0.0,0.0,0.0,0.0,6,9.0,211090336822991.0,78.0,5.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0
1117752,0.0,0.0,0.0,0.0,0.0,6,5.0,211090336823721.0,78.0,6.0,1.0,2.0,3.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0
1117753,0.0,0.0,0.0,0.0,0.0,6,5.0,211090336823722.0,78.0,6.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0


In [19]:
#Encoding the data 
X = pd.get_dummies(df_imputed[[ 'Child','Diability','Education', 
                               'Ethnicity', 'Gender', 'Workstatus', 'BMI']],drop_first=True)



X["Time"] = df_imputed['Time']
X["IMD"] = df_imputed['IMD']
X['Age'] = df_imputed['Age']
#Standardising the data 
# Scale the data using StandardScaler
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

  X = pd.get_dummies(df_imputed[[ 'Child','Diability','Education',


In [20]:
X_scaled

Unnamed: 0,Child_2.0,Child_3.0,Child_4.0,Diability_2.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,Education_6.0,Ethnicity_2.0,Ethnicity_3.0,Ethnicity_4.0,Ethnicity_5.0,Ethnicity_6.0,Ethnicity_7.0,Gender_2.0,Gender_3.0,Workstatus_2.0,Workstatus_3.0,Workstatus_4.0,Workstatus_5.0,Workstatus_6.0,Workstatus_7.0,Workstatus_8.0,Workstatus_9.0,Workstatus_10.0,BMI_2.0,BMI_3.0,BMI_4.0,BMI_5.0,Time,IMD,Age
0,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,-0.436941,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,11.434535,0.89269,-0.037767,-0.431869,-0.118028,-0.120707,1.583657,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,-1.406496,1.543002,1.522406
1,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,-0.436941,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,-1.068711,1.541757,-0.399065,-0.12427,-1.406496,0.146220,0.407251
2,-0.360761,2.957257,-0.176585,0.454672,-0.401493,2.288636,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,0.89269,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,5.507812,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,-1.406496,1.543002,-0.707905
3,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,2.288636,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,-1.406496,1.193806,0.964828
4,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,2.288636,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,-1.406496,1.193806,0.964828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117750,-0.360761,-0.338151,-0.176585,-2.199386,2.490704,-0.436941,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,0.89269,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,5.326256,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,1.503443,0.146220,-1.823060
1117751,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,-0.436941,-0.142971,-0.222499,-0.294742,4.226031,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,1.503443,1.193806,-0.150327
1117752,-0.360761,-0.338151,-0.176585,0.454672,-0.401493,2.288636,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,-1.12021,-0.037767,-0.431869,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,-1.068711,1.541757,-0.399065,-0.12427,1.503443,-0.202976,0.407251
1117753,-0.360761,-0.338151,-0.176585,0.454672,2.490704,-0.436941,-0.142971,-0.222499,-0.294742,-0.236629,-0.208818,-0.122359,-0.075131,-0.108405,-0.087454,0.89269,-0.037767,2.315516,-0.118028,-0.120707,-0.631450,-0.181560,-0.158558,-0.187749,-0.051772,-0.166945,0.935707,-0.648611,-0.399065,-0.12427,1.503443,-0.202976,0.407251


## Logistic Regression

In [21]:
#performing logistic regression to explore the more simple patterns. 
# Separate the features and target variable

import statsmodels.api as sm

# Separate the features and target variable
X = X_scaled
y = df_imputed['Plays_badminton']

# Add constant to the features
X = sm.add_constant(X)

# Count the number of samples in each class
class_counts = df_imputed['Plays_badminton'].value_counts()

# Calculate inverse probability weights
weights = class_counts.sum() / (class_counts.shape[0] * class_counts)

# Create and fit logistic regression model with inverse probability weights
logit = sm.Logit(y, X)
result = logit.fit(method='bfgs', maxiter=100, disp=False, w=weights[y])

# Print summary of the model
print(result.summary())



                           Logit Regression Results                           
Dep. Variable:        Plays_badminton   No. Observations:              1117755
Model:                          Logit   Df Residuals:                  1117721
Method:                           MLE   Df Model:                           33
Date:                Sat, 05 Aug 2023   Pseudo R-squ.:                 0.05452
Time:                        01:52:51   Log-Likelihood:            -1.0451e+05
converged:                       True   LL-Null:                   -1.1054e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -4.1592      0.009   -481.574      0.000      -4.176      -4.142
Child_2.0           0.0199      0.007      3.067      0.002       0.007       0.033
Child_3.0           0.0451      

In [22]:
feature_names = feature_col.columns

# Create a new DataFrame to hold coefficients
coef_df = pd.DataFrame(columns=['Time'] + list(feature_names))

# Then inside your loop, instead of the results.append do:
coef_df = coef_df.append(pd.Series([time]+list(model.coef_), index=coef_df.columns), ignore_index=True)

# And for the combined model
coef_df = coef_df.append(pd.Series(['Combined']+list(model.coef_), index=coef_df.columns), ignore_index=True)

# Print the coefficients
print(coef_df)

AttributeError: 'Logit' object has no attribute 'coef_'

## Time Based and Time Combined Regression Analysis

## With class-Weights Applied 

In [None]:
# with year added in 

time_col = X_scaled['Time']
feature_col = X_scaled.drop(['Time'], axis=1)
target_col = df_imputed['Plays_badminton']

# Get the unique times
unique_times = time_col.unique()

# DataFrame to hold results
results = []

# Train-test split parameters
test_size = 0.2
random_state = 42

# Time-based regression
for time in unique_times:
    time_indices = time_col[time_col == time].index

    X_time = feature_col.loc[time_indices]
    y_time = target_col.loc[time_indices]

    # Calculate class weights
    class_weights = len(y_time) / (2 * np.bincount(y_time))

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_time, y_time, test_size=test_size, random_state=random_state
    )

    # Fit the model
    model = sm.Logit(y_train, sm.add_constant(X_train))
    results_time = model.fit(class_weight={0: class_weights[0], 1: class_weights[1]})

    # Store the results in a list
    results.append((time, results_time, X_test, y_test))

# Combined times regression
X_combined = X_scaled  # Including Time column for the combined model
y = target_col

# Calculate class weights
class_weights_combined = len(y) / (2 * np.bincount(y))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=test_size, random_state=random_state
)

# Fit the model
model = sm.Logit(y_train, sm.add_constant(X_train))
results_combined = model.fit(class_weight={0: class_weights_combined[0], 1: class_weights_combined[1]})

# Store the results for the combined model
results.append(("Combined", results_combined, X_test, y_test))

# Print results
for time, result, X_test, y_test in results:
    print(f"\nRegression Model for {time}:\n")
    print(result.summary())

    # Additional evaluation on the test set
    y_pred = result.predict(sm.add_constant(X_test))
    y_pred_binary = np.where(y_pred > 0.5, 1, 0)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)

    print("Test Set Evaluation:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")