In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
train_df = pd.read_csv('train.csv')

In [4]:
train_df

Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,9255,5,49,No,7,5,36
1,1562,2,48,Yes,7,6,25
2,1671,2,81,No,7,2,59
3,6088,2,46,No,6,1,22
4,6670,8,47,No,9,0,40
...,...,...,...,...,...,...,...
7995,5735,8,50,Yes,6,6,48
7996,5192,4,68,No,9,3,51
7997,5391,9,48,No,7,6,44
7998,861,1,47,No,9,0,20


In [5]:
processed_df = pd.get_dummies(train_df, columns=["Lifestyle Activities"], drop_first=True)

In [18]:
labels = processed_df["Recovery Index"]
features = processed_df.drop("Recovery Index", axis=1)
features.drop("Id", axis=1, inplace=True)

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error

In [19]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=0)

Making a basic Linear Regression Model to predict Recovery Index based on other features in the dataset.

In [22]:
model = LinearRegression()
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [21]:
y_test_pred = model.predict(X_test)

In [11]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [12]:
def calc_and_print_metrics(true_values, predictions, model_name):
    r2 = r2_score(true_values, predictions)
    mae = mean_absolute_error(true_values, predictions)
    mse = mean_squared_error(true_values, predictions)
    rmse = np.sqrt(mse)
    
    print(f"{model_name} Performance Metrics:")
    print(f"R² Score: {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print("-" * 40)

In [13]:
def convert_into_csv(og_df, predictions, filename):
    submission_df = pd.DataFrame({'Id': og_df['Id'], 'Recovery Index': predictions})
    submission_df.to_csv(f"submission_{filename}.csv", index=False)

In [23]:
calc_and_print_metrics(y_test, y_test_pred, "Normal Linear Regression")

Normal Linear Regression Performance Metrics:
R² Score: 0.9879
Mean Absolute Error (MAE): 1.6578
Mean Squared Error (MSE): 4.4117
Root Mean Squared Error (RMSE): 2.1004
----------------------------------------


In [24]:
X_train.head()

Unnamed: 0,Therapy Hours,Initial Health Score,Average Sleep Hours,Follow-Up Sessions,Lifestyle Activities_Yes
1001,6,90,9,3,True
7360,2,96,6,3,True
5234,8,99,5,8,False
7390,2,50,7,5,True
6841,4,65,7,3,True


In [25]:
test_df = pd.read_csv('test.csv')
test_processed_df = pd.get_dummies(test_df, columns=["Lifestyle Activities"], drop_first=True)
test_processed_df.drop("Id", axis=1, inplace=True)
test_processed_df.head()

Unnamed: 0,Therapy Hours,Initial Health Score,Average Sleep Hours,Follow-Up Sessions,Lifestyle Activities_Yes
0,5,69,8,2,False
1,2,46,4,8,True
2,7,56,7,5,True
3,6,42,8,5,True
4,7,53,4,6,False


In [26]:
test_preds = model.predict(test_processed_df)
convert_into_csv(test_df, test_preds, "linear_regression")

Trying Polynomial Features to see if it improves the model performance.

In [27]:
polynomial_features = PolynomialFeatures(degree=2)

X_poly_train = polynomial_features.fit_transform(X_train)
X_poly_test = polynomial_features.transform(X_test)

polynomial_model = LinearRegression()
polynomial_model.fit(X_poly_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [28]:
y_poly_test_pred = polynomial_model.predict(X_poly_test)

In [29]:
calc_and_print_metrics(y_test, y_poly_test_pred, "Polynomial Regression (Degree 2)")

Polynomial Regression (Degree 2) Performance Metrics:
R² Score: 0.9879
Mean Absolute Error (MAE): 1.6586
Mean Squared Error (MSE): 4.4220
Root Mean Squared Error (RMSE): 2.1028
----------------------------------------


As we can see, the Polynomial Regression model does not perform better than the basic Linear Regression model on our dataset.

As the recovery index is very much correlated with Inital Health Index and Therapy Hours, let us try training on only these two features.

In [30]:
X_simple = processed_df[['Initial Health Score', 'Therapy Hours']]

In [31]:
X_simple_train, X_simple_test, y_simple_train, y_simple_test = train_test_split(X_simple, labels, test_size=0.2, random_state=0)

In [32]:
model_simple = LinearRegression()
model_simple.fit(X_simple_train, y_simple_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [33]:
test_preds_simple = model_simple.predict(X_simple_test)
calc_and_print_metrics(y_simple_test, test_preds_simple, "Simple Linear Regression with 2 features")

Simple Linear Regression with 2 features Performance Metrics:
R² Score: 0.9849
Mean Absolute Error (MAE): 1.8577
Mean Squared Error (MSE): 5.4884
Root Mean Squared Error (RMSE): 2.3427
----------------------------------------


AS you can see, it performed poorly

Trying StandardScaler on the features to see if it improves model performance.

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
scaled_model = LinearRegression()
scaled_model.fit(X_train_scaled, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [36]:
y_pred_scaled = scaled_model.predict(X_test_scaled)

In [37]:
calc_and_print_metrics(y_test, y_pred_scaled, "Linear Regression with StandardScaler")

Linear Regression with StandardScaler Performance Metrics:
R² Score: 0.9879
Mean Absolute Error (MAE): 1.6578
Mean Squared Error (MSE): 4.4117
Root Mean Squared Error (RMSE): 2.1004
----------------------------------------


No betterment seen with StandardScaler.

Trying to alter random state in train-test-split to see if it improves model performance.

In [39]:
best_rs = -1
best_mse = float('inf')

for rs in range(0, 101):
    X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(features, labels, test_size=0.2, random_state=rs)
    model_rs = LinearRegression()
    model_rs.fit(X_train_rs, y_train_rs)
    y_test_rs_pred = model_rs.predict(X_test_rs)
    mse_rs = mean_squared_error(y_test_rs, y_test_rs_pred)
    
    if mse_rs < best_mse:
        best_mse = mse_rs
        best_rs = rs

print(f"Best Random State: {best_rs} with MSE: {best_mse:.4f}")

Best Random State: 49 with MSE: 3.8690


In [40]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(features, labels, test_size=0.2, random_state=49)

In [41]:
model_49 = LinearRegression()
model_49.fit(X_train2, y_train2)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [42]:
y_preds_49 = model_49.predict(X_test2)
calc_and_print_metrics(y_test2, y_preds_49, "Linear Regression with Random State 49")

Linear Regression with Random State 49 Performance Metrics:
R² Score: 0.9894
Mean Absolute Error (MAE): 1.5679
Mean Squared Error (MSE): 3.8690
Root Mean Squared Error (RMSE): 1.9670
----------------------------------------


In [43]:
test_preds_49 = model_49.predict(test_processed_df)
convert_into_csv(test_df, test_preds_49, "linear_regression_rs49")