In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw training set; 'train.csv' is resolved against the kernel's working directory.
train_df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Show the loaded training frame as this cell's output.
train_df

Unnamed: 0,Id,Therapy Hours,Initial Health Score,Lifestyle Activities,Average Sleep Hours,Follow-Up Sessions,Recovery Index
0,9255,5,49,No,7,5,36
1,1562,2,48,Yes,7,6,25
2,1671,2,81,No,7,2,59
3,6088,2,46,No,6,1,22
4,6670,8,47,No,9,0,40
...,...,...,...,...,...,...,...
7995,5735,8,50,Yes,6,6,48
7996,5192,4,68,No,9,3,51
7997,5391,9,48,No,7,6,44
7998,861,1,47,No,9,0,20


In [4]:
# One-hot encode the categorical column. drop_first=True drops the first
# category's indicator, leaving a single "Lifestyle Activities_Yes" column.
processed_df = pd.get_dummies(
    data=train_df,
    columns=["Lifestyle Activities"],
    drop_first=True,
)

In [5]:
# Target column, and the remaining columns with the target and "Id" removed.
labels = processed_df["Recovery Index"]
features = processed_df.drop(columns=["Recovery Index", "Id"])

Let us add some additional features in hopes of improving the performance of our model.

In [18]:
# Start from the encoded frame without "Id", then append the engineered columns.
# Each lambda receives the frame being built and reads only the original columns.
processed_df2 = processed_df.drop(columns="Id").assign(
    # Interaction features (pairwise products)
    therapy_x_health=lambda d: d['Therapy Hours'] * d['Initial Health Score'],
    therapy_x_followup=lambda d: d['Therapy Hours'] * d['Follow-Up Sessions'],
    sleep_x_health=lambda d: d['Average Sleep Hours'] * d['Initial Health Score'],
    therapy_x_sleep=lambda d: d['Therapy Hours'] * d['Average Sleep Hours'],
    # Ratio Features (the +1 keeps the denominator non-zero when a count is 0)
    therapy_by_followup=lambda d: d['Therapy Hours'] / (d['Follow-Up Sessions'] + 1),
    health_by_sleep=lambda d: d['Initial Health Score'] / (d['Average Sleep Hours'] + 1),
    therapy_by_sleep=lambda d: d['Therapy Hours'] / (d['Average Sleep Hours'] + 1),
    followup_by_therapy=lambda d: d['Follow-Up Sessions'] / (d['Therapy Hours'] + 1),
    # Polynomial Features (degree 2 for most important features)
    therapy_hours_squared=lambda d: d['Therapy Hours'] ** 2,
    health_score_squared=lambda d: d['Initial Health Score'] ** 2,
    sleep_hours_squared=lambda d: d['Average Sleep Hours'] ** 2,
    followup_squared=lambda d: d['Follow-Up Sessions'] ** 2,
)

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [20]:
# Hold out 11% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    processed_df2.drop(columns="Recovery Index"),
    processed_df2["Recovery Index"],
    test_size=0.11,
    random_state=73,
)

In [21]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [22]:
def calc_and_print_metrics(true_values, predictions, model_name):
    """Print R², MAE, MSE and RMSE for a set of predictions.

    Args:
        true_values: Ground-truth target values.
        predictions: Predicted values, compared against ``true_values``.
        model_name: Label used in the report header.

    Returns:
        None. The report is written to stdout, followed by a 40-dash rule.
    """
    squared_error = mean_squared_error(true_values, predictions)
    # All metrics are computed before anything is printed, so a failing
    # metric call never leaves a half-written report.
    report_lines = [
        f"{model_name} Performance Metrics:",
        f"R² Score: {r2_score(true_values, predictions):.4f}",
        f"Mean Absolute Error (MAE): {mean_absolute_error(true_values, predictions):.4f}",
        f"Mean Squared Error (MSE): {squared_error:.4f}",
        f"Root Mean Squared Error (RMSE): {np.sqrt(squared_error):.4f}",
        "-" * 40,
    ]
    for line in report_lines:
        print(line)

In [23]:
# Fit a linear regression on the training split, then report metrics on the held-out split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
calc_and_print_metrics(true_values=y_test, predictions=y_pred, model_name="Linear Regression")

Linear Regression Performance Metrics:
R² Score: 0.9903
Mean Absolute Error (MAE): 1.5306
Mean Squared Error (MSE): 3.6983
Root Mean Squared Error (RMSE): 1.9231
----------------------------------------


In [24]:
def convert_into_csv(og_df, predictions, filename):
    """Write a two-column submission file pairing each Id with its prediction.

    Args:
        og_df: Frame that provides the 'Id' column.
        predictions: Predicted values, stored under 'Recovery Index'.
        filename: Suffix for the output name; the file is written to
            ``submission_<filename>.csv`` in the current working directory.

    Returns:
        None. The CSV is written without the index column.
    """
    submission_columns = {'Id': og_df['Id'], 'Recovery Index': predictions}
    output_path = f"submission_{filename}.csv"
    pd.DataFrame(submission_columns).to_csv(output_path, index=False)

In [25]:
test_df = pd.read_csv('test.csv')
# Encode the test set with the categories observed in the training data.
# get_dummies on a plain object column derives its categories from the frame it
# is given, so with drop_first=True a test file that is missing a category would
# produce a different set of dummy columns than the model was trained on.
# Pinning the dtype makes the encoded columns match the training frame.
lifestyle_dtype = pd.CategoricalDtype(
    sorted(train_df["Lifestyle Activities"].dropna().unique())
)
processed_test_df = pd.get_dummies(
    test_df.astype({"Lifestyle Activities": lifestyle_dtype}),
    columns=["Lifestyle Activities"],
    drop_first=True,
)
processed_test_df2 = processed_test_df.copy()

In [26]:
# Append the same engineered columns that the training frame received.
# Each lambda receives the frame being built and reads only the original columns.
processed_test_df2 = processed_test_df2.assign(
    # Interaction features (pairwise products)
    therapy_x_health=lambda d: d['Therapy Hours'] * d['Initial Health Score'],
    therapy_x_followup=lambda d: d['Therapy Hours'] * d['Follow-Up Sessions'],
    sleep_x_health=lambda d: d['Average Sleep Hours'] * d['Initial Health Score'],
    therapy_x_sleep=lambda d: d['Therapy Hours'] * d['Average Sleep Hours'],
    # Ratio Features (the +1 keeps the denominator non-zero when a count is 0)
    therapy_by_followup=lambda d: d['Therapy Hours'] / (d['Follow-Up Sessions'] + 1),
    health_by_sleep=lambda d: d['Initial Health Score'] / (d['Average Sleep Hours'] + 1),
    therapy_by_sleep=lambda d: d['Therapy Hours'] / (d['Average Sleep Hours'] + 1),
    followup_by_therapy=lambda d: d['Follow-Up Sessions'] / (d['Therapy Hours'] + 1),
    # Polynomial Features (degree 2 for most important features)
    therapy_hours_squared=lambda d: d['Therapy Hours'] ** 2,
    health_score_squared=lambda d: d['Initial Health Score'] ** 2,
    sleep_hours_squared=lambda d: d['Average Sleep Hours'] ** 2,
    followup_squared=lambda d: d['Follow-Up Sessions'] ** 2,
)

In [27]:
# Show the first engineered training row to check the column layout.
processed_df2.head(n=1)

Unnamed: 0,Therapy Hours,Initial Health Score,Average Sleep Hours,Follow-Up Sessions,Recovery Index,Lifestyle Activities_Yes,therapy_x_health,therapy_x_followup,sleep_x_health,therapy_x_sleep,therapy_by_followup,health_by_sleep,therapy_by_sleep,followup_by_therapy,therapy_hours_squared,health_score_squared,sleep_hours_squared,followup_squared
0,5,49,7,5,36,False,245,25,343,35,0.833333,6.125,0.625,0.833333,25,2401,49,25


In [29]:
# Show the first engineered test row to compare its column layout with the training frame.
processed_test_df2.head(n=1)

Unnamed: 0,Id,Therapy Hours,Initial Health Score,Average Sleep Hours,Follow-Up Sessions,Lifestyle Activities_Yes,therapy_x_health,therapy_x_followup,sleep_x_health,therapy_x_sleep,therapy_by_followup,health_by_sleep,therapy_by_sleep,followup_by_therapy,therapy_hours_squared,health_score_squared,sleep_hours_squared,followup_squared
0,6253,5,69,8,2,False,345,10,552,40,1.666667,7.666667,0.555556,0.333333,25,4761,64,4


In [30]:
# Remove the identifier column before prediction. Re-binding with
# errors="ignore" (instead of an in-place drop) keeps this cell safe to re-run:
# a second in-place drop would raise KeyError because "Id" is already gone.
processed_test_df2 = processed_test_df2.drop(columns="Id", errors="ignore")

In [31]:
# Predict on exactly the columns the model was fitted on, in the same order.
# Selecting by X_train.columns means leftover columns (such as "Id") are never
# passed to the model, and a missing engineered column fails here with a
# KeyError that names it.
test_preds = model.predict(processed_test_df2[X_train.columns])
convert_into_csv(test_df, test_preds, "linear_regression_feature_engineering")