In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [2]:
# Load the pre-selected feature table into the working frame `df`.
# The path is relative to the directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer='Data/selected_features.csv')

In [3]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,NACCID,CDRSUM,DECIN,MOSLOW,MEMORY,SPEECH,BILLS,TRAVEL,MOFALLS,MOTREM,...,TOBAC30,TOBAC100,SMOKYRS,PACKSPER,QUITSMOK,ALCOHOL,DIABETES,HYPERTEN,CVHATT,SEX
0,NACC235922,3.5,1,0,1.0,-4,1,0,0,0,...,0,1,22,2,40,0,0,1,0,2
1,NACC383757,0.5,8,0,0.5,-4,0,0,0,0,...,0,1,15,5,28,2,0,1,0,1
2,NACC418253,2.0,1,0,1.0,-4,0,0,0,0,...,0,1,10,2,30,0,0,0,0,2
3,NACC433163,0.5,1,0,0.5,-4,8,0,0,1,...,0,1,2,2,17,0,0,0,0,1
4,NACC476187,5.0,1,0,1.0,-4,3,3,0,0,...,0,1,25,2,45,0,0,1,0,1


In [5]:
# Remove columns that must not be used as predictors:
#   - 'NACCID' holds identifier strings (e.g. 'NACC235922' in the preview).
#   - 'Unnamed: 0' is the name pandas gives a CSV column with an empty header;
#     here it looks like a row index written out by an earlier to_csv call
#     (TODO confirm). Left in place it is fed to every model as a feature.
# errors='ignore' makes the cell idempotent: re-running it once the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=['NACCID', 'Unnamed: 0'], errors='ignore')

## Linear Regression

In [7]:
# Columns to dummy-code. The order is kept as-is because it determines the
# order of the generated indicator columns (and so of the fitted coefficients).
# NOTE(review): the preview shows codes such as -4 and 8 in some of these
# columns; presumably missing/unknown sentinels, which become their own
# indicator levels here. Confirm against the data dictionary.
categorical_columns = [
    'SEX', 'TOBAC30', 'TOBAC100', 'ALCOHOL', 'DIABETES',
    'HYPERTEN', 'CVHATT', 'DECIN', 'MOSLOW', 'MEMORY',
    'SPEECH', 'BILLS', 'TRAVEL', 'MOFALLS', 'MOTREM',
]

# drop_first=True keeps k-1 indicators per variable; the dropped first level
# acts as the reference category.
df_encoded = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
)

In [10]:
# CDRSUM is the regression target; every other encoded column is a predictor.
X = df_encoded.drop('CDRSUM', axis=1)
y = df_encoded['CDRSUM']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a linear regression on the training portion.
# fit() returns the estimator itself, so `model` is the fitted object.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows, consumed by the evaluation cell below.
predictions = model.predict(X_test)

In [15]:
# Fitted parameters of the linear model.
print('model intercept :', model.intercept_)
print('model coefficients : ', model.coef_)

# Held-out error metrics: computed together, then reported in MSE, R-squared, MAE order.
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error (MSE):", mse)
print("R-squared:", r2)
print("Mean Absolute Error (MAE):", mae)

model intercept : -0.6478909571391449
model coefficients :  [-4.92143018e-03  2.30510592e-03  3.36029315e-03  6.38953729e-03
 -1.85078725e-03 -6.73891172e-02 -1.52729421e-02  2.42826573e-02
  1.06026135e+00  1.23399245e+00  1.15816289e-01  9.00604315e-02
 -6.68366980e-01  1.43187531e-02  1.19899009e-01 -1.85863447e-01
 -8.99719864e-03 -2.91861827e-03  2.82636735e-01 -2.49965792e-01
 -6.44238763e-03 -3.65654584e-01  1.38055139e-01 -1.53682661e-02
  6.69278757e-03  3.27659648e-01 -1.22924783e-01  7.62181583e-01
  2.78859383e+00  6.68046904e+00  1.19121448e+01  4.43175975e-02
  8.49283917e-02  6.50359197e-01  1.62483121e+00  2.99179363e+00
  1.79084529e+00 -3.00287251e-02  5.59773103e-02  3.85432470e-01
  1.11253406e+00  4.49824516e-01 -4.41892092e-02 -1.12589757e-01
  1.26846058e-01  5.44820099e-01  1.60634405e+00  2.33514134e-01
 -4.69384166e-01  5.79410179e-01  5.29548656e-01 -1.53173162e-01
 -4.11361988e-01]
Mean Squared Error (MSE): 0.7459778745788452
R-squared: 0.9130803861566442
Me

## Random Forest

### Random Forest: 100 trees

In [22]:
# The forest is trained on `df` directly; no dummy coding is applied here.
X2 = df.drop('CDRSUM', axis=1)
y2 = df['CDRSUM']

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.3, random_state=42
)

# Random forest with 100 trees; random_state pins the estimator's randomness.
model2 = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model2.fit(X_train2, y_train2)

# Held-out predictions.
# NOTE: this rebinds `predictions`, replacing whatever an earlier cell stored there.
predictions = model2.predict(X_test2)

In [24]:
# Score the 100-tree forest on the held-out rows, reporting each metric as it is computed.
mse_2 = mean_squared_error(y_test2, predictions)
print("Mean Squared Error (MSE):", mse_2)

r2_2 = r2_score(y_test2, predictions)
print("R-squared:", r2_2)

mae_2 = mean_absolute_error(y_test2, predictions)
print("Mean Absolute Error (MAE):", mae_2)

Mean Squared Error (MSE): 0.422129687119552
R-squared: 0.9508144267456152
Mean Absolute Error (MAE): 0.2566581446311176


### Random Forest: 150 trees

In [30]:
# Split X2 / y2 again using the same test_size and seed as the 100-tree cell.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X2,
    y2,
    random_state=42,
    test_size=0.3,
)

# Random forest with 150 trees, same seed; fit() returns the estimator itself.
model3 = RandomForestRegressor(random_state=42, n_estimators=150).fit(X_train3, y_train3)

# Held-out predictions (rebinds `predictions` once more).
predictions = model3.predict(X_test3)

In [31]:
# Score the 150-tree forest on the held-out rows, reporting each metric as it is computed.
mse_3 = mean_squared_error(y_test3, predictions)
print("Mean Squared Error (MSE):", mse_3)

r2_3 = r2_score(y_test3, predictions)
print("R-squared:", r2_3)

mae_3 = mean_absolute_error(y_test3, predictions)
print("Mean Absolute Error (MAE):", mae_3)

Mean Squared Error (MSE): 0.42130975840705576
R-squared: 0.9509099629397811
Mean Absolute Error (MAE): 0.2561967372778184
