## Note: Due to privacy reasons, Output of this file will not be displayed ##

In [None]:
import pandas as pd
import numpy as np

## 1.  Import Average Monthly Engagement Data ##

In [None]:
# Load the engagement ground truth, normalise the patient-id column name and
# discard the "Unnamed..." columns.
engagement_level_file = pd.read_csv("data/groundTruth.csv")
engagement_level_file = engagement_level_file.rename(columns={'IdPaciente' : 'Patient ID' })
engagement_level_file = engagement_level_file.loc[:, ~engagement_level_file.columns.str.contains('^Unnamed')]

# 'predict' is the engagement level of the previous row of the SAME patient.
# Shifting inside each patient group guarantees a value never leaks from one
# patient into the next.
# NOTE(review): assumes each patient's rows are already in chronological
# order in the CSV - confirm, or sort by Year/Month first.
engagement_level_file['predict'] = (
    engagement_level_file.groupby('Patient ID')['Level_of_Engagement'].shift(1)
)

# The first row of every patient has no previous month, so it is removed.
# Selecting by position within the group replaces the former
# append() + drop_duplicates(keep=False) trick: DataFrame.append no longer
# exists in pandas >= 2.0, and drop_duplicates(keep=False) also discarded any
# rows that happened to be genuine duplicates of each other.
first_row_of_patient = engagement_level_file.groupby('Patient ID').cumcount() == 0
engagement_level_file = engagement_level_file[~first_row_of_patient]

engagement_level_file

## 2. Model 1: using last month's engagement and lab findings to predict next month's engagement ##

### Import lab findings predictors ###

In [None]:
# Read the lab-finding predictors and discard the "Unnamed..." columns.
predictors = pd.read_csv("data/pyschData.csv")
_unnamed_columns = predictors.columns.str.contains('^Unnamed')
predictors = predictors.loc[:, ~_unnamed_columns]

# Keep only the patient/month combinations present in both tables, then
# drop every row that still has a missing value.
result = predictors.merge(
    engagement_level_file,
    on=['Patient ID', 'Year', 'Month'],
    how="inner",
)
result = result.dropna()

# Display only: sort_values returns a sorted copy, `result` itself is unchanged.
result.sort_values('Patient ID')


In [None]:
# Installs plotly into the environment the notebook kernel runs in.
# NOTE(review): no cell visible in this file imports plotly - confirm the
# install is still needed.
pip install plotly

In [None]:
import matplotlib.pyplot as plt

# Marker styling. colors2 is defined for a second series but is not used by
# the scatter call below.
area = np.pi * 16
colors1 = '#00CED1'
colors2 = '#DC143C'

# Axis titles and fixed axis ranges (x: 0..12, y: 0..2).
plt.xlabel('engagement Level')
plt.ylabel('neuropathy')
plt.xlim(0, 12)
plt.ylim(0, 2)

# One semi-transparent marker per row: engagement level against PERIFERICA.
plt.scatter(
    x=result['Level_of_Engagement'],
    y=result['PERIFERICA'],
    s=area,
    c=colors1,
    alpha=0.4,
)

plt.show()

In [None]:
# Summary statistics of the engagement level for each PERIFERICA value.
result.groupby('PERIFERICA')[['Level_of_Engagement']].describe()

In [None]:
# Summary statistics of the engagement level for each Neuropathy value.
result.groupby('Neuropathy')[['Level_of_Engagement']].describe()

In [None]:
# Summary statistics of the engagement level for each VISCERAL value.
result.groupby('VISCERAL')[['Level_of_Engagement']].describe()

In [None]:
# Summary statistics of the engagement level for each AUTONOMICA value.
result.groupby('AUTONOMICA')[['Level_of_Engagement']].describe()

### Among the lab findings, Neuropathy, PERIFERICA and VISCERAL are the most unlike each other, which indicates they are good features to include ###

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt

# Correlation matrix of the merged lab-findings/engagement table, drawn as an
# annotated heatmap.
# Only numeric and boolean columns are passed to corr(): on pandas >= 2.0 a
# bare DataFrame.corr() no longer drops non-numeric columns silently and
# raises instead. On older pandas the selection gives the same matrix.
corrMatrix = result.select_dtypes(include=['number', 'bool']).corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

###  Select PERIFERICA, VISCERAL,  AUTONOMICA and last month engagement as features ###

In [None]:
# Target: the current month's engagement level.
y = result['Level_of_Engagement']

# Inputs: three lab findings plus 'predict' (the previous row's engagement).
labFinding_features = [
    'PERIFERICA',
    'VISCERAL',
    'AUTONOMICA',
    'predict',
]
X = result[labFinding_features]

### Model : Lab findings and previous month engagement predicting current month engagement using Support Vector Machine ###

In [None]:
#Support Vector Machine
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.02, kernel = 'rbf'))
regr.fit(train_X, train_y)
val_predictions = regr.predict(val_X)
print( "Mean absolute error: " + str(mean_absolute_error(val_y, val_predictions)))

### Visualize accuracy of prediction: Heatmap of predicted engagement vs actual engagement ###
Cells where the predicted value matches the actual value light up. An accurate model should light up the blocks along the diagonal from bottom left to top right. Our model did a good job of predicting engagement level because a good portion of the diagonal lights up.

In [None]:
# 2-D histogram of actual (x) versus predicted (y) engagement on the
# validation set.
# Both axes are binned over the SAME value range. With independent ranges the
# bin edges of the two axes differ, so a cell on the diagonal would not
# correspond to "predicted == actual".
engagement_min = min(np.min(val_y), np.min(val_predictions))
engagement_max = max(np.max(val_y), np.max(val_predictions))
shared_range = [[engagement_min, engagement_max], [engagement_min, engagement_max]]

heatmap, xedges, yedges = np.histogram2d(val_y, val_predictions, bins=10, range=shared_range)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

plt.clf()
# histogram2d puts x along the first array axis; transpose so that x runs
# horizontally in the image, and put the origin at the bottom left.
plt.imshow(heatmap.T, extent=extent, origin='lower', cmap='hot')
plt.xlabel('Actual engagement')
plt.ylabel('Predicted engagement')
plt.show()

## 3. Model 2: Lifestyle, diet, and previous month engagement predicting current month engagement using Decision Tree ##

In [None]:
# Load lifestyle/diet data; "?" entries in the CSV are read as missing values.
data4 = pd.read_csv("data/lifestyle_and_diet.csv", na_values="?")
data4 = data4.rename(columns={'Patient Id': 'Patient ID'})

# Keep only the patient id and the four lifestyle columns used below.
_lifestyle_columns = ['Patient ID', 'Total Energy Expenditure', 'Age', 'BMI', 'Calories']
data4 = data4[_lifestyle_columns]

# Attach the lifestyle columns to every engagement row of the same patient.
lifeStyleData = data4.merge(engagement_level_file, on=['Patient ID'], how="inner")

# Display only: sort_values returns a sorted copy.
lifeStyleData.sort_values('Patient ID')

In [None]:
# Correlation matrix of the lifestyle/engagement table, drawn as an annotated
# heatmap.
# Only numeric and boolean columns are passed to corr(): on pandas >= 2.0 a
# bare DataFrame.corr() no longer drops non-numeric columns silently and
# raises instead. On older pandas the selection gives the same matrix.
corrMatrix = lifeStyleData.select_dtypes(include=['number', 'bool']).corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

###  Select Total Energy Expenditure, Age, BMI and last month engagement as features ###

In [None]:
# Inputs: three lifestyle columns plus 'predict' (the previous row's engagement).
bodyFeatures = [
    'Total Energy Expenditure',
    'Age',
    'BMI',
    'predict',
]
X2 = lifeStyleData[bodyFeatures]

# Target: the current month's engagement level.
y2 = lifeStyleData['Level_of_Engagement']

### Build the model with Decision Tree ###

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
train_X2, val_X2, train_y2, val_y2 = train_test_split(X2, y2, random_state = 1)
# Define model
lab_finding_predict = DecisionTreeRegressor()
lab_finding_predict.fit(train_X2, train_y2)

val_predictions2 = lab_finding_predict.predict(val_X2)
print("Mean absolute error " + str(mean_absolute_error(val_y2, val_predictions2)))

### Visualize accuracy of prediction: Heatmap of predicted engagement vs actual engagement ###
This model also showed significant improvement over the baseline mean absolute error

In [None]:
# 2-D histogram of actual (x) versus predicted (y) engagement on the
# validation set of the lifestyle model.
# Both axes are binned over the SAME value range. With independent ranges the
# bin edges of the two axes differ, so a cell on the diagonal would not
# correspond to "predicted == actual".
engagement_min = min(np.min(val_y2), np.min(val_predictions2))
engagement_max = max(np.max(val_y2), np.max(val_predictions2))
shared_range = [[engagement_min, engagement_max], [engagement_min, engagement_max]]

heatmap, xedges, yedges = np.histogram2d(val_y2, val_predictions2, bins=10, range=shared_range)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

plt.clf()
# histogram2d puts x along the first array axis; transpose so that x runs
# horizontally in the image, and put the origin at the bottom left.
plt.imshow(heatmap.T, extent=extent, origin='lower', cmap='hot')
plt.xlabel('Actual engagement')
plt.ylabel('Predicted engagement')
plt.show()

## 4. Appendix ##

## Below are the models and features we tried but did not find very useful, as they perform worse than the baseline prediction ##

### Data import ###

- patient demographics

In [None]:
# Load patient demographics; "?" entries in the CSV are read as missing values.
data6 = pd.read_csv("data/patient_demographics.csv", na_values="?")

# Recode the sex and marital-status labels as integer categories.
# The replacements run one at a time, in this order, over the whole frame, so
# any cell equal to one of these labels is recoded whatever column it is in.
_label_codes = (
    ('H', 0),
    ('M', 1),
    ('SOLTERÍA', 0),
    ('MATRIMONIO', 1),
    ('UNIÓN LIBRE', 2),
    ('VIUDEZ', 3),
    ('DIVORCIO', 4),
)
for _label, _code in _label_codes:
    data6 = data6.replace(_label, _code)

data6

- patient geography

In [None]:
# Load patient geography; "?" entries in the CSV are read as missing values.
_geography_csv = "data/patient_geography.csv"
data7 = pd.read_csv(_geography_csv, na_values="?")
data7

### merge engagement predictor ###

In [None]:
# Attach the geography columns to every engagement row of the same patient.
model2Data = data7.merge(engagement_level_file, on=['Patient ID'], how="inner")

# NOTE(review): this re-drops NaN rows from `result`, not from model2Data.
# `result` already had its NaN rows dropped where it was built, so this looks
# like a leftover from an earlier experiment - confirm before removing.
result = result.dropna()

# Display only: sort_values returns a sorted copy.
model2Data.sort_values('Patient ID')

In [None]:
# Attach the demographic columns to every engagement row of the same patient.
model3Data = data6.merge(engagement_level_file, on=['Patient ID'], how="inner")

# Display only: sort_values returns a sorted copy.
model3Data.sort_values('Patient ID')

In [None]:
# Attach the lifestyle columns to every engagement row of the same patient.
model4Data = data4.merge(engagement_level_file, on=['Patient ID'], how="inner")

# Display only: sort_values returns a sorted copy.
model4Data.sort_values('Patient ID')

### Decision Tree on Lab findings - performance not as good as Support Vector Machine ###

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)
# Define model
lab_finding_predict = DecisionTreeRegressor()
lab_finding_predict.fit(train_X, train_y)

val_predictions = lab_finding_predict.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

### Linear Regression on Lab findings - performance not as good as Support Vector Machine ###

In [None]:
#Linear Regression
from sklearn.linear_model import LinearRegression
lab_finding_predict = LinearRegression()
lab_finding_predict.fit(train_X, train_y)

val_predictions = lab_finding_predict.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

### Decision tree model based on Patient demographics - worse error compared to baseline ###

In [None]:
# Inputs: three demographic columns plus 'predict' (the previous row's engagement).
demographicFeatures = [
    'Sex',
    'Marital status',
    'YearBirth',
    'predict',
]
# labFinding_features = ['ApoyoSocialEmocional']
X3 = model3Data[demographicFeatures]

# Target: the current month's engagement level.
y3 = model3Data['Level_of_Engagement']

In [None]:
#Decision Tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
train_X, val_X, train_y, val_y = train_test_split(X3, y3, random_state = 1)
# Define model
lab_finding_predict = DecisionTreeRegressor()
lab_finding_predict.fit(train_X, train_y)

val_predictions = lab_finding_predict.predict(val_X)
print("Mean absolute error: " + str(mean_absolute_error(val_y, val_predictions)))

### Linear regression model based on Patient demographics - worse error compared to baseline ###

In [None]:
#Linear Regression
from sklearn.linear_model import LinearRegression
lab_finding_predict = LinearRegression()
lab_finding_predict.fit(train_X, train_y)

val_predictions = lab_finding_predict.predict(val_X)
print("Mean absolute error: " + str(mean_absolute_error(val_y, val_predictions)))

### Linear regression model for Patient Lifestyle ###

In [None]:
# Inputs: three lifestyle columns plus 'predict' (the previous row's engagement).
bodyFeatures = [
    'Total Energy Expenditure',
    'Age',
    'BMI',
    'predict',
]
# labFinding_features = ['ApoyoSocialEmocional']
X4 = model4Data[bodyFeatures]

In [None]:
#Linear Regression
from sklearn.linear_model import LinearRegression
lab_finding_predict = LinearRegression()
lab_finding_predict.fit(train_X, train_y)

val_predictions = lab_finding_predict.predict(val_X)
print("Mean absolute error: " + str(mean_absolute_error(val_y, val_predictions)))

### All features above were also put in Support Vector Machine model, and the model did not converge ###