In [1]:
## importing the libraries
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from sklearn.decomposition import PCA
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


## widen the pandas display limits (500 columns / 500 rows)
import os
pd.set_option('display.max_columns', 500, 'display.max_rows', 500)

## silence UserWarning / FutureWarning and pandas' chained-assignment check
import warnings
for _ignored_category in (UserWarning, FutureWarning):
    warnings.simplefilter("ignore", _ignored_category)
pd.options.mode.chained_assignment = None

In [2]:
## importing datasets
# master_data keeps every column; later cells read 'HCC' and 'BeneID' from it.
master_data = pd.read_csv(r"../data/processed_data/master_data_for_modelling.csv")

# Feature frame: everything after the first two columns, selected by position
# (presumably the identifier/target columns -- confirm against the CSV header).
# Take an explicit copy: the next cell assigns into `data`, and writing into a
# bare slice of master_data is what SettingWithCopyWarning guards against.
data = master_data.iloc[:, 2:].copy()

In [4]:
## cast the flag / category columns to str so they are handled as categoricals
categorical_cols = [
    # claim and demographic flags
    'flag_ip', 'Gender', 'Race', 'RenalDiseaseIndicator',
    # chronic-condition indicators
    'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
    'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
    'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
    'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
    # enrolment / status columns
    'Aged', 'community_institutional', 'Disability_condn', 'Benefits', 'medicaid_flag',
    # disease-interaction flags
    'Disease_intraction_DIABETES_HF', 'Disease_intraction_HF_CHR_LUNG',
    'Disease_intraction_HF_KIDNEY', 'Disease_intraction_CHR_LUNG_CARD_RESP_FAIL',
    'Disease_intraction_HF_HCC238', 'Disease_intraction_gSubUseDisorder_gPsych_',
    # disability-interaction flags
    'DISABLED_HF', 'DISABLED_ULCER_', 'DISABLED_CANCER', 'DISABLED_NEURO',
    'DISABLED_CHR_LUNG', 'disability',
]
# One astype call with a per-column dtype mapping; all other columns are untouched.
data = data.astype({column: str for column in categorical_cols})

In [5]:
## scaling and one-hot encoding of data

# Resolve the numeric / categorical column lists once so the transformer and
# the output column names are guaranteed to agree.
numeric_columns = data.select_dtypes(include=['int', 'float']).columns.tolist()
object_columns = data.select_dtypes(include=['object']).columns.tolist()

# Define the column transformer.  sparse_threshold=0 forces a dense ndarray so
# the result can always be wrapped in a DataFrame below, however sparse the
# one-hot block turns out to be.
ct = make_column_transformer(
    (StandardScaler(), numeric_columns),
    (OneHotEncoder(drop="first"), object_columns),
    sparse_threshold=0)

# Define the pipeline
pipeline = make_pipeline(ct)

# Transform the data
transformed_data = pipeline.fit_transform(data)

# Get the names of the encoded columns.  OneHotEncoder.get_feature_names() was
# removed in scikit-learn 1.2; prefer get_feature_names_out() and fall back to
# the old method only on releases that predate it.
onehot_encoder = pipeline.named_steps['columntransformer'].transformers_[1][1]
if hasattr(onehot_encoder, 'get_feature_names_out'):
    encoded_columns = onehot_encoder.get_feature_names_out(object_columns)
else:
    encoded_columns = onehot_encoder.get_feature_names(object_columns)

# Combine the column names (scaled numeric columns first, then one-hot columns,
# matching the order of the transformers above)
column_names = numeric_columns + encoded_columns.tolist()

# Convert the transformed data to a DataFrame with column names
transformed_data_df = pd.DataFrame(transformed_data, columns=column_names)


In [6]:
## fill the remaining missing values with each column's median
column_medians = transformed_data_df.median()
transformed_data_df = transformed_data_df.fillna(column_medians)

## Decision Tree

In [7]:
## finding top features based on decision tree
# Number of highest-importance features carried into the later cells.
N_TOP_FEATURES = 19

# NOTE(review): 'other_disease_mean_hcc_factor' is excluded from the candidates,
# presumably because it is derived from the target -- confirm.
X = transformed_data_df.drop(columns="other_disease_mean_hcc_factor")
y = master_data['HCC']

# Train the decision tree model.  A fixed random_state makes the importances,
# and therefore top_features, reproducible across kernel restarts.
# NOTE(review): the tree is fitted on all rows, including the rows that the
# random-forest cell later holds out for testing.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

# Get the feature importances
importances = model.feature_importances_

# Feature names paired with their importances, highest first.  drop=True keeps
# the pre-sort positions from being added as a stray 'index' column.
feature_importances = (
    pd.DataFrame({'feature': X.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
top_features = feature_importances['feature'][:N_TOP_FEATURES]

## random forest model

In [8]:
## creating a classification model on the top features to see its performance
X = transformed_data_df[top_features]
y = master_data['HCC']

# Split the data into train and test sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a random forest model with 100 trees.  The split above is seeded, so
# seed the forest as well; otherwise the printed metrics change on every run.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Use the trained model to predict the labels of the test set
y_pred = model.predict(X_test)

# Calculate the accuracy and the macro-averaged precision, recall and F1-score.
# zero_division=0 states explicitly that a class with no predicted (or no true)
# samples contributes 0 to the macro average.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
recall = recall_score(y_test, y_pred, average="macro", zero_division=0)
f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

# Print the results as percentages
print('Accuracy:', accuracy*100)
print('Precision:', precision*100)
print('Recall:', recall*100)
print('F1-score:', f1*100)

Accuracy: 17.637558551021595
Precision: 16.65748642251216
Recall: 6.052531765412421
F1-score: 7.96130067166186


Metrics with all features:

- Accuracy: 31.020968759367046
- Precision: 28.64776065297239
- Recall: 08.417712716824272
- F1-score: 10.492380559764748



## PCA

In [9]:
# Project the top features onto as many principal components as are needed to
# retain 80% of their variance (n_components=0.8 is a variance target, not a count)
pca_inputs = transformed_data_df[top_features]
pca = PCA(n_components=0.8)
factors_PCA = pca.fit_transform(pca_inputs)

In [10]:
# Per-component share of variance for the components kept by the PCA fitted above
pca.explained_variance_ratio_

array([0.17510434, 0.1270369 , 0.10549301, 0.10129828, 0.08093678,
       0.05449135, 0.04909844, 0.04720394, 0.0439949 , 0.03881596])

## LDA

In [11]:
# Collapse the top features into one supervised factor: a single linear
# discriminant component fitted against the HCC labels
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_inputs = transformed_data_df[top_features]
lda_target = master_data['HCC']
clf = LinearDiscriminantAnalysis(n_components=1)
factors_LDA = clf.fit(lda_inputs, lda_target).transform(lda_inputs)

In [12]:
## explained-variance ratio reported by the fitted LDA for its single retained component
## NOTE(review): for LDA this ratio is presumably relative to the discriminant
## components, not to the total variance of the raw top features -- confirm
## against the scikit-learn documentation before quoting it as "variance captured"
clf.explained_variance_ratio_

array([0.67402423])

## MCA

In [13]:
## using MCA (multiple correspondence analysis) to check the variance captured by one component
import prince
# Fit a one-component MCA on the one-hot encoded categorical columns.
# random_state pins the solver's random initialisation so that the eigenvalue
# summary below is the same on every run.
# NOTE(review): the input columns are already one-hot encoded (with drop="first")
# -- confirm this is the input layout prince.MCA is meant to receive.
mca = prince.MCA(n_components=1, n_iter=3, copy=True, check_input=True, engine='sklearn', random_state=42)
mca.fit(transformed_data_df[encoded_columns])

# Get the explained variance ratio
mca.eigenvalues_summary

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.187,8.79%,8.79%


## FAMD

In [15]:
## using FAMD to create a feature

import prince
# Fit the FAMD on the complete rows among the first 10,000 rows of the raw
# (unscaled, un-encoded) feature frame.  random_state makes the fit reproducible.
FAMD = prince.FAMD(n_components=1, n_iter=3, copy=True, check_input=True, engine='sklearn', random_state=42)
FAMD.fit(data.head(10000).dropna())
# Project every complete row with the FAMD model fitted above.  The original
# cell called mca.transform here, i.e. the MCA model from the previous section,
# which was fitted on a different (one-hot encoded) column set.
FAMD_transformed = FAMD.transform(data.dropna())

# Get the explained variance ratio
FAMD.eigenvalues_summary

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,44.409,8.15%,8.15%


## surprise SVD algorithm

In [16]:
# Add the factors to the data.  The LDA was fitted with n_components=1, so
# factors_LDA is a 2-D single-column array; flatten it so the column assignment
# receives a plain 1-D vector instead of relying on pandas to squeeze it.
master_data['factors'] = np.ravel(factors_LDA)

## selecting data in the format the Surprise library expects: one
## (user, item, rating) row per BeneID/HCC pair.  keep="first" retains the
## factor value from the first row seen for each pair.
recommender_data = master_data[['BeneID','HCC','factors']].drop_duplicates(subset=["BeneID","HCC"], keep = "first")

In [17]:
# Define the Reader: the rating scale spans the observed range of the factor
factor_values = recommender_data["factors"]
reader = Reader(rating_scale=(factor_values.min(), factor_values.max()))

# Load the data into Surprise's Dataset object (columns: user, item, rating)
recommender_data_surprise = Dataset.load_from_df(recommender_data[['BeneID','HCC','factors']], reader)

# Define the SVD algorithm.  Its factors are randomly initialised, so fix
# random_state to make the estimates in the next cell reproducible.
algo = SVD(random_state=42)

# Train the algorithm on the full data set (no hold-out split)
trainset = recommender_data_surprise.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22a9be190a0>

In [18]:
# Score every distinct HCC for one beneficiary and print the highest estimates
user_id = 'BENE100178'
num_recommendations = 10
candidate_items = recommender_data_surprise.df['HCC'].unique()
# (item, estimated rating) pairs ordered from highest to lowest estimate
recommendations = sorted(
    [(item_id, algo.predict(user_id, item_id).est) for item_id in candidate_items],
    key=lambda scored: scored[1],
    reverse=True,
)
top_recommendations = recommendations[:num_recommendations]
for scored_item in top_recommendations:
    print(scored_item)

("{'463'}", 0.5815363066627643)
("{'223'}", 0.5746145067284626)
("{'283'}", 0.49996631730888874)
("{'277'}", 0.4967712149625485)
("{'77'}", 0.4948164094379495)
("{'405'}", 0.4874632798403174)
("{'200'}", 0.4811086859409559)
("{'49'}", 0.4773960485615641)
("{'50'}", 0.47152438182298373)
("{'197'}", 0.4639417507447894)


In [19]:
# Show every master_data row belonging to the beneficiary scored above
master_data.loc[master_data["BeneID"] == "BENE100178"]

Unnamed: 0,BeneID,HCC,InscClaimAmtReimbursed,DeductibleAmtPaid,flag_ip,Gender,Race,RenalDiseaseIndicator,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age,Aged,claim_duration_num,community_institutional,Disability_condn,Benefits,medicaid_flag,Disease_intraction_DIABETES_HF,Disease_intraction_HF_CHR_LUNG,Disease_intraction_HF_KIDNEY,Disease_intraction_CHR_LUNG_CARD_RESP_FAIL,Disease_intraction_HF_HCC238,Disease_intraction_gSubUseDisorder_gPsych_,DISABLED_HF,DISABLED_ULCER_,DISABLED_CANCER,DISABLED_NEURO,DISABLED_CHR_LUNG,ACS_TOT_POP_WT_ZC,ACS_TOT_WORKER_HH_ZC,ACS_TOT_HH_ZC,WORKER_PER_HH,ACS_PCT_FEMALE_ZC,ACS_PCT_MALE_ZC,MALE_TO_FEMALE_RATIO,ACS_PCT_HH_LIMIT_ENGLISH_ZC,ACS_PCT_AGE_ABOVE65_ZC,ACS_PCT_AGE_ABOVE80_ZC,ACS_PCT_HH_NO_COMP_DEV_ZC,ACS_PCT_HH_SMARTPHONE_ZC,ACS_PCT_HH_PC_ZC,ACS_PCT_HH_NO_INTERNET_ZC,PCT_HAS_INTERNET,ACS_MEDIAN_HH_INC_ZC,ACS_PCT_INC50_ABOVE65_ZC,ACS_PCT_HEALTH_INC_BELOW137_ZC,ACS_PCT_HEALTH_INC_138_199_ZC,ACS_PCT_HEALTH_INC_200_399_ZC,ACS_PCT_HEALTH_INC_ABOVE400_ZC,ACS_PER_CAPITA_INC_ZC,ACS_PCT_COLLEGE_ASSOCIATE_DGR_ZC,ACS_PCT_BACHELOR_DGR_ZC,ACS_PCT_HS_GRADUATE_ZC,ACS_PCT_LT_HS_ZC,ACS_PCT_POSTHS_ED_ZC,ACS_PCT_HH_ABOVE65_ZC,ACS_PCT_HH_ALONE_ABOVE65_ZC,CEN_POPDENSITY_ZC,ACS_PCT_HU_NO_VEH_ZC,ACS_PCT_PUBL_TRANSIT_ZC,ACS_PCT_TAXICAB_2WORK_ZC,AVG_DIST_TO_MEDI_CARE,ACS_PCT_MEDICARE_ONLY_ZC,PCT_ANY_OTHER_INSUR,ACS_PCT_UNINSURED_ZC,disability,other_disease_mean_hcc_factor,factors
603931,BENE100178,{'225'},140,0.0,out,0,0,0,12,12,1,0,1,1,1,1,0,0,1,1,1,0,0,430,440,72.0,1,0,Institutional,Non_disabled,,0,0,0,0,0,0,0,0,0,0,0,0,1501.0,914.0,524.0,2.0,45.97,54.03,1.175332,0.0,11.06,0.8,1.34,91.79,90.65,4.58,95.42,102045.0,0.0,5.02,2.47,35.99,56.52,41871.0,34.45,23.65,31.31,4.42,64.28,19.66,2.67,92.62,1.15,0.0,0.0,8.55875,5.07,83.78,0.73,1,0.217,0.536411
603932,BENE100178,{'225'},140,0.0,out,0,0,0,12,12,1,0,1,1,1,1,0,0,1,1,1,0,0,430,440,71.0,1,0,Community,Non_originally_disabled,NonDual,0,0,0,0,0,0,0,1,0,0,0,0,2139.0,707.0,618.0,1.0,46.94,53.06,1.130379,0.0,15.15,5.05,19.74,67.48,58.58,28.96,71.04,43409.0,3.51,40.56,13.24,29.95,16.26,17373.0,28.77,12.93,36.28,15.93,47.79,34.14,9.71,14.13,7.61,2.86,1.36,22.82375,7.81,31.74,24.72,1,0.36,0.512781
603933,BENE100178,{'21'},10,0.0,out,0,0,0,12,12,1,0,1,1,1,1,0,0,1,1,1,0,0,430,440,72.0,1,0,Community,Non_disabled,PBDual,1,0,0,0,0,0,0,0,0,0,0,0,15051.0,6852.0,6316.0,1.0,49.09,50.91,1.037075,9.01,18.08,5.93,12.7,73.69,64.66,20.8,79.2,50340.0,9.54,26.14,8.15,42.72,22.99,27067.0,27.0,14.7,32.62,16.77,50.61,30.3,15.17,48.94,4.54,0.0,0.92,24.08,5.37,64.9,10.18,1,0.627,0.543838
603934,BENE100178,{'21'},10,0.0,out,0,0,0,12,12,1,0,1,1,1,1,0,0,1,1,1,0,0,430,440,71.0,1,0,Community,Non_disabled,FBDual,1,0,0,0,0,0,0,0,0,0,0,0,9293.0,4151.0,3884.0,1.0,48.7,51.3,1.053388,1.29,18.0,4.53,9.47,80.02,73.22,8.65,91.35,52390.0,4.24,19.91,13.44,31.5,35.16,29527.0,31.05,20.51,30.88,5.23,63.88,27.01,12.46,29.33,7.08,0.5,0.6,3.83375,5.01,65.6,3.38,1,0.654,0.539787
603935,BENE100178,{'94'},40,0.0,out,0,0,0,12,12,1,0,1,1,1,1,0,0,1,1,1,0,0,430,440,72.0,1,0,Community,Non_disabled,NonDual,0,0,0,0,0,0,0,0,0,0,0,0,2138.0,1062.0,906.0,1.0,51.78,48.22,0.931248,0.0,19.83,6.97,4.86,83.22,86.75,9.05,90.95,67414.0,5.94,12.11,9.39,35.08,43.42,35323.0,31.16,19.29,34.93,6.38,58.68,27.92,7.51,9.38,2.1,0.0,0.0,21.39,8.91,77.26,4.34,1,0.268,0.528356
603936,BENE100178,{'94'},40,0.0,out,0,0,0,12,12,1,0,1,1,1,1,0,0,1,1,1,0,0,430,440,71.0,1,0,Community,Non_disabled,PBDual,1,0,0,0,0,0,0,0,0,0,0,0,1603.0,459.0,754.0,1.0,51.28,48.72,0.950078,0.0,40.99,9.92,19.5,57.03,68.7,21.09,78.91,44881.0,1.2,19.93,20.53,33.03,26.51,29212.0,42.57,11.27,32.29,8.45,59.25,55.97,18.83,6.42,4.24,0.67,0.45,26.57125,7.86,57.11,3.01,1,0.224,0.515032


Since the model is not performing well, we will not use it for recommendations.

Possible reasons for the inaccuracy:

1. We are using only 19 features to derive the single master feature.
2. Even this master feature captures only about 67% of the variability (LDA explained-variance ratio of 0.674) in those 19 features, whereas there are 81 features in total.