In [1]:
## importing the libraries
from surprise import Dataset, Reader, SVD
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## setting the maximum columns and rows
import os
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## removing warnings
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

In [2]:
# Load the data
master_data = pd.read_csv(r"../data/processed_data/master_data_for_modelling.csv")
df_features = pd.read_excel(r"C:../data/processed_data/all_features_for_modelling.xlsx")
df_hcc_description = pd.read_excel("..\data\PY 2024 Proposed Clinical Revision Part C Model ICD-10 Mappings.xlsx"
                             ,skiprows=3,dtype = {'2020_CMS-HCC': str, '2024_CMS-HCC': str})
df_hcc_description = df_hcc_description[["2024_CMS-HCC","Description"]].rename(columns=
                                                                               {"2024_CMS-HCC":"HCC_code","Description":"HCC_Description"})
data = master_data.iloc[:,2:]

In [3]:
## converting categorical columns into strings
categorical_cols = ['flag_ip','Gender','Race', 'RenalDiseaseIndicator','ChronicCond_Alzheimer','ChronicCond_Heartfailure','ChronicCond_KidneyDisease','ChronicCond_Cancer',
'ChronicCond_ObstrPulmonary','ChronicCond_Depression','ChronicCond_Diabetes','ChronicCond_IschemicHeart','ChronicCond_Osteoporasis',
'ChronicCond_rheumatoidarthritis','ChronicCond_stroke','Aged','community_institutional','Disability_condn','Benefits',
'medicaid_flag','Disease_intraction_DIABETES_HF','Disease_intraction_HF_CHR_LUNG','Disease_intraction_HF_KIDNEY','Disease_intraction_CHR_LUNG_CARD_RESP_FAIL',
'Disease_intraction_HF_HCC238','DISABLED_HF','DISABLED_ULCER_','DISABLED_CANCER','DISABLED_CHR_LUNG','disability']
data[categorical_cols] = data[categorical_cols].astype(str)

In [6]:
## saclling and onehot encoding of data

# Define the column transformer
ct = make_column_transformer(
    (StandardScaler(), data.select_dtypes(include=['int', 'float']).columns.tolist()),
    (OneHotEncoder(drop = "first"), data.select_dtypes(include=['object']).columns.tolist()))

# Define the pipeline
pipeline = make_pipeline(ct)

# Transform the data
transformed_data = pipeline.fit_transform(data)

# Get the names of the encoded columns
encoded_columns = pipeline.named_steps['columntransformer'].transformers_[1][1].get_feature_names(data.select_dtypes(include=['object']).columns.tolist())

# Combine the column names
column_names = data.select_dtypes(include=['int', 'float']).columns.tolist() + encoded_columns.tolist()

# Convert the transformed data to a DataFrame with column names
transformed_data_df = pd.DataFrame(transformed_data, columns=column_names)

## missing values imputation with median
transformed_data_df = transformed_data_df.fillna(transformed_data_df.median())

## cosine similairty

#### user_user simialrity

In [17]:
# user_user cosine similairty matrix

test_beneficiaries_numbers = 10000
transformed_data_df = pd.concat([master_data[['BeneID']] , transformed_data_df], axis = 1)
user_vars = transformed_data_df.groupby('BeneID').mean()
user_vars = user_vars.head(test_beneficiaries_numbers)

In [19]:
%%time

##selecting thershold value of similairty
similairty_thershold = 0.7

## creating dataframe from cosin similarity matrix
similarity_matrix_user = cosine_similarity(user_vars)
similarity_df_user = pd.DataFrame(similarity_matrix_user, index=user_vars.index, columns=user_vars.index)

##creating empty dataframe with 3 column names
final_data = pd.DataFrame(columns = ['Target_Beneficiary', 'Similar_Beneficiary', 'Similarity_score'])

##iterating through each Beneficiary from user_user similarity matrix
for Beneficiary in similarity_df_user.columns.values.tolist():
    
    ##finding all similar services purchased by account and sorting it
    result = pd.DataFrame(similarity_df_user.loc[Beneficiary].sort_values(ascending=False))
    
    ## round of simialrity score upto 6 decimal points,
    ## finding top Similar beneficiaries whoes simialrity score is less than 0.8 and not including account itself
    result[Beneficiary] = result[Beneficiary].round(6)
    result = result[(result[Beneficiary] != 1.0000) & (result[Beneficiary] >= similairty_thershold)]
    result = result.reset_index()
    
    ## creating temperarory dataframe and putting all values of top 5 similar account
    temp_df = pd.DataFrame(columns = ['Target_Beneficiary', 'Similar_Beneficiary', 'Similarity_score'])

    for i in range(len(result)):
        temp_df = temp_df.append({'Target_Beneficiary' : Beneficiary , 'Similar_Beneficiary' : result["BeneID"][i], 'Similarity_score' : result[Beneficiary][i]}, 
                    ignore_index = True)
    
    ## appending temperarory dataframe into final dataframe
    final_data = final_data.append(temp_df, ignore_index = True)

CPU times: total: 1.47 s
Wall time: 924 ms


In [24]:
## creating target_HCC and similar_HCC columns

df_BeneID_HCC_list = master_data[["BeneID","HCC"]].groupby("BeneID").agg(lambda x: set(x.to_list())).reset_index()
final_data = final_data.merge(df_BeneID_HCC_list, how = "left", right_on="BeneID", left_on = "Target_Beneficiary").drop(columns=["BeneID"]).rename(columns = {"HCC": "Target_HCC"})
final_data = final_data.merge(df_BeneID_HCC_list, how = "left", right_on="BeneID", left_on = "Similar_Beneficiary").drop(columns=["BeneID"]).rename(columns = {"HCC": "Simialr_HCC"})

In [26]:
## creating finding common HCC's between taget and simialr HCC's
final_data['common_HCC'] = [x[0].intersection(x[1]) for x in zip(final_data['Target_HCC'], final_data['Simialr_HCC'])]
final_data['common_HCC_count'] = [len(x) for x in final_data['common_HCC']]
final_data.sort_values(["Target_Beneficiary","common_HCC_count","Similarity_score"],ascending=[True, False, False],inplace=True)
final_data = final_data.groupby('Target_Beneficiary').apply(lambda x: x.head(5))
final_data = final_data.drop(columns=["Target_Beneficiary"]).reset_index().drop(columns=["level_1","common_HCC_count"])

In [28]:
## creating individual recommendations by individual similar service for target services and converting it into a list
final_data["individual_recommended_HCC"] = final_data["Simialr_HCC"] - final_data["Target_HCC"]

In [30]:
## combining individual recommended HCC's at target beneficiary level and removing the duplicates.
df_combined_recommedned_HCC = final_data[["Target_Beneficiary","individual_recommended_HCC"]].explode(
    column = "individual_recommended_HCC").drop_duplicates(keep='first').groupby("Target_Beneficiary").agg(lambda x: x.to_list()).rename(columns={"individual_recommended_HCC":"combined_recommedned_HCC"})

final_data = final_data.merge(df_combined_recommedned_HCC, on="Target_Beneficiary", how='left')

In [32]:
final_data.head(3)

Unnamed: 0,Target_Beneficiary,Similar_Beneficiary,Similarity_score,Target_HCC,Simialr_HCC,common_HCC,individual_recommended_HCC,combined_recommedned_HCC
0,BENE100001,BENE100002,0.816317,"{152, 238}","{38, 264, 137, 201, 238, 19, 151, 280, 127}",{238},"{38, 264, 137, 201, 19, 151, 280, 127}","[38, 264, 137, 201, 19, 151, 280, 127, 226, 37..."
1,BENE100001,BENE100073,0.859016,"{152, 238}","{280, 226, 38}",{},"{280, 226, 38}","[38, 264, 137, 201, 19, 151, 280, 127, 226, 37..."
2,BENE100001,BENE100114,0.854538,"{152, 238}","{226, 37, 326, 38, 137, 298, 109, 112, 23, 151...",{},"{226, 37, 38, 326, 137, 298, 109, 112, 23, 151...","[38, 264, 137, 201, 19, 151, 280, 127, 226, 37..."


user-user recommendations are comming good. but since it is a memory based approch so we will not be able to recommend HCC's to new users. that why in next approach we will try to convert this approch into model based approch by using singualr value decomposition. also by converting df_user_vars into U, S and VT and again creating a low rank matrix that is giving more accurate results.

#### item_item similairty

In [33]:
## item_item cosine similairty matrix
transformed_data_df = pd.concat([master_data[['BeneID', 'HCC']] , transformed_data_df], axis = 1)
item_vars = transformed_data_df.drop(columns=['BeneID']).groupby('HCC').mean()
similarity_matrix_item = cosine_similarity(item_vars)
similarity_df_item = pd.DataFrame(similarity_matrix_item, index=item_vars.index, columns=item_vars.index)

In [35]:
## top recommended HCC'S based on item item recommendation of 238 HCC
hcc = 238
top_recommendation = 10
rename_dict = {"HCC":"Recommended_HCC", 1:"Similarity_Score"}
pd.DataFrame(similarity_df_item.loc[hcc].sort_values(ascending=False)[1:top_recommendation]).reset_index().rename(columns = rename_dict)

Unnamed: 0,Recommended_HCC,238
0,201,0.995111
1,229,0.994454
2,79,0.993591
3,81,0.992325
4,283,0.992312
5,38,0.992142
6,94,0.991124
7,64,0.99098
8,182,0.990736


item item similarity goes not take too much parameters into consideration. also item item simialrity is not more recommended for users recommendations.