## Medical Cost Analysis: Discretisation with Mixed Variables

This notebook seeks to analyse the algorithms robustness / sensitivity to measurements (continuous data) vs discrete data within a mixed variables data setting


In [44]:
import pandas as pd
import numpy as np

In [45]:
data = pd.read_csv('../data/medical/insurance.csv')

data['charges'] = data['charges'] / 1000
data.head(5)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16.884924
1,18,male,33.77,1,no,southeast,1.725552
2,28,male,33.0,3,no,southeast,4.449462
3,33,male,22.705,0,no,northwest,21.984471
4,32,male,28.88,0,no,northwest,3.866855


In [46]:
from sklearn.preprocessing import LabelEncoder

struct_data = data.copy()
non_numeric_columns = list(struct_data.select_dtypes(exclude=[np.number]).columns)

le = LabelEncoder()

for col in non_numeric_columns:
    struct_data[col] = le.fit_transform(struct_data[col])

struct_data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16.884924
1,18,1,33.77,1,0,2,1.725552
2,28,1,33.0,3,0,2,4.449462
3,33,1,22.705,0,0,1,21.984471
4,32,1,28.88,0,0,1,3.866855


In [47]:
struct_data.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,0.505232,30.663397,1.094918,0.204783,1.515695,13.270422
std,14.04996,0.50016,6.098187,1.205493,0.403694,1.104885,12.110011
min,18.0,0.0,15.96,0.0,0.0,0.0,1.121874
25%,27.0,0.0,26.29625,0.0,0.0,1.0,4.740287
50%,39.0,1.0,30.4,1.0,0.0,2.0,9.382033
75%,51.0,1.0,34.69375,2.0,0.0,2.0,16.639913
max,64.0,1.0,53.13,5.0,1.0,3.0,63.770428


In [48]:
struct_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16.884924
1,18,1,33.770,1,0,2,1.725552
2,28,1,33.000,3,0,2,4.449462
3,33,1,22.705,0,0,1,21.984471
4,32,1,28.880,0,0,1,3.866855
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10.600548
1334,18,0,31.920,0,0,0,2.205981
1335,18,0,36.850,0,0,2,1.629833
1336,21,0,25.800,0,0,3,2.007945


## Initial graph

In [49]:
#### To learn causal graph from full measurements data
from causalnex.structure.notears import from_pandas
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
import random
import numpy as np 

# Set the random seed for numpy
np.random.seed(50)
# Set the random seed for Python's built-in random module
random.seed(50)

initial_sm = from_pandas(struct_data)

viz = plot_structure(
    initial_sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.toggle_physics(False)
viz.show("graph_viz/medical_data.html")

graph_viz/medical_data.html


In [50]:
def weights_analysis(sm):

    weights = 0
    edges = 0
    max = 0
    min = 0
    # graph = {}
    for node in sm.adjacency(): 
        # weights = 
        # node_edge_weights_sum = 0
        node_edges = []
        for edge in node[1]:   
            weights += node[1][edge]['weight']
            if node[1][edge]['weight'] > max: 
                max = node[1][edge]['weight']
            if node[1][edge]['weight'] < min: 
                min = node[1][edge]['weight']

            edges += 1
            node_edges.append(node[1][edge]['weight'])
    print('sum of weights', weights)
    print('sum of edges counted', edges)
    print('weights / edges', weights / edges)
    print('max weight of edges', max)
    print('min weight of edges', min)
    return
weights_analysis(initial_sm)


sum of weights 24.88796181504645
sum of edges counted 42
weights / edges 0.5925705194058678
max weight of edges 25.144283597645828
min weight of edges -21.21182332486845


In [51]:
initial_sm.remove_edges_below_threshold(0.6)
## to decide how to choose threshold
viz = plot_structure(
    initial_sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.toggle_physics(0)
viz.show("graph_viz/0.6_medical_data.html")

graph_viz/0.6_medical_data.html


## Discrete BMI graph

In [30]:
mean = struct_data.describe()['bmi']['mean']
std = struct_data.describe()['bmi']['std']
bins = [mean-3*std, mean-2*std, mean-std, mean, mean+std, mean+2*std, mean+3*std]

labels = [1, 2, 3, 4, 5, 6]

# Use the cut function to transform the measurements into categorical labels
df = struct_data
df['bmi_discrete'] = pd.cut(df['bmi'], bins=bins, labels=labels, right=False)
# df.drop('bmi')

cols = ['bmi']
cleaned_data = df.drop(cols, axis = 1)
cleaned_data

Unnamed: 0,age,sex,children,smoker,region,charges,bmi_discrete
0,19,0,0,1,3,16.884924,3
1,18,1,1,0,2,1.725552,4
2,28,1,3,0,2,4.449462,4
3,33,1,0,0,1,21.984471,2
4,32,1,0,0,1,3.866855,3
...,...,...,...,...,...,...,...
1333,50,1,3,0,1,10.600548,4
1334,18,0,0,0,0,2.205981,4
1335,18,0,0,0,2,1.629833,5
1336,21,0,0,0,3,2.007945,3


In [33]:

cleaned_data['bmi_discrete'] = pd.to_numeric(cleaned_data['bmi_discrete'], errors='coerce')
cleaned_data

Unnamed: 0,age,sex,children,smoker,region,charges,bmi_discrete
0,19,0,0,1,3,16.884924,3.0
1,18,1,1,0,2,1.725552,4.0
2,28,1,3,0,2,4.449462,4.0
3,33,1,0,0,1,21.984471,2.0
4,32,1,0,0,1,3.866855,3.0
...,...,...,...,...,...,...,...
1333,50,1,3,0,1,10.600548,4.0
1334,18,0,0,0,0,2.205981,4.0
1335,18,0,0,0,2,1.629833,5.0
1336,21,0,0,0,3,2.007945,3.0


In [38]:
cleaned_data.isna().sum()
cleaned_data.dropna(inplace=True)
cleaned_data

Unnamed: 0,age,sex,children,smoker,region,charges,bmi_discrete
0,19,0,0,1,3,16.884924,3.0
1,18,1,1,0,2,1.725552,4.0
2,28,1,3,0,2,4.449462,4.0
3,33,1,0,0,1,21.984471,2.0
4,32,1,0,0,1,3.866855,3.0
...,...,...,...,...,...,...,...
1333,50,1,3,0,1,10.600548,4.0
1334,18,0,0,0,0,2.205981,4.0
1335,18,0,0,0,2,1.629833,5.0
1336,21,0,0,0,3,2.007945,3.0


In [39]:
#### To learn causal graph from full measurements data
from causalnex.structure.notears import from_pandas
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
import random
import numpy as np 

# Set the random seed for numpy
np.random.seed(50)
# Set the random seed for Python's built-in random module
random.seed(50)

disc_sm = from_pandas(cleaned_data)

viz = plot_structure(
    disc_sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.toggle_physics(False)
viz.show("graph_viz/disc_medical_data.html")

graph_viz/disc_medical_data.html


In [40]:
weights_analysis(disc_sm)

sum of weights 25.160074879746844
sum of edges counted 42
weights / edges 0.5990494018987343
max weight of edges 23.548671880355744
min weight of edges -21.615846685327643


In [42]:
disc_sm.remove_edges_below_threshold(0.6)
## to decide how to choose threshold
viz = plot_structure(
    disc_sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.toggle_physics(0)
viz.show("graph_viz/0.6_disc_medical_data.html")

graph_viz/0.6_medical_data.html


## Compare

In [56]:
## comparison

### recover adjacency matrix 
import networkx as nx

def to_adj_matrix(sm):
    data = []
    for node in sm.adjacency(): 
        data.append(node)
    
    # Create an empty directed graph
    G = nx.DiGraph()

    # Add nodes and edges to the graph
    for node, neighbors in data:
        G.add_node(node)
        for neighbor in neighbors:
            G.add_node(neighbor)
            G.add_edge(node, neighbor)

    # Convert the graph to an adjacency matrix
    adjacency_matrix = nx.adjacency_matrix(G).todense()
    return G, adjacency_matrix

G, initial_sm_adj = to_adj_matrix(initial_sm)
# Print the adjacency matrix
print(initial_sm_adj)
print(G.nodes)

[[0 0 0 0 0 0 0]
 [1 0 1 1 1 1 0]
 [1 0 0 0 0 0 0]
 [1 0 1 0 0 1 0]
 [1 0 1 0 0 1 0]
 [1 0 1 0 0 0 0]
 [1 0 1 0 0 1 0]]
['age', 'sex', 'bmi', 'children', 'region', 'charges', 'smoker']


In [58]:
disc_G, disc_sm_adj = to_adj_matrix(disc_sm)
print(disc_sm_adj)
print(disc_G.nodes)

[[0 0 0 0 0 0 0]
 [1 0 1 1 1 0 0]
 [1 0 0 0 0 1 0]
 [1 0 0 0 1 0 0]
 [1 0 0 0 0 1 0]
 [1 0 0 0 0 0 0]
 [1 0 0 0 1 1 0]]
['age', 'sex', 'children', 'region', 'bmi_discrete', 'charges', 'smoker']


In [59]:
from cdt.metrics import SHD
# from numpy.random import randint
# tar, pred = randint(2, size=(10, 10)), randint(2, size=(10, 10))
SHD(disc_sm_adj, initial_sm_adj, double_for_anticausal=False)

8.0