In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from causalnex.structure.notears import from_pandas
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

In [7]:
# Read the semicolon-separated student maths CSV and preview its first rows.
data = pd.read_csv(
    '../data/student/student-mat.csv',
    sep=';',
)
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [8]:
# Work on a copy of `data`, then collect the names of every column whose
# dtype is not numeric.
struct_data = data.copy()
non_numeric_columns = (
    struct_data
    .select_dtypes(exclude=[np.number])
    .columns
    .tolist()
)

print(non_numeric_columns)

['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [9]:
# Replace each non-numeric column with integer labels.
# The same encoder object is reused and re-fitted for every column.
le = LabelEncoder()

for col in non_numeric_columns:
    raw_values = struct_data[col]
    struct_data[col] = le.fit_transform(raw_values)

# Preview the encoded frame.
struct_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,18,1,0,0,4,4,0,4,...,4,3,4,1,1,3,6,5,6,6
1,0,0,17,1,0,1,1,1,0,2,...,5,3,3,1,1,3,4,5,5,6
2,0,0,15,1,1,1,1,1,0,2,...,4,3,2,2,3,3,10,7,8,10
3,0,0,15,1,0,1,4,2,1,3,...,3,2,2,1,1,5,2,15,14,15
4,0,0,16,1,0,1,3,3,2,2,...,4,3,2,1,2,5,4,6,10,10


In [10]:
# Learn a structure model from the label-encoded frame using `from_pandas`
# (imported from causalnex.structure.notears).
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# call has to run to completion before the later cells can rely on `sm`.
sm = from_pandas(struct_data)

KeyboardInterrupt: 

In [11]:
# Drop edges below the cutoff (called for its side effect; the return value is
# unused), then rebind `sm` to the largest subgraph that remains.
edge_weight_cutoff = 0.7
sm.remove_edges_below_threshold(edge_weight_cutoff)
sm = sm.get_largest_subgraph()

In [None]:
# Plot the structure learned from the original (encoded) data, using the
# WEAK node and edge style presets.
weak_style = {
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz = plot_structure(sm, **weak_style)

# Call toggle_physics(False) before showing the plot as an HTML page.
viz.toggle_physics(False)
viz.show('stu-original.html')

In [12]:
# Re-scale or shift the values of some columns
data_copy1 = struct_data.copy()
data_copy2 = struct_data.copy()
columns = ['famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health']
data_copy1[columns] = data_copy1[columns] - 3
data_copy2[columns] = data_copy2[columns] * 10

In [13]:
# Learn a structure model from the shifted variant (`data_copy1`, selected
# columns minus 3) so it can be compared with the one from the original data.
sm1 = from_pandas(data_copy1)

In [14]:
# Drop edges below the cutoff (called for its side effect; the return value is
# unused), then rebind `sm1` to the largest subgraph that remains.
edge_weight_cutoff = 0.7
sm1.remove_edges_below_threshold(edge_weight_cutoff)
sm1 = sm1.get_largest_subgraph()


In [15]:
# Plot the structure learned from the shifted data, using the WEAK node and
# edge style presets.
weak_style = {
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz1 = plot_structure(sm1, **weak_style)

# Call toggle_physics(False) before showing the plot as an HTML page.
viz1.toggle_physics(False)
viz1.show('stu-shift.html')

stu-shift.html


In [None]:
# Learn a structure model from the re-scaled variant (`data_copy2`, selected
# columns multiplied by 10) so it can be compared with the one from the original data.
sm2 = from_pandas(data_copy2)

In [23]:
# Drop edges below the cutoff (called for its side effect; the return value is
# unused), then rebind `sm2` to the largest subgraph that remains.
edge_weight_cutoff = 0.7
sm2.remove_edges_below_threshold(edge_weight_cutoff)
sm2 = sm2.get_largest_subgraph()

In [None]:
# Plot the structure learned from the re-scaled data, using the WEAK node and
# edge style presets.
weak_style = {
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz2 = plot_structure(sm2, **weak_style)

# Call toggle_physics(False) before showing the plot as an HTML page.
viz2.toggle_physics(False)
viz2.show('stu-rescale.html')

In [32]:
# Third variant: only the `absences` column is divided by 10; every other
# column is left as in `struct_data`.
columns = ['absences']
data_copy3 = struct_data.copy()
data_copy3[columns] = data_copy3[columns].div(10)

# Preview the result.
data_copy3.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,18,1,0,0,4,4,0,4,...,4,3,4,1,1,3,0.6,5,6,6
1,0,0,17,1,0,1,1,1,0,2,...,5,3,3,1,1,3,0.4,5,5,6
2,0,0,15,1,1,1,1,1,0,2,...,4,3,2,2,3,3,1.0,7,8,10
3,0,0,15,1,0,1,4,2,1,3,...,3,2,2,1,1,5,0.2,15,14,15
4,0,0,16,1,0,1,3,3,2,2,...,4,3,2,1,2,5,0.4,6,10,10


In [35]:
# Learn a structure model from the variant whose `absences` column was
# divided by 10 (`data_copy3`).
sm3 = from_pandas(data_copy3)

In [40]:
# Drop edges below the cutoff (called for its side effect; the return value is
# unused), then rebind `sm3` to the largest subgraph that remains.
edge_weight_cutoff = 0.7
sm3.remove_edges_below_threshold(edge_weight_cutoff)
sm3 = sm3.get_largest_subgraph()

In [None]:
# Plot the structure learned from the absences-rescaled data, using the WEAK
# node and edge style presets.
weak_style = {
    "all_node_attributes": NODE_STYLE.WEAK,
    "all_edge_attributes": EDGE_STYLE.WEAK,
}
viz3 = plot_structure(sm3, **weak_style)

# Call toggle_physics(False) before showing the plot as an HTML page.
viz3.toggle_physics(False)
viz3.show('stu-absence.html')