## Library Imports

In [1]:
from pomegranate import *
import pandas as pd

import warnings
warnings.simplefilter("ignore")

## Load the dataset and validate the data load

In [5]:
tae = pd.read_csv("dataset/tae.data", sep=",", header=None, names=['Eng_Spkr', 'Course_Inst', 'Course', 'Summer_Regular', 'Size', 'Class'])
tae.head()

Unnamed: 0,Eng_Spkr,Course_Inst,Course,Summer_Regular,Size,Class
0,1,23,3,1,19,3
1,2,15,3,1,17,3
2,1,23,3,2,49,3
3,1,5,2,2,33,3
4,2,7,11,2,55,3


In [6]:
# Check the data strucure
tae.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Eng_Spkr        151 non-null    int64
 1   Course_Inst     151 non-null    int64
 2   Course          151 non-null    int64
 3   Summer_Regular  151 non-null    int64
 4   Size            151 non-null    int64
 5   Class           151 non-null    int64
dtypes: int64(6)
memory usage: 7.2 KB


In [7]:
# Count of unique elements across columns 
tae.nunique(axis=0)

Eng_Spkr           2
Course_Inst       25
Course            26
Summer_Regular     2
Size              46
Class              3
dtype: int64

## Feature Engineering

### Discretize 'Size' attribute

In [10]:
# Check the unique values of Size attribute

print(sorted(tae['Size'].unique().tolist()))


[3, 5, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 42, 43, 44, 45, 46, 48, 49, 51, 54, 55, 58, 66]


In [13]:
# Discretize the attribute

bin_labels_Size = ['1', '2', '3']
tae['Size_Chg'] = pd.qcut(tae['Size'],q=3,labels=bin_labels_Size)
tae['Size_quantile'] = pd.qcut(tae['Size'], q=3)
tae['Size_quantile'].value_counts()
tae.head()

Unnamed: 0,Eng_Spkr,Course_Inst,Course,Summer_Regular,Size,Class,Size_Chg,Size_quantile
0,1,23,3,1,19,3,1,"(2.999, 20.0]"
1,2,15,3,1,17,3,1,"(2.999, 20.0]"
2,1,23,3,2,49,3,3,"(32.0, 66.0]"
3,1,5,2,2,33,3,3,"(32.0, 66.0]"
4,2,7,11,2,55,3,3,"(32.0, 66.0]"


In [14]:
tae = tae.drop(columns=['Size', 'Size_quantile'])
tae.head()

Unnamed: 0,Eng_Spkr,Course_Inst,Course,Summer_Regular,Class,Size_Chg
0,1,23,3,1,3,1
1,2,15,3,1,3,1
2,1,23,3,2,3,3
3,1,5,2,2,3,3
4,2,7,11,2,3,3


In [17]:
print(tae['Eng_Spkr'].unique().tolist())

[1, 2]


In [19]:
ES_prob_1 = tae['Eng_Spkr'].tolist().count(1)/len(tae.Eng_Spkr)
ES_prob_1

0.19205298013245034

In [20]:
ES_prob_2 = tae['Eng_Spkr'].tolist().count(2)/len(tae.Eng_Spkr)
ES_prob_2

0.8079470198675497

In [21]:
eng_spk = DiscreteDistribution({'1': ES_prob_1, '2': ES_prob_2})

In [22]:
print(sorted(tae['Course_Inst'].unique().tolist()),end=' ')

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] 

In [23]:
C_I_p_1 =  tae['Course_Inst'].tolist().count(1)/len(tae.Course_Inst)
C_I_p_2 =  tae['Course_Inst'].tolist().count(2)/len(tae.Course_Inst)
C_I_p_3 =  tae['Course_Inst'].tolist().count(3)/len(tae.Course_Inst)
C_I_p_4 =  tae['Course_Inst'].tolist().count(4)/len(tae.Course_Inst)
C_I_p_5 =  tae['Course_Inst'].tolist().count(5)/len(tae.Course_Inst)
C_I_p_6 =  tae['Course_Inst'].tolist().count(6)/len(tae.Course_Inst)
C_I_p_7 =  tae['Course_Inst'].tolist().count(7)/len(tae.Course_Inst)
C_I_p_8 =  tae['Course_Inst'].tolist().count(8)/len(tae.Course_Inst)
C_I_p_9 =  tae['Course_Inst'].tolist().count(9)/len(tae.Course_Inst)
C_I_p_10 =  tae['Course_Inst'].tolist().count(10)/len(tae.Course_Inst)
C_I_p_11 =  tae['Course_Inst'].tolist().count(11)/len(tae.Course_Inst)
C_I_p_12 =  tae['Course_Inst'].tolist().count(12)/len(tae.Course_Inst)
C_I_p_13 =  tae['Course_Inst'].tolist().count(13)/len(tae.Course_Inst)
C_I_p_14 =  tae['Course_Inst'].tolist().count(14)/len(tae.Course_Inst)
C_I_p_15 =  tae['Course_Inst'].tolist().count(15)/len(tae.Course_Inst)
C_I_p_16 =  tae['Course_Inst'].tolist().count(16)/len(tae.Course_Inst)
C_I_p_17 =  tae['Course_Inst'].tolist().count(17)/len(tae.Course_Inst)
C_I_p_18 =  tae['Course_Inst'].tolist().count(18)/len(tae.Course_Inst)
C_I_p_19 =  tae['Course_Inst'].tolist().count(19)/len(tae.Course_Inst)
C_I_p_20 =  tae['Course_Inst'].tolist().count(20)/len(tae.Course_Inst)
C_I_p_21 =  tae['Course_Inst'].tolist().count(21)/len(tae.Course_Inst)
C_I_p_22 =  tae['Course_Inst'].tolist().count(22)/len(tae.Course_Inst)
C_I_p_23 =  tae['Course_Inst'].tolist().count(23)/len(tae.Course_Inst)
C_I_p_24 =  tae['Course_Inst'].tolist().count(24)/len(tae.Course_Inst)
C_I_p_25 =  tae['Course_Inst'].tolist().count(25)/len(tae.Course_Inst)
print(C_I_p_1, C_I_p_2, C_I_p_3, C_I_p_4, C_I_p_5, C_I_p_6, C_I_p_7, C_I_p_8, C_I_p_9, C_I_p_10, C_I_p_11, C_I_p_12, C_I_p_13, C_I_p_14, C_I_p_15, C_I_p_16, C_I_p_17, C_I_p_18, C_I_p_19, C_I_p_20, C_I_p_21, C_I_p_22, C_I_p_23, C_I_p_24, C_I_p_25)

0.026490066225165563 0.026490066225165563 0.013245033112582781 0.013245033112582781 0.039735099337748346 0.052980132450331126 0.0728476821192053 0.033112582781456956 0.052980132450331126 0.052980132450331126 0.019867549668874173 0.013245033112582781 0.09271523178807947 0.052980132450331126 0.052980132450331126 0.026490066225165563 0.026490066225165563 0.052980132450331126 0.006622516556291391 0.033112582781456956 0.013245033112582781 0.07947019867549669 0.11258278145695365 0.006622516556291391 0.026490066225165563


In [24]:
C_I = DiscreteDistribution({'1': C_I_p_1, '2': C_I_p_2, '3': C_I_p_3, '4': C_I_p_4, '5': C_I_p_5, '6': C_I_p_6, '7': C_I_p_7, '8': C_I_p_8, '9': C_I_p_9, '10': C_I_p_10, '11': C_I_p_11, '12': C_I_p_12, '13': C_I_p_13, '14': C_I_p_14, '15': C_I_p_15, '16': C_I_p_16, '17': C_I_p_17, '18': C_I_p_18, '19': C_I_p_19, '20': C_I_p_20, '21': C_I_p_21, '22': C_I_p_22, '23': C_I_p_23, '24': C_I_p_24, '25': C_I_p_25})

In [25]:
print(sorted(tae['Course'].unique().tolist()),end=' ')

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] 

In [26]:
C_p_1 =  tae['Course'].tolist().count(1)/len(tae.Course)
C_p_2 =  tae['Course'].tolist().count(2)/len(tae.Course)
C_p_3 =  tae['Course'].tolist().count(3)/len(tae.Course)
C_p_4 =  tae['Course'].tolist().count(4)/len(tae.Course)
C_p_5 =  tae['Course'].tolist().count(5)/len(tae.Course)
C_p_6 =  tae['Course'].tolist().count(6)/len(tae.Course)
C_p_7 =  tae['Course'].tolist().count(7)/len(tae.Course)
C_p_8 =  tae['Course'].tolist().count(8)/len(tae.Course)
C_p_9 =  tae['Course'].tolist().count(9)/len(tae.Course)
C_p_10 =  tae['Course'].tolist().count(10)/len(tae.Course)
C_p_11 =  tae['Course'].tolist().count(11)/len(tae.Course)
C_p_12 =  tae['Course'].tolist().count(12)/len(tae.Course)
C_p_13 =  tae['Course'].tolist().count(13)/len(tae.Course)
C_p_14 =  tae['Course'].tolist().count(14)/len(tae.Course)
C_p_15 =  tae['Course'].tolist().count(15)/len(tae.Course)
C_p_16 =  tae['Course'].tolist().count(16)/len(tae.Course)
C_p_17 =  tae['Course'].tolist().count(17)/len(tae.Course)
C_p_18 =  tae['Course'].tolist().count(18)/len(tae.Course)
C_p_19 =  tae['Course'].tolist().count(19)/len(tae.Course)
C_p_20 =  tae['Course'].tolist().count(20)/len(tae.Course)
C_p_21 =  tae['Course'].tolist().count(21)/len(tae.Course)
C_p_22 =  tae['Course'].tolist().count(22)/len(tae.Course)
C_p_23 =  tae['Course'].tolist().count(23)/len(tae.Course)
C_p_24 =  tae['Course'].tolist().count(24)/len(tae.Course)
C_p_25 =  tae['Course'].tolist().count(25)/len(tae.Course)
C_p_26 =  tae['Course'].tolist().count(26)/len(tae.Course)
print(C_p_1, C_p_2, C_p_3, C_p_4, C_p_5, C_p_6, C_p_7, C_p_8, C_p_9, C_p_10, C_p_11, C_p_12, C_p_13, C_p_14, C_p_15, C_p_16, C_p_17, C_p_18, C_p_19, C_p_20, C_p_21, C_p_22, C_p_23, C_p_24, C_p_25, C_p_26)

0.09271523178807947 0.10596026490066225 0.2980132450331126 0.006622516556291391 0.033112582781456956 0.013245033112582781 0.046357615894039736 0.026490066225165563 0.019867549668874173 0.006622516556291391 0.059602649006622516 0.006622516556291391 0.019867549668874173 0.006622516556291391 0.06622516556291391 0.019867549668874173 0.06622516556291391 0.013245033112582781 0.006622516556291391 0.006622516556291391 0.019867549668874173 0.019867549668874173 0.006622516556291391 0.006622516556291391 0.019867549668874173 0.006622516556291391


In [27]:
course = DiscreteDistribution({'1': C_p_1, '2': C_p_2, '3': C_p_3, '4': C_p_4, '5': C_p_5, '6': C_p_6, '7': C_p_7, '8': C_p_8, '9': C_p_9, '10': C_p_10, '11': C_p_11, '12': C_p_12, '13': C_p_13, '14': C_p_14, '15': C_p_15, '16': C_p_16, '17': C_p_17, '18': C_p_18, '19': C_p_19, '20': C_p_20, '21': C_p_21, '22': C_p_22, '23': C_p_23, '24': C_p_24, '25': C_p_25, '26': C_p_26})

In [28]:
SC_p_1 = tae['Size_Chg'].tolist().count('1')/len(tae.Size_Chg)
SC_p_2 = tae['Size_Chg'].tolist().count('2')/len(tae.Size_Chg)
SC_p_3 = tae['Size_Chg'].tolist().count('3')/len(tae.Size_Chg)
print(SC_p_1, SC_p_2, SC_p_2)

0.3708609271523179 0.2980132450331126 0.2980132450331126


In [29]:
size_chg = DiscreteDistribution({'1': SC_p_1, '2': SC_p_2, '3': SC_p_3})

In [30]:
tae.Summer_Regular.unique()

array([1, 2], dtype=int64)

In [31]:
SR_p_1 = tae['Summer_Regular'].tolist().count(1)/len(tae.Summer_Regular)
SR_p_2 = tae['Summer_Regular'].tolist().count(2)/len(tae.Summer_Regular)
print(SR_p_1, SR_p_2)

0.152317880794702 0.847682119205298


In [32]:
summer_regular = DiscreteDistribution({'1': SR_p_1, '2': SR_p_2})

In [33]:
print(sorted(tae['Class'].unique().tolist()))
ta

[1, 2, 3]


In [34]:
Cl_p_1 = tae['Class'].tolist().count(1)/len(tae.Class)
Cl_p_2 = tae['Class'].tolist().count(2)/len(tae.Class)
Cl_p_3 = tae['Class'].tolist().count(3)/len(tae.Class)
print(Cl_p_1, Cl_p_2, Cl_p_3)

0.32450331125827814 0.33112582781456956 0.3443708609271523


In [35]:
cl = DiscreteDistribution({'1': Cl_p_1, '2': Cl_p_2, '3': Cl_p_3})

In [36]:
s1 = Node(eng_spk, name="Eng_Spkr")
s2 = Node(C_I, name="Course_Inst")
s3 = Node(course, name="Course")
s4 = Node(summer_regular, name="Summer_Regular")
s5 = Node(size_chg, name="Size")
s6 = Node(cl, name="Class")

In [37]:
model = BayesianNetwork("TA Assessment")
model.add_states(s1, s2, s3, s4, s5, s6)

In [38]:
model.bake()

In [39]:
print(model.probability([['2', '15', '3', '1', '1', '3']]))

0.0002481531853905784


In [40]:
print(model.predict([['1', '25', '26', '2', '2', None]]))

[array(['1', '25', '26', '2', '2', '3'], dtype=object)]


In [41]:
print(model.probability([['1', '20', '7', '1', '1', '1']]))

5.404011960295112e-06
