# Tutorial on Python and pgmpy

In [1]:
# Sanity check: confirm the kernel runs code and shows printed text below the cell.
print("Hello, World!")

Hello, World!


#### Tuples:

Tuples are immutable objects: once a tuple is created, its elements cannot be changed.

In [2]:
# A tuple literal: parentheses with comma-separated items. Line breaks inside
# the parentheses need no backslash continuation.
months = ('January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December')

In [3]:
# Indexing is zero-based, so index 2 is the third item.
# print() is called as a function so the cell runs on Python 3 as well.
print(months[2])

March


#### Lists:

Lists are modifiable (or 'mutable', as a programmer may say), so their values can be changed. Most of the time we use lists, not tuples, because we want to easily change the values of things if we need to. 

In [4]:
# A list literal: square brackets with comma-separated items.
fruits = ['Apple', 'Kiwi', 'Mango', 'Banana']

In [5]:
# Accessing lists

# Lists are indexed from zero, exactly like tuples; index 2 is the third item.
# print() is called as a function so the cell runs on Python 3 as well.
print(fruits[2])

Mango


In [6]:
# Adding an item to a list

# append() adds the item to the end of the list in place, so re-running this
# cell appends 'Orange' a second time.
fruits.append('Orange')
print(fruits)

['Apple', 'Kiwi', 'Mango', 'Banana', 'Orange']


In [7]:
# Deleting an item

# del removes the item at the given index in place; the items after it shift
# one position to the left.
del fruits[1]
# print() is called as a function so the cell runs on Python 3 as well.
print(fruits)

['Apple', 'Mango', 'Banana', 'Orange']


#### Dictionaries:

In [8]:
# An empty dictionary: a pair of braces with nothing inside.
my_dict = {}


In [9]:
# Dictionary with integer keys; each entry is written as key: value.
my_dict = {1: 'apple', 2: 'ball'}

In [10]:

# Dictionary with mixed key types (a string and an integer). The values are
# of different types as well: a string and a list.
my_dict = {'name': 'John', 1: [2, 4, 3]}

In [11]:
# Using the dict() constructor, here given an existing dictionary literal.
my_dict = dict({1:'apple', 2:'ball'})

In [12]:
# Using dict() on a sequence in which each item is a (key, value) pair.
my_dict = dict([(1,'apple'), (2,'ball')])

In [13]:
# Accessing the dictionary

# A dictionary with string keys to look values up in
my_dict = {'name':'Jack', 'age':20}

# Look a value up with square brackets.
# Output: Jack
print(my_dict['name'])

# Look a value up with the get() method.
# Output: 20
print(my_dict.get('age'))

Jack
20


## Defining a PGM

Consider the following example from Koller's book to model a student's chance of getting a recommendation letter.

![PGM](koller_example.png)


In [15]:
from pgmpy.models import BayesianModel

Let's define a smaller network with just three nodes:

In [16]:
# Defining a Bayesian network from a list of directed edges, each written as
# a (parent, child) tuple. Both edges here point into 'Grade'.
student_model = BayesianModel([('Difficulty', 'Grade'), 
                              ('Intelligence', 'Grade')])

To get the summary of the network, use the following functions

In [17]:
# List the nodes currently in the model.
student_model.nodes()

['Grade', 'Difficulty', 'Intelligence']

In [18]:
# List the directed edges currently in the model as (parent, child) tuples.
student_model.edges()

[('Difficulty', 'Grade'), ('Intelligence', 'Grade')]

We can also add more nodes at any time using the add_nodes_from() method

In [19]:
# Add several nodes in one call. This adds nodes only; no edges are created here.
student_model.add_nodes_from(['Letter','SAT'])

Adding more edges using the add_edges_from() method

In [20]:
# Add the edges Grade -> Letter and Intelligence -> SAT in one call.
student_model.add_edges_from([('Grade','Letter'),('Intelligence','SAT')])

**Note:** If we add an edge whose endpoint nodes are not yet present in the model, pgmpy automatically adds those nodes to the model.

In [22]:
# List the nodes again; 'Letter' and 'SAT' are now part of the model.
student_model.nodes()

['Grade', 'Difficulty', 'SAT', 'Letter', 'Intelligence']

In [23]:
# List the edges again; the model now has four directed edges.
student_model.edges()

[('Grade', 'Letter'),
 ('Difficulty', 'Grade'),
 ('Intelligence', 'Grade'),
 ('Intelligence', 'SAT')]

### Defining the Conditional Probability Distributions (CPDs)

In [25]:
from pgmpy.factors.discrete import TabularCPD

Now let's define the CPDs associated with the network

In [26]:
# Distributions for the two root nodes. They have no parents, so no evidence
# arguments are passed; each row of `values` holds the probability of one state.
cpd_diff = TabularCPD(
    variable='Difficulty',
    variable_card=2,
    values=[[0.6],
            [0.4]])
cpd_intel = TabularCPD(
    variable='Intelligence',
    variable_card=2,
    values=[[0.7],
            [0.3]])

In [27]:
# CPD for Grade, a node with two parents.
# `values` has one row per Grade state and one column per parent combination.
# Columns follow the order of `evidence`, with the last parent changing
# fastest: (I=0, D=0), (I=0, D=1), (I=1, D=0), (I=1, D=1).
cpd_grade = TabularCPD(
    variable='Grade',
    variable_card=3,
    evidence=['Intelligence', 'Difficulty'],
    evidence_card=[2, 2],
    values=[[0.30, 0.05, 0.90, 0.50],
            [0.40, 0.25, 0.08, 0.30],
            [0.30, 0.70, 0.02, 0.20]])
print(cpd_grade)

+--------------+----------------+----------------+----------------+----------------+
| Intelligence | Intelligence_0 | Intelligence_0 | Intelligence_1 | Intelligence_1 |
+--------------+----------------+----------------+----------------+----------------+
| Difficulty   | Difficulty_0   | Difficulty_1   | Difficulty_0   | Difficulty_1   |
+--------------+----------------+----------------+----------------+----------------+
| Grade_0      | 0.3            | 0.05           | 0.9            | 0.5            |
+--------------+----------------+----------------+----------------+----------------+
| Grade_1      | 0.4            | 0.25           | 0.08           | 0.3            |
+--------------+----------------+----------------+----------------+----------------+
| Grade_2      | 0.3            | 0.7            | 0.02           | 0.2            |
+--------------+----------------+----------------+----------------+----------------+


In [28]:
# CPD for SAT given its single parent, Intelligence.
# One row per SAT state, one column per Intelligence state.
cpd_SAT = TabularCPD(
    variable='SAT',
    variable_card=2,
    evidence=['Intelligence'],
    evidence_card=[2],
    values=[[0.95, 0.20],
            [0.05, 0.80]])
print(cpd_SAT)

+--------------+----------------+----------------+
| Intelligence | Intelligence_0 | Intelligence_1 |
+--------------+----------------+----------------+
| SAT_0        | 0.95           | 0.2            |
+--------------+----------------+----------------+
| SAT_1        | 0.05           | 0.8            |
+--------------+----------------+----------------+


In [29]:
# CPD for Letter given its single parent, Grade.
# One row per Letter state, one column per Grade state.
cpd_letter = TabularCPD(
    variable='Letter',
    variable_card=2,
    evidence=['Grade'],
    evidence_card=[3],
    values=[[0.10, 0.40, 0.99],
            [0.90, 0.60, 0.01]])
print(cpd_letter)

+----------+---------+---------+---------+
| Grade    | Grade_0 | Grade_1 | Grade_2 |
+----------+---------+---------+---------+
| Letter_0 | 0.1     | 0.4     | 0.99    |
+----------+---------+---------+---------+
| Letter_1 | 0.9     | 0.6     | 0.01    |
+----------+---------+---------+---------+


In [30]:
# Associating the parameters with the model structure: all five CPDs defined
# above are attached to the model in a single call.
student_model.add_cpds(cpd_diff, cpd_grade, cpd_intel, cpd_letter, cpd_SAT)


In [31]:
# List the CPDs currently attached to the model.
student_model.get_cpds()

[<TabularCPD representing P(Difficulty:2) at 0xa768d30L>,
 <TabularCPD representing P(Grade:3 | Intelligence:2, Difficulty:2) at 0xa768198L>,
 <TabularCPD representing P(Intelligence:2) at 0xa768cf8L>,
 <TabularCPD representing P(Letter:2 | Grade:3) at 0xa768e48L>,
 <TabularCPD representing P(SAT:2 | Intelligence:2) at 0xa768710L>]

pgmpy also provides a check_model() method that checks whether the model and all of its associated CPDs are consistent:

In [32]:
# Checking if the CPDs are valid for the model.
# The call returns True here, as shown in the output below.
student_model.check_model()

True

To remove CPDs from the model, use the remove_cpds() method:

In [21]:
# remove_cpds() detaches the CPDs of the named variables from the model.
student_model.remove_cpds('SAT','Letter')

# Re-attach them straight away: the inference cells that follow query 'Letter'
# and need its CPD on the model when the notebook is run top to bottom.
student_model.add_cpds(cpd_SAT, cpd_letter)


### Inference in Bayesian Network

In [33]:
# Doing exact inference using Variable Elimination
from pgmpy.inference import VariableElimination

# Build the inference engine on top of the model.
student_infer = VariableElimination(student_model)

# Computing the probability of Letter given Intelligence.
# Here Intelligence is observed in state 0.
q0 = student_infer.query(variables=['Letter'], evidence={'Intelligence': 0})

# NOTE(review): indexing the result by variable name assumes query() returns a
# mapping from variable name to factor, as it does in the pgmpy version used
# here. Newer releases may return the factor directly; confirm before upgrading.
print(q0['Letter'])


+----------+---------------+
| Letter   |   phi(Letter) |
|----------+---------------|
| Letter_0 |        0.6114 |
| Letter_1 |        0.3886 |
+----------+---------------+


In [34]:
# Same query as before, but with Intelligence observed in state 1.
q1 = student_infer.query(variables=['Letter'], evidence={'Intelligence': 1})

print(q1['Letter'])


+----------+---------------+
| Letter   |   phi(Letter) |
|----------+---------------|
| Letter_0 |        0.2323 |
| Letter_1 |        0.7677 |
+----------+---------------+


In [35]:
# No evidence is given, so this is the distribution of Letter on its own.
q2 = student_infer.query(variables=['Letter'])
print(q2['Letter'])

+----------+---------------+
| Letter   |   phi(Letter) |
|----------+---------------|
| Letter_0 |        0.4977 |
| Letter_1 |        0.5023 |
+----------+---------------+


### Reasoning on the Network:

In [36]:
# Doing some simple queries on the network
# Is there an active trail between Difficulty and SAT when nothing is observed?
# The only path runs Difficulty -> Grade <- Intelligence -> SAT; both edges
# point into Grade, and the call returns False.
student_model.is_active_trail('Difficulty', 'SAT')

False

In [37]:
# Difficulty -> Grade -> Letter is a directed path with nothing observed on
# it; the call returns True.
student_model.is_active_trail('Difficulty', 'Letter')

True

In [38]:
# Getting local independencies in the network
# For SAT the result below reads: SAT is independent of Grade, Difficulty and
# Letter given Intelligence (SAT's only parent).
student_model.local_independencies('SAT')

(SAT _|_ Grade, Difficulty, Letter | Intelligence)

In [39]:
# For Grade the result below reads: Grade is independent of SAT given its
# parents, Difficulty and Intelligence.
student_model.local_independencies('Grade')

(Grade _|_ SAT | Difficulty, Intelligence)

In [40]:
# List the conditional independence statements pgmpy derives for the whole
# network; the output is long.
student_model.get_independencies()

(Grade _|_ SAT | Intelligence)
(Grade _|_ SAT | Difficulty, Intelligence)
(Grade _|_ SAT | Intelligence, Letter)
(Grade _|_ SAT | Difficulty, Letter, Intelligence)
(Difficulty _|_ Intelligence, SAT)
(Difficulty _|_ Letter | Grade)
(Difficulty _|_ SAT | Intelligence)
(Difficulty _|_ Intelligence | SAT)
(Difficulty _|_ Letter, SAT | Grade, Intelligence)
(Difficulty _|_ Letter | Grade, SAT)
(Difficulty _|_ SAT | Intelligence, Letter)
(Difficulty _|_ SAT | Grade, Intelligence, Letter)
(Difficulty _|_ Letter | Grade, Intelligence, SAT)
(SAT _|_ Difficulty)
(SAT _|_ Letter | Grade)
(SAT _|_ Grade, Difficulty, Letter | Intelligence)
(SAT _|_ Letter | Grade, Difficulty)
(SAT _|_ Difficulty, Letter | Grade, Intelligence)
(SAT _|_ Grade, Letter | Difficulty, Intelligence)
(SAT _|_ Grade, Difficulty | Intelligence, Letter)
(SAT _|_ Letter | Grade, Difficulty, Intelligence)
(SAT _|_ Difficulty | Grade, Intelligence, Letter)
(SAT _|_ Grade | Difficulty, Letter, Intelligence)
(Letter _|_ Difficulty,