In [48]:
from pgmpy.factors.discrete import TabularCPD
from pgmpy.models import BayesianModel

# Setting up your model

### First, set the structure

In [49]:
# Each tuple is a directed edge (parent, child): Genetics and Practice are the
# two parents of OlympicTrials, and OlympicTrials is the only parent of Offer.
olympic_model = BayesianModel(
    [
        ("Genetics", "OlympicTrials"),
        ("Practice", "OlympicTrials"),
        ("OlympicTrials", "Offer"),
    ]
)

In [50]:
# Peek at the model's instance attributes (adjacency maps, the still-empty
# CPD list, ...). vars(obj) returns the same mapping as obj.__dict__.
vars(olympic_model)

{'_adj': {'Genetics': {'OlympicTrials': {'weight': None}},
  'Offer': {},
  'OlympicTrials': {'Offer': {'weight': None}},
  'Practice': {'OlympicTrials': {'weight': None}}},
 '_node': {'Genetics': {}, 'Offer': {}, 'OlympicTrials': {}, 'Practice': {}},
 '_pred': {'Genetics': {},
  'Offer': {'OlympicTrials': {'weight': None}},
  'OlympicTrials': {'Genetics': {'weight': None},
   'Practice': {'weight': None}},
  'Practice': {}},
 '_succ': {'Genetics': {'OlympicTrials': {'weight': None}},
  'Offer': {},
  'OlympicTrials': {'Offer': {'weight': None}},
  'Practice': {'OlympicTrials': {'weight': None}}},
 'adjlist_inner_dict_factory': dict,
 'adjlist_outer_dict_factory': dict,
 'cardinalities': defaultdict(int, {}),
 'cpds': [],
 'edge_attr_dict_factory': dict,
 'graph': {},
 'node_dict_factory': dict,
 'nodes': NodeView(('Genetics', 'OlympicTrials', 'Practice', 'Offer')),
 'root_graph': <pgmpy.models.BayesianModel.BayesianModel at 0x118d2fda0>}

### Then set up the relationships (the CPDs)

In [51]:
# Prior distribution over Genetics (a root node, so there is no evidence).
# The table is written with one row per state and a single column, i.e. shape
# (variable_card, 1), which is the same rows-are-states layout used by
# offer_cpd and olympic_trials_cpd and the shape pgmpy documents for `values`.
genetics_cpd = TabularCPD(
    variable="Genetics",
    variable_card=2,
    values=[
        [0.2],  # Genetics_0 -- "good genes" according to the notebook's later comment
        [0.8],  # Genetics_1
    ],
)

In [52]:
# Prior distribution over Practice (a root node, so there is no evidence).
# The table is written with one row per state and a single column, i.e. shape
# (variable_card, 1), which is the same rows-are-states layout used by
# offer_cpd and olympic_trials_cpd and the shape pgmpy documents for `values`.
practice_cpd = TabularCPD(
    variable="Practice",
    variable_card=2,
    values=[
        [0.7],  # Practice_0 -- later cells treat state 0 as "did practice"
        [0.3],  # Practice_1 -- later cells treat state 1 as "no practice"
    ],
)

In [53]:
# P(Offer | OlympicTrials): two Offer states (rows) for each of the three
# OlympicTrials states (columns). Every column sums to 1.
offer_cpd = TabularCPD(
    variable="Offer",
    variable_card=2,
    evidence=["OlympicTrials"],
    evidence_card=[3],
    values=[
        # Trials_0  Trials_1  Trials_2
        [0.95, 0.8, 0.5],  # Offer_0
        [0.05, 0.2, 0.5],  # Offer_1
    ],
)

In [54]:
# P(OlympicTrials | Genetics, Practice): three trial outcomes (rows) for each
# of the 2 x 2 parent combinations (columns). Columns are ordered with the
# last evidence variable (Practice) varying fastest; the conditional query
# outputs later in the notebook are consistent with this ordering.
olympic_trials_cpd = TabularCPD(
    variable="OlympicTrials",
    variable_card=3,
    evidence=["Genetics", "Practice"],
    evidence_card=[2, 2],
    values=[
        # (G0,P0) (G0,P1) (G1,P0) (G1,P1)
        [0.5, 0.8, 0.8, 0.9],    # OlympicTrials_0
        [0.3, 0.15, 0.1, 0.08],  # OlympicTrials_1
        [0.2, 0.05, 0.1, 0.02],  # OlympicTrials_2
    ],
)

### Add the relationships to your model

In [56]:
# Attach one CPD per node to the model.
olympic_model.add_cpds(genetics_cpd, practice_cpd, offer_cpd, olympic_trials_cpd)

# Fail fast if a CPD is missing, does not match the graph's parents, or has
# columns that do not sum to 1 -- otherwise such mistakes only surface later
# as confusing inference results.
assert olympic_model.check_model(), "olympic_model failed pgmpy's consistency check"



### Examine the structure of your graph

In [57]:
# List every CPD currently attached to the model (four after the add_cpds call).
olympic_model.get_cpds()

[<TabularCPD representing P(Genetics:2) at 0x118d2f588>,
 <TabularCPD representing P(Practice:2) at 0x118d2f7f0>,
 <TabularCPD representing P(Offer:2 | OlympicTrials:3) at 0x118d46668>,
 <TabularCPD representing P(OlympicTrials:3 | Genetics:2, Practice:2) at 0x118d464a8>]

### Find active trail nodes

In [58]:
# Nodes reachable from Genetics through an active trail when nothing is
# observed. Practice is absent from the result shown below.
olympic_model.active_trail_nodes("Genetics")

{'Genetics': {'Genetics', 'Offer', 'OlympicTrials'}}

In [59]:
# Nodes reachable from OlympicTrials through an active trail when nothing is
# observed: both of its parents and its child.
olympic_model.active_trail_nodes("OlympicTrials")

{'OlympicTrials': {'Genetics', 'Offer', 'OlympicTrials', 'Practice'}}

In [60]:
# Same question, but with Genetics observed: the observed node drops out of
# the reachable set.
olympic_model.active_trail_nodes("OlympicTrials", observed="Genetics")

{'OlympicTrials': {'Offer', 'OlympicTrials', 'Practice'}}

### Active trail nodes change depending on what you know

In [61]:
# Baseline for the comparison in the next cell: active trails from Genetics
# with no observations.
olympic_model.active_trail_nodes("Genetics")

{'Genetics': {'Genetics', 'Offer', 'OlympicTrials'}}

In [62]:
# Observing Offer (a child of OlympicTrials, whose two parents are Genetics
# and Practice) makes Practice reachable from Genetics -- notice that
# Practice now appears in the result.
olympic_model.active_trail_nodes("Genetics", observed="Offer")

{'Genetics': {'Genetics', 'OlympicTrials', 'Practice'}}

### Get all independencies

In [63]:
# Enumerate the (conditional) independence statements pgmpy reports for this graph.
olympic_model.get_independencies()

(Genetics _|_ Practice)
(Genetics _|_ Offer | OlympicTrials)
(Genetics _|_ Offer | OlympicTrials, Practice)
(Practice _|_ Genetics)
(Practice _|_ Offer | OlympicTrials)
(Practice _|_ Offer | OlympicTrials, Genetics)
(Offer _|_ Practice, Genetics | OlympicTrials)
(Offer _|_ Genetics | OlympicTrials, Practice)
(Offer _|_ Practice | OlympicTrials, Genetics)

# Making inferences

### We can get probability distributions that are not explicitly spelled out in our graph,
### in particular marginal distributions

In [64]:
from pgmpy.inference import VariableElimination

In [65]:
# Inference engine over the fully specified model; every query below goes through it.
olympic_infer = VariableElimination(olympic_model)

In [66]:
# Marginal distribution of Offer with no evidence. The query result is
# indexed by variable name to get the table that is printed.
prob_offer = olympic_infer.query(variables=["Offer"])
print(prob_offer["Offer"])

╒═════════╤══════════════╕
│ Offer   │   phi(Offer) │
╞═════════╪══════════════╡
│ Offer_0 │       0.8898 │
├─────────┼──────────────┤
│ Offer_1 │       0.1102 │
╘═════════╧══════════════╛


In [67]:
# Query two variables at once with no evidence. Nothing is conditioned on
# here, so the result is named for what it holds (the marginals of Offer and
# Genetics) rather than suggesting a "good genes" scenario.
prob_offer_and_genetics = olympic_infer.query(variables=["Offer", "Genetics"])
print(prob_offer_and_genetics["Genetics"])
print(prob_offer_and_genetics["Offer"])

╒════════════╤═════════════════╕
│ Genetics   │   phi(Genetics) │
╞════════════╪═════════════════╡
│ Genetics_0 │          0.2000 │
├────────────┼─────────────────┤
│ Genetics_1 │          0.8000 │
╘════════════╧═════════════════╛
╒═════════╤══════════════╕
│ Offer   │   phi(Offer) │
╞═════════╪══════════════╡
│ Offer_0 │       0.8898 │
├─────────┼──────────────┤
│ Offer_1 │       0.1102 │
╘═════════╧══════════════╛


In [68]:
# Query two variables at once with no evidence. Nothing is conditioned on
# here, so the result is named for what it holds (the marginals of Offer and
# OlympicTrials) rather than suggesting a "good genes" scenario.
prob_offer_and_trials = olympic_infer.query(variables=["Offer", "OlympicTrials"])
print(prob_offer_and_trials["OlympicTrials"])
print(prob_offer_and_trials["Offer"])

╒═════════════════╤══════════════════════╕
│ OlympicTrials   │   phi(OlympicTrials) │
╞═════════════════╪══════════════════════╡
│ OlympicTrials_0 │               0.7820 │
├─────────────────┼──────────────────────┤
│ OlympicTrials_1 │               0.1262 │
├─────────────────┼──────────────────────┤
│ OlympicTrials_2 │               0.0918 │
╘═════════════════╧══════════════════════╛
╒═════════╤══════════════╕
│ Offer   │   phi(Offer) │
╞═════════╪══════════════╡
│ Offer_0 │       0.8898 │
├─────────┼──────────────┤
│ Offer_1 │       0.1102 │
╘═════════╧══════════════╛


### We can also get conditional probability distributions that take into account what we already know

In [69]:
# P(Offer | Genetics = 1). State 1 is the "bad genes" state in this notebook
# (state 0 is "good genes").
bad_genes_evidence = {"Genetics": 1}
prob_offer_bad_genes = olympic_infer.query(
    variables=["Offer"],
    evidence=bad_genes_evidence,
)
print(prob_offer_bad_genes["Offer"])

╒═════════╤══════════════╕
│ Offer   │   phi(Offer) │
╞═════════╪══════════════╡
│ Offer_0 │       0.9017 │
├─────────┼──────────────┤
│ Offer_1 │       0.0983 │
╘═════════╧══════════════╛


In [70]:
# P(Offer | Genetics = 0), i.e. conditioned on the "good genes" state.
good_genes_evidence = {"Genetics": 0}
prob_offer_good_genes = olympic_infer.query(
    variables=["Offer"],
    evidence=good_genes_evidence,
)
print(prob_offer_good_genes["Offer"])

╒═════════╤══════════════╕
│ Offer   │   phi(Offer) │
╞═════════╪══════════════╡
│ Offer_0 │       0.8420 │
├─────────┼──────────────┤
│ Offer_1 │       0.1580 │
╘═════════╧══════════════╛


In [71]:
# P(Offer | Genetics = 0, Practice = 0): good genes and (per the variable
# name) an athlete who did practice.
good_genes_and_practice = {"Genetics": 0, "Practice": 0}
prob_offer_good_genes_did_practice = olympic_infer.query(
    variables=["Offer"],
    evidence=good_genes_and_practice,
)
print(prob_offer_good_genes_did_practice["Offer"])

╒═════════╤══════════════╕
│ Offer   │   phi(Offer) │
╞═════════╪══════════════╡
│ Offer_0 │       0.8150 │
├─────────┼──────────────┤
│ Offer_1 │       0.1850 │
╘═════════╧══════════════╛


### You can also reason upstream. For example, evidence about performance at the Olympic Trials provides information about the distribution of Genetics

In [72]:
# Reason from effect back to cause: P(Genetics | OlympicTrials = 2).
# Reminder: Genetics state 0 means good genes.
amazing_trials_evidence = {"OlympicTrials": 2}
prob_good_genes_if_amazing_olympic_trials = olympic_infer.query(
    variables=["Genetics"],
    evidence=amazing_trials_evidence,
)
print(prob_good_genes_if_amazing_olympic_trials["Genetics"])

╒════════════╤═════════════════╕
│ Genetics   │   phi(Genetics) │
╞════════════╪═════════════════╡
│ Genetics_0 │          0.3377 │
├────────────┼─────────────────┤
│ Genetics_1 │          0.6623 │
╘════════════╧═════════════════╛


### Some variables are only informative about other variables given 'third' variables

In [73]:
# On its own, Practice carries no information about Genetics:
# P(Genetics | Practice = 1) comes out identical to the baseline P(Genetics).
no_practice_evidence = {"Practice": 1}
prob_good_genes_if_no_practice = olympic_infer.query(
    variables=["Genetics"],
    evidence=no_practice_evidence,
)
print(prob_good_genes_if_no_practice["Genetics"])

╒════════════╤═════════════════╕
│ Genetics   │   phi(Genetics) │
╞════════════╪═════════════════╡
│ Genetics_0 │          0.2000 │
├────────────┼─────────────────┤
│ Genetics_1 │          0.8000 │
╘════════════╧═════════════════╛


In [74]:
# Once the Olympic Trials result is also known, Practice DOES become
# informative about Genetics: P(Genetics | Practice = 1, OlympicTrials = 2)
# differs from the baseline P(Genetics).
no_practice_good_trials = {"Practice": 1, "OlympicTrials": 2}
prob_good_genes_if_no_practice_good_olympic_trials = olympic_infer.query(
    variables=["Genetics"],
    evidence=no_practice_good_trials,
)
print(prob_good_genes_if_no_practice_good_olympic_trials["Genetics"])

╒════════════╤═════════════════╕
│ Genetics   │   phi(Genetics) │
╞════════════╪═════════════════╡
│ Genetics_0 │          0.3846 │
├────────────┼─────────────────┤
│ Genetics_1 │          0.6154 │
╘════════════╧═════════════════╛


### Easily find the most probable state for a variable
### Note that this is just a by-product of calculating the marginal probability distribution

In [75]:
# Most probable state of Genetics with no evidence.
olympic_infer.map_query(variables=["Genetics"])

{'Genetics': 1}

In [76]:
# Most probable state of Offer with no evidence.
olympic_infer.map_query(variables=["Offer"])

{'Offer': 0}

In [77]:
# Most probable state of OlympicTrials with no evidence.
olympic_infer.map_query(variables=["OlympicTrials"])

{'OlympicTrials': 0}