In [None]:
# v0.1 by Victor Schetinger
# Notebook to test a full pipeline (extraction, processing, visualization) for textual information
# Data mined from http://www.learntarot.com
# D3 js copied from http://bl.ocks.org/mbostock/1153292

In [80]:
# Importing all stuff
import requests
import html2text
import pprint
import re
import pandas as pd
from IPython.core.display import HTML, display
from string import Template
import json

In [81]:
# Defines the order of the cards
major_arcana_ordered = [u'THE FOOL',u'THE MAGICIAN',u'THE HIGH PRIESTESS',u'THE EMPRESS',u'THE EMPEROR',u'THE HIEROPHANT',
                        u'THE LOVERS',u'THE CHARIOT',u'STRENGTH',u'THE HERMIT',u'JUSTICE',u'WHEEL OF FORTUNE',u'THE HANGED MAN',
                        u'DEATH',u'TEMPERANCE',u'THE DEVIL',u'THE TOWER',u'THE STAR',u'THE MOON',u'THE SUN',
                        u'JUDGEMENT',u'THE WORLD']

# Fixes the difference in card names
def fixT(cardname):
    """Map a card name as written in the page links to its canonical upper-case name.

    The pages refer to cards without the article (e.g. 'Fool', 'High Priestess'),
    while `major_arcana_ordered` stores 'THE FOOL', 'THE HIGH PRIESTESS', ...

    Parameters
    ----------
    cardname : str
        Card name in any letter case, with or without the leading 'The'.

    Returns
    -------
    str
        The matching entry of `major_arcana_ordered` when there is one;
        otherwise the normalized upper-case name unchanged (minor arcana
        cards such as 'FIVE OF CUPS' no longer get a bogus 'THE ' prefix).
    """
    # Collapse runs of whitespace (including line breaks) so that a name
    # split over two lines still compares equal to the canonical spelling.
    name = u' '.join(cardname.upper().split())
    if name in major_arcana_ordered:
        return name
    with_article = u'THE ' + name
    if with_article in major_arcana_ordered:
        return with_article
    # Not a major arcana card: return the plain name instead of inventing
    # a 'THE ...' title that does not exist.
    return name

    

In [82]:
# Generates urls for content extraction
# One page per major arcana card: maj00.htm, maj01.htm, ..., maj21.htm
urls = ['http://www.learntarot.com/maj%02d.htm' % i for i in range(22)]

# Extracts html for major arcana
def _fetch_html(url, timeout=30):
    """Download one page and return its raw body.

    A timeout is passed so a stalled connection cannot block the notebook
    forever, and HTTP error statuses raise `requests.HTTPError` here instead
    of letting an error page reach the parsing cells.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content

htmls = [_fetch_html(url) for url in urls]

In [83]:
# Clears all tags
# Each downloaded page is converted by html2text, one output text per page, in the same order
texts = list(map(html2text.html2text, htmls))


In [84]:
# Tries every extraction pattern on a single page (index 6) before applying them to all cards
sample_text = texts[6]

# Extract card titles
title_match = re.search(r'\#\s([\w\s]*)\n\n', sample_text)
card = title_match.group(1)
print(card)

# Extracts card associations
association = re.findall(r'\*\s\*\*([\w\s]*)\*\*', sample_text)
print(association)

# Opposing cards: bracketed names between the "howopposite" link and the word REINFORCING
opposing_section = re.findall(r'\(howcard\.htm\#howopposite\).*REINFORCING', sample_text, re.DOTALL)[0]
opposing = re.findall(r'\[([\w\s]+)\]', opposing_section, re.DOTALL)
print(opposing)

# Reinforcing cards: bracketed names between the "howreinforce" link and the following "[DESCR" text
reinforcing_section = re.findall(r'\(howcard\.htm\#howreinforce\).*\[DESCR', sample_text, re.DOTALL)[0]
reinforcing = re.findall(r'\[([\w\s]+)\]', reinforcing_section, re.DOTALL)
print(reinforcing)

THE LOVERS
[u'RELATIONSHIP', u'SEXUALITY', u'PERSONAL BELIEFS', u'VALUES']
[u'Hierophant', u'Hermit', u'Five of Cups', u'Three of Swords']
[u'Empress', u'Two of Cups', u'Nine of Cups', u'Ten of Cups', u'Ten of Pentacles']


In [9]:
# Scratch space: useful regexps kept for reference (everything below is commented out, nothing runs)
#[cards_and_associations[card]['Number'] for card in cards_and_associations]
#[[text[1],re.findall(r'Some Possibilities\].*\[REIN', text[0],re.DOTALL)] for text in zip(texts,range(0,22)) ]
#[re.findall(r'\[([\w\s]+)\]', re.findall(r'Some Possibilities\].*\[REIN', text[0],re.DOTALL)[0],re.DOTALL) for text in zip(texts,range(0,22))]

In [85]:
#Creates the dictionary of cards and associations
def _parse_card_page(text, number):
    """Extract one card's data from the text version of its page.

    Parameters
    ----------
    text : str
        Page content as produced by html2text.
    number : int
        Position of the page in `texts`, stored as the card's 'Number'.

    Returns
    -------
    tuple
        `(title, info)` where `title` is the text captured after the '# '
        heading marker and `info` is a dict with the keys 'Number',
        'Associations', 'Opposing' and 'Reinforcing'.

    Raises
    ------
    ValueError
        If the title or one of the opposing / reinforcing sections cannot be
        located, instead of failing later with an opaque AttributeError or
        IndexError.
    """
    title_match = re.search(r'\#\s([\w\s]*)\n\n', text)
    # The section patterns span several lines, hence re.DOTALL
    opposing_match = re.search(r'\(howcard\.htm\#howopposite\).*REINFORCING', text, re.DOTALL)
    reinforcing_match = re.search(r'\(howcard\.htm\#howreinforce\).*\[DESCR', text, re.DOTALL)
    if title_match is None or opposing_match is None or reinforcing_match is None:
        raise ValueError('Page %d does not have the expected layout '
                         '(title / opposing / reinforcing sections)' % number)
    info = {'Number'       : number,
            'Associations' : re.findall(r'\*\s\*\*([\w\s]*)\*\*', text),
            # Card names are the bracketed link texts inside each section
            'Opposing'     : re.findall(r'\[([\w\s]+)\]', opposing_match.group(0), re.DOTALL),
            'Reinforcing'  : re.findall(r'\[([\w\s]+)\]', reinforcing_match.group(0), re.DOTALL)}
    return title_match.group(1), info

# enumerate() numbers the pages by position, so the count is not hard-coded
cards_and_associations = dict(_parse_card_page(text, number)
                              for number, text in enumerate(texts))
cards_and_associations

{u'DEATH': {'Associations': [u'ENDING',
   u'TRANSITION',
   u'ELIMINATION',
   u'INEXORABLE FORCES'],
  'Number': 13,
  'Opposing': [u'Fool', u'Empress', u'Judgement'],
  'Reinforcing': [u'Tower',
   u'Eight of Wands',
   u'Five of Cups',
   u'Eight of Cups']},
 u'JUDGEMENT': {'Associations': [u'JUDGMENT',
   u'REBIRTH',
   u'INNER CALLING',
   u'ABSOLUTION'],
  'Number': 20,
  'Opposing': [u'Death', u'Five of Cups', u'Nine of Swords'],
  'Reinforcing': [u'Fool', u'Justice', u'Seven of Pentacles']},
 u'JUSTICE': {'Associations': [u'JUSTICE',
   u'RESPONSIBILITY',
   u'DECISION',
   u'CAUSE AND EFFECT'],
  'Number': 11,
  'Opposing': [u'Two of Swords', u'Five of Swords', u'Seven of Swords'],
  'Reinforcing': [u'Emperor',
   u'Judgement',
   u'Ten of Wands',
   u'Nine of Swords',
   u'Seven of Pentacles']},
 u'STRENGTH': {'Associations': [u'STRENGTH',
   u'PATIENCE',
   u'COMPASSION',
   u'SOFT CONTROL'],
  'Number': 8,
  'Opposing': [u'Chariot',
   u'Eight of Cups',
   u'Six of Swords'

In [86]:
# Creates the Panda Container
# One row per card: the dictionary key becomes the 'Card' column and each
# field of the per-card dictionary becomes a column of its own.
card_names = list(cards_and_associations)
card_infos = [cards_and_associations[name] for name in card_names]
df = pd.DataFrame({'Number'      : [info['Number'] for info in card_infos],
                   'Card'        : card_names,
                   'Associations': [info['Associations'] for info in card_infos],
                   'Opposing'    : [info['Opposing'] for info in card_infos],
                   'Reinforcing' : [info['Reinforcing'] for info in card_infos]})

In [87]:
# Orders the rows by card number, renumbers the index from 0 and
# displays the columns in a fixed order
df = df.sort_values(by='Number').reset_index(drop=True)
cols = ['Number','Card','Associations','Opposing','Reinforcing']
df[cols]

Unnamed: 0,Number,Card,Associations,Opposing,Reinforcing
0,0,THE FOOL,"[BEGINNING, SPONTANEITY, FAITH, APPARENT FOLLY]","[Hierophant, Death, Devil, Two of Swords, Four...","[Hanged Man, Star, Judgement, Three of Wands]"
1,1,THE MAGICIAN,"[ACTION, CONSCIOUS AWARENESS, CONCENTRATION, P...","[High Priestess, Hanged Man, Seven of Cups, Fo...","[Chariot, Two of Wands, Eight of Wands, Eight ..."
2,2,THE HIGH PRIESTESS,"[NONACTION, UNCONSCIOUS AWARENESS, POTENTIAL, ...","[Magician, Two of Wands, Seven of Wands, Eight...","[Hermit, Hanged Man, Four of Swords]"
3,3,THE EMPRESS,"[MOTHERING, ABUNDANCE, SENSES, NATURE]","[Emperor, Death, Four of Pentacles, Nine of Pe...","[Lovers, Star, Nine of Cups, Seven of Pentacle..."
4,4,THE EMPEROR,"[FATHERING, STRUCTURE, AUTHORITY, REGULATION]","[Empress, Seven of Cups, Five of Swords]","[Hierophant, Justice, Two of Wands, Three of W..."
5,5,THE HIEROPHANT,"[EDUCATION, BELIEF SYSTEMS, CONFORMITY, GROUP ...","[Fool, Lovers, Two of Wands, Seven of Swords, ...","[Emperor, Three of Cups, Three of Pentacles, E..."
6,6,THE LOVERS,"[RELATIONSHIP, SEXUALITY, PERSONAL BELIEFS, VA...","[Hierophant, Hermit, Five of Cups, Three of Sw...","[Empress, Two of Cups, Nine of Cups, Ten of Cu..."
7,7,THE CHARIOT,"[VICTORY, WILL, HARD CONTROL]","[Strength, Hanged Man, Tower, Eight of Swords,...","[Magician, Two of Wands, Six of Wands, Four of..."
8,8,STRENGTH,"[STRENGTH, PATIENCE, COMPASSION, SOFT CONTROL]","[Chariot, Eight of Cups, Six of Swords, Five o...","[Hanged Man, Nine of Wands]"
9,9,THE HERMIT,"[INTROSPECTION, SEARCHING, GUIDANCE, SOLITUDE]","[Lovers, World, Two of Cups, Three of Cups, Ni...","[High Priestess, Four of Cups, Eight of Cups, ..."


In [26]:
# Template for the D3.js graph visualization
# - `$tarot_links` is the only placeholder; it is meant to be filled through
#   string.Template with a JavaScript array of {source, target, type} objects.
# - The script draws into the element with id "graph", which must already
#   exist in the page when this HTML is rendered.
# - D3 is loaded over https so the browser does not block it as mixed content
#   when the notebook itself is served over https.
html_template = '''
<!DOCTYPE html>
<meta charset="utf-8">
<style>

.link {
  fill: none;
  stroke: #666;
  stroke-width: 1.5px;
}

#licensing {
  fill: green;
}

.link.licensing {
  stroke: green;
}

.link.resolved {
  stroke-dasharray: 0,2 1;
}

circle {
  fill: #ccc;
  stroke: #333;
  stroke-width: 1.5px;
}

text {
  font: 10px sans-serif;
  pointer-events: none;
  text-shadow: 0 1px 0 #fff, 1px 0 0 #fff, 0 -1px 0 #fff, -1px 0 0 #fff;
}

</style>
<body>
<script src="https://d3js.org/d3.v3.min.js" charset="utf-8"></script>
<script>

var links =  $tarot_links ;

var nodes = {};

// Compute the distinct nodes from the links.
links.forEach(function(link) {
  link.source = nodes[link.source] || (nodes[link.source] = {name: link.source});
  link.target = nodes[link.target] || (nodes[link.target] = {name: link.target});
});

var width = 960,
    height = 500;

var force = d3.layout.force()
    .nodes(d3.values(nodes))
    .links(links)
    .size([width, height])
    .linkDistance(60)
    .charge(-300)
    .on("tick", tick)
    .start();

var svg = d3.select("#graph").append("svg")
    .attr("width", width)
    .attr("height", height);

// Per-type markers, as they don't inherit styles.
svg.append("defs").selectAll("marker")
    .data(["suit", "licensing", "resolved"])
  .enter().append("marker")
    .attr("id", function(d) { return d; })
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 15)
    .attr("refY", -1.5)
    .attr("markerWidth", 6)
    .attr("markerHeight", 6)
    .attr("orient", "auto")
  .append("path")
    .attr("d", "M0,-5L10,0L0,5");

var path = svg.append("g").selectAll("path")
    .data(force.links())
  .enter().append("path")
    .attr("class", function(d) { return "link " + d.type; })
    .attr("marker-end", function(d) { return "url(#" + d.type + ")"; });

var circle = svg.append("g").selectAll("circle")
    .data(force.nodes())
  .enter().append("circle")
    .attr("r", 6)
    .call(force.drag);

var text = svg.append("g").selectAll("text")
    .data(force.nodes())
  .enter().append("text")
    .attr("x", 8)
    .attr("y", ".31em")
    .text(function(d) { return d.name; });

// Use elliptical arc path segments to doubly-encode directionality.
function tick() {
  path.attr("d", linkArc);
  circle.attr("transform", transform);
  text.attr("transform", transform);
}

function linkArc(d) {
  var dx = d.target.x - d.source.x,
      dy = d.target.y - d.source.y,
      dr = Math.sqrt(dx * dx + dy * dy);
  return "M" + d.source.x + "," + d.source.y + "A" + dr + "," + dr + " 0 0,1 " + d.target.x + "," + d.target.y;
}

function transform(d) {
  return "translate(" + d.x + "," + d.y + ")";
}

</script>
'''

In [78]:
# Builds the data for card links
# The 'type' values reuse the CSS class / marker names of the D3 template:
#   'licensing' for opposing cards, 'suit' for reinforcing cards,
#   'resolved' for consecutive cards in deck order.
links_opposing = [
    {'source': card, 'target': fixT(target), 'type': 'licensing'}
    for card in cards_and_associations
    for target in cards_and_associations[card]['Opposing']
]
links_reinforcing = [
    {'source': card, 'target': fixT(target), 'type': 'suit'}
    for card in cards_and_associations
    for target in cards_and_associations[card]['Reinforcing']
]

# Chains every major arcana card to the one that follows it
links_order = [
    {'source': current, 'target': following, 'type': 'resolved'}
    for current, following in zip(major_arcana_ordered, major_arcana_ordered[1:])
]

# Only links whose target is a major arcana card are kept
links = [link for link in links_opposing + links_reinforcing + links_order
         if link['target'] in major_arcana_ordered]

In [88]:
# Injects data into the original template
# The links are serialized with json.dumps and substituted for $tarot_links
js_text_template = Template(html_template)
filled_template = js_text_template.substitute(tarot_links=json.dumps(links))
print(filled_template)


<!DOCTYPE html>
<meta charset="utf-8">
<style>

.link {
  fill: none;
  stroke: #666;
  stroke-width: 1.5px;
}

#licensing {
  fill: green;
}

.link.licensing {
  stroke: green;
}

.link.resolved {
  stroke-dasharray: 0,2 1;
}

circle {
  fill: #ccc;
  stroke: #333;
  stroke-width: 1.5px;
}

text {
  font: 10px sans-serif;
  pointer-events: none;
  text-shadow: 0 1px 0 #fff, 1px 0 0 #fff, 0 -1px 0 #fff, -1px 0 0 #fff;
}

</style>
<body>
<script src="http://d3js.org/d3.v3.min.js" charset="utf-8"></script>
<script>

var links =  [{"source": "DEATH", "type": "licensing", "target": "THE FOOL"}, {"source": "DEATH", "type": "licensing", "target": "THE EMPRESS"}, {"source": "DEATH", "type": "licensing", "target": "JUDGEMENT"}, {"source": "THE EMPRESS", "type": "licensing", "target": "THE EMPEROR"}, {"source": "THE EMPRESS", "type": "licensing", "target": "DEATH"}, {"source": "STRENGTH", "type": "licensing", "target": "THE CHARIOT"}, {"source": "THE LOVERS", "type": "licensing", "target": "TH

In [89]:
# Creates the element that will receive the graph
# (the D3 script looks it up through the selector "#graph")
HTML(data='<div id="graph"></div>')

In [90]:
# Runs d3 js visualization
# Rendering the filled template as HTML lets the browser execute its <script>
HTML(data=filled_template)