## Processing HTML Files

In [10]:
# !conda install -y beautifulsoup4
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Construct soup from a string.
# The parser is named explicitly: when it is omitted, Beautiful Soup picks
# the "best" parser installed on the machine and emits a warning, so the
# same markup can yield different trees in different environments.
soup1 = BeautifulSoup("<HTML><HEAD>«headers»</HEAD>«body»</HTML>", "html.parser")
# Construct soup from a local file (the context manager closes the file)
# with open("myDoc.html") as html_file:
#     soup2 = BeautifulSoup(html_file, "html.parser")
# Construct soup from a web document
# Remember that urlopen() does not add "http://"!
# soup3 = BeautifulSoup(urlopen("http://www.networksciencelab.com/"), "html.parser")


In [12]:
# Download the page and collect a (link text, target URL) pair for every
# <a> tag that actually carries an href attribute.
with urlopen("http://www.networksciencelab.com/") as doc:
    # Name the parser explicitly so the scraped result does not depend on
    # which optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(doc, "html.parser")
# NOTE(review): link.string can be None when an anchor wraps more than one
# child node; confirm whether such links should be kept with a None label.
links = [(link.string, link["href"])
    for link in soup.find_all("a")
    if link.has_attr("href")]
links

[('Networks of Music Groups as Success Predictors',
  'http://www.slideshare.net/DmitryZinoviev/networks-of-music-groups-as-success-predictors'),
 ('Network Science Workshop',
  'http://www.slideshare.net/DmitryZinoviev/workshop-20212296'),
 ('Resilience in Transaction-Oriented Networks',
  'http://www.slideshare.net/DmitryZinoviev/resilience-in-transactional-networks'),
 ('Peer Ratings in Massive Online Social Networks',
  'http://www.slideshare.net/DmitryZinoviev/peer-ratings-in-massive-online-social-networks'),
 ('Semantic Networks of Interests in Online NSSI Communities',
  'http://www.slideshare.net/DmitryZinoviev/presentation-31680572'),
 ('Towards an Ideal Store',
  'http://www.slideshare.net/DmitryZinoviev/10-monthsymposiumbeta'),
 ('D.Zinoviev, "Analyzing Cultural Domains with Python,"',
  'https://media.pragprog.com/newsletters/2016-04-06.html'),
 ('D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury commu

# Processing Texts in Natural Languages

In [16]:
# !conda install -y nltk
import nltk
from nltk.corpus import wordnet as wn  # The WordNet corpus reader

# Fetch the WordNet data (see the [nltk_data] log lines in the output)
nltk.download('wordnet')

# Every synset whose lemmas include "cat"
wn.synsets("cat")

[nltk_data] Downloading package wordnet to /home/chris/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


[Synset('cat.n.01'),
 Synset('guy.n.01'),
 Synset('cat.n.03'),
 Synset('kat.n.01'),
 Synset('cat-o'-nine-tails.n.01'),
 Synset('caterpillar.n.02'),
 Synset('big_cat.n.01'),
 Synset('computerized_tomography.n.01'),
 Synset('cat.v.01'),
 Synset('vomit.v.01')]

In [19]:
# Show the definition text stored for the synset named "cat.n.01"
wn.synset("cat.n.01").definition()

'feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats'

In [20]:
 wn.synset("cat.n.02").definition()

'an informal term for a youth or man'

In [21]:
# List the hypernyms of "cat.n.01"; the recorded output is [feline.n.01]
wn.synset("cat.n.01").hypernyms()

[Synset('feline.n.01')]

In [22]:
# List the hyponyms of "cat.n.01"; the recorded output is
# [domestic_cat.n.01, wildcat.n.03]
wn.synset("cat.n.01").hyponyms()

[Synset('domestic_cat.n.01'), Synset('wildcat.n.03')]

In [24]:
# Path-based similarity score between two specific synsets.
# NOTE(review): the recorded score (0.04) is very low for two felines;
# confirm that "lynx.n.01" is the animal sense and not another meaning.
x, y = (wn.synset(sense_name) for sense_name in ("cat.n.01", "lynx.n.01"))
x.path_similarity(y)

0.04

In [26]:
# Among all (sense of "cat", sense of "dog") pairs, pick the pair with the
# highest path similarity and show the definitions of its two synsets.
#
# Each pair is scored exactly once: the inner generator builds
# (score, cat_sense, dog_sense) tuples and the outer one drops tuples whose
# score is falsy. max() compares the tuples themselves, so a tie on score
# falls back to comparing the synsets; [1:] then discards the score.
[synset.definition() for synset in max(
    scored_pair
    for scored_pair in (
        (cat_sense.path_similarity(dog_sense), cat_sense, dog_sense)
        for cat_sense in wn.synsets('cat')
        for dog_sense in wn.synsets('dog')
    )
    if scored_pair[0]  # Ensure the synsets are related at all
)[1:]]

['an informal term for a youth or man', 'informal term for a man']

In [28]:
# Reduce a word to its stem with NLTK's Porter stemmer
pstemmer = nltk.stem.PorterStemmer()
pstemmer.stem("wonderful")

'wonder'

In [29]:
# Same word through NLTK's Lancaster stemmer; the recorded output ('wond')
# is shorter than the Porter result for this input
lstemmer = nltk.stem.LancasterStemmer()
lstemmer.stem("wonderful")

'wond'