Anaconda supplies several excellent modules
 - BeautifulSoup,
 - csv,
 - json, 
 - and nltk
 
 
### HTML

 - Tag Attributes Purpose
 - HTML Whole HTML document
 - HEAD Document header
 - TITLE Document title
 - BODY background, bgcolor Document body
 - H1, H2, H3, etc. Section headers
 - I, EM Emphasis
 - B, STRONG Strong emphasis
 - PRE Preformatted text
 - P, SPAN, DIV Paragraph, span, division
 - BR Line break
 - A href Hyperlink
 - IMG src, width, height Image
 - TABLE width, border Table
 - TR Table row
 - TH, TD Table header/data cell
 - OL, UL Numbered/itemized list
 - LI List item
 - DL Description list
 - DT, DD Description topic, definition
 - INPUT name User input field
 - SELECT name Pull-down menu
 
### XML
Any alphanumeric string can be a tag

In [6]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Construct soup from a string
soup1 = BeautifulSoup("<HTML><HEAD>«headers»</HEAD>«body»</HTML>", "lxml")
soup1.text

'«headers»«body»'

In [7]:
# Construct soup from a local file
    # soup2 = BeautifulSoup(open("myDoc.html"))


In [14]:
# Construct soup from a web document
# Remember that urlopen() does not add "http://"!
soup3 = BeautifulSoup(urlopen("https://uzay00.github.io"))
text = soup3.get_text()
text



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


'\n\nDr. Uzay Çetin\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDr. Uzay Çetin\n\nTwitter\nFacebook\n\n\n\n\n\n\n\r\n\t\t\t\t\t\t\t\t\t\t\tMerhabalar,\r\n\t\t\t\t\t\t\t\t\t\t\thoş geldiniz.\n\n\n\n\n\n\nKarmaşıklık ve Veri Bilimi \r\n\t\t\t\t\t\t\t\t\t\tKarmaşık Sistemler ve Veri Bilimi,\r\n\t\t\t\t\t\t\t\t\t\talanına ilgi duyuyor ve daha fazlasını öğrenmek istiyorsanız tıklayın.\r\n\t\t\t\t\t\t\t\t\t\t\n\n\r\n\t\t\t\t\t\t\t\t\t\t\tKa|Ve\n\n\n\n\r\n\t\t\t\t\t\t\t\t\t\t26 Mayıs 2018 Cumartesi günü \r\n\t\t\t\t\t\t\t\t\t\tKarmaşık Sistemler ve Veri Bilimi çalıştayı,\r\n\t\t\t\t\t\t\t\t\t\tİstanbul Bilgi Üniversitesi, \r\n\t\t\t\t\t\t\t\t\t\tSantral Kampüsünde E1-301 nolu amfide\r\n\t\t\t\t\t\t\t\t\t\tgerçekleşecektir. Bilgi ve kayıt için tıklayın.\r\n\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t\t\n\n\r\n\t\t\t\t\t\t\t\t\t\t\tProgram\n\n\n\n\n\n\nSuam habet fortuna rationem (Rastlantının da, kendine has bir mantığı vardır) - Petronius, Gaius \n\n\n\n\n\n\n\n\n\n\n\n'

In [17]:
import re
words = re.findall(r"\w+", text)
" ".join(words)

'Dr Uzay Çetin Dr Uzay Çetin Twitter Facebook Merhabalar hoş geldiniz Karmaşıklık ve Veri Bilimi Karmaşık Sistemler ve Veri Bilimi alanına ilgi duyuyor ve daha fazlasını öğrenmek istiyorsanız tıklayın Ka Ve 26 Mayıs 2018 Cumartesi günü Karmaşık Sistemler ve Veri Bilimi çalıştayı İstanbul Bilgi Üniversitesi Santral Kampüsünde E1 301 nolu amfide gerçekleşecektir Bilgi ve kayıt için tıklayın Program Suam habet fortuna rationem Rastlantının da kendine has bir mantığı vardır Petronius Gaius'

In [22]:
with urlopen("http://www.networksciencelab.com/") as doc:
    soup = BeautifulSoup(doc, "lxml")
links = [(link.string, link["href"]) for link in soup.find_all("a") if link.has_attr("href")]
links[:4]

[('Networks of Music Groups as Success Predictors',
  'http://www.slideshare.net/DmitryZinoviev/networks-of-music-groups-as-success-predictors'),
 ('Network Science Workshop',
  'http://www.slideshare.net/DmitryZinoviev/workshop-20212296'),
 ('Resilience in Transaction-Oriented Networks',
  'http://www.slideshare.net/DmitryZinoviev/resilience-in-transactional-networks'),
 ('Peer Ratings in Massive Online Social Networks',
  'http://www.slideshare.net/DmitryZinoviev/peer-ratings-in-massive-online-social-networks')]

#### CSV

> For potentially large files, don’t  read all records at once but use incremental, iterative, row-by-row processing

In [24]:
try:
    with open("somefile.csv", newline='') as infile:
        reader = csv.reader(infile, delimiter=',', quotechar='"')
except:
    pass

#### JSON

JSON is language-independent data interchange format.

![](json.png)

In [30]:
import json
obje = ["uzay", "Ali", "Ayse"]

# Save an object to a file
with open("data.json", "w") as out_json:
    json.dump(obje, out_json, indent=None, sort_keys=False)

In [32]:
# Load an object from a file
with open("data.json") as in_json:
    object1 = json.load(in_json)
    
object1

['uzay', 'Ali', 'Ayse']

### Processing Texts in Natural Languages

In [39]:
from nltk.tokenize import WordPunctTokenizer
word_punct = WordPunctTokenizer()
text = "}Help! :))) :[ ..... :D{"
word_punct.tokenize(text)

['}', 'Help', '!', ':)))', ':[', '.....', ':', 'D', '{']

In [37]:
import nltk
lemmatizer = nltk.WordNetLemmatizer()
lemmatizer.lemmatize("cats")

'cat'

In [38]:
nltk.pos_tag(["beautiful", "world"])
# An adjective and a noun

[('beautiful', 'JJ'), ('world', 'NN')]

In [63]:
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk import LancasterStemmer
import re


# Create a new stemmer
ls = nltk.LancasterStemmer()

# Read the file and cook a soup
with urlopen("https://mathinsight.org/") as infile:
    soup = BeautifulSoup(infile, "lxml")

words = re.findall(r"\w+", soup.text)
text = " ".join(words)
text

# Extract and tokenize the text
words = nltk.word_tokenize(text)

# Convert to lowercase
words = [w.lower() for w in words]

# Eliminate stop words and stem the rest of the words
words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalnum()]

# Tally the words
freqs = Counter(words)

print(freqs.most_common())

[('docu', 14), ('funct', 14), ('pag', 12), ('shid', 9), ('display', 8), ('styl', 8), ('getelementbyid', 8), ('math', 6), ('ga', 6), ('applet', 6), ('introduc', 6), ('vect', 6), ('jquery', 6), ('el', 6), ('insight', 5), ('0', 5), ('top', 5), ('illust', 5), ('bas', 5), ('rect', 5), ('var', 4), ('highlight', 4), ('graph', 4), ('field', 4), ('conceiv', 4), ('linear', 4), ('non', 4), ('window', 4), ('tru', 3), ('com', 3), ('welcom', 3), ('vary', 3), ('interact', 3), ('two', 3), ('plot', 3), ('us', 3), ('solv', 3), ('dimend', 3), ('system', 3), ('mathem', 3), ('unav', 3), ('return', 3), ('js', 2), ('push', 2), ('script', 2), ('https', 2), ('http', 2), ('skip', 2), ('navig', 2), ('press', 2), ('ent', 2), ('threads', 2), ('index', 2), ('rec', 2), ('paramet', 2), ('curv', 2), ('map', 2), ('ide', 2), ('curl', 2), ('diff', 2), ('intuit', 2), ('green', 2), ('differenty', 2), ('discuss', 2), ('plan', 2), ('lin', 2), ('on', 2), ('rad', 2), ('jan', 2), ('loop', 2), ('transform', 2), ('revers', 2), ('