In [None]:
from IPython.core.display import HTML
from bs4 import BeautifulSoup, UnicodeDammit
import pandas as pd
import StringIO

# Objectives:
* Learn how the internet works so we can make it do what we want.
 * www vs. internet
 * GET, PUT, POST, DELETE
 * javascript, APIs.
* Understand basic approach for
 * Simple web scraping
 * Session-based web scraping
 * Crawling?
 * Parsing documents.
* Know that there's a lot more to know, specifically related to
 * Anonymity
 * Selenium

## http vs internet
 * The internet is tubes. 

 * HTTP is a protocol for transferring documents (especially hypertext) across the internet.
 * There are other protocols: like git, smtp, ftp etc.

## HTTP can't actually do much.
 * Stateless
 * Dumb framework
 
### Can do
 * GET - think of this as "read"
 * PUT - think of this like an "edit"
 * POST - think of this as "create"
 * DELETE - be careful
 
Everything you do on the web is one of these things.

### Morning:
* *Describe* a typical web scraping data pipeline
* *Compare and Contrast* SQL and noSQL
* *Perform* basic operations using Mongo
* *Explain* the basic concepts of HTML

## 2. Installing Mongo and PyMongo

### Mongo
1. Install MongoDB: `brew install mongodb`
2. Start MongoDB: `brew services start mongodb`

#### Do *not* run services as `root`.

### PyMongo
1. Install PyMongo: `conda install pymongo`

## 3. Typical Pipeline
<img src="images/pipeline.png" width = 500>

## 4. SQL vs NoSQL

* Contrary to what some folks may want, NoSQL does not stand for 'No SQL'.
* Different Paradigm to deal with messy data that does not lend itself to an RDBMS
* A NoSQL stack may include a RDBMS component, Redis to handle queuing and Hadoop for Big Data processing
* NoSQL ==> "Not Only SQL"

## 5. MongoDB Concepts

* MongoDB is a document-oriented database, an alternative to RDBMS
* Used for storing semi-structured data
* JSON-like objects form the data model, rather than RDBMS tables
* No schema, No joins, No transactions
* Sub-optimal for complicated queries

* MongoDB is made up of databases which contain collections (tables)
* A collection is made up of documents (analogous to rows or records)
* Each document is made up of key-value pairs (analogous to columns)

* RDBMS defines columns at the table level, document oriented database defines its fields at a document level.

* CURSOR: When you ask MongoDB for data, it returns a pointer to the result set called a cursor.

* Actual execution is delayed until necessary.



### Mongo Clients
<img src="images/client-server.png" width = 500>


## 6.  Create a Database and do some operations

* Mongo can create databases, collections, documents, etc. on the fly. 
* To create a new database, simply switch to a database that doesn't exist yet: `use my_new_database`

## Inserting Data
```
db.users.insert({name: 'Jon',
                 age: '45',
                 friends: ['Henry', 'Ashley']
                 })

show dbs
db.getCollectionNames()

db.users.insert({name: 'Ashley',
                 age: '37',
                 friends: ['Jon', 'Henry']
                 })
                 
db.users.insert({name: 'Frank',
                 age: '17',
                 friends: ['Billy'],
                 car: 'Civic'})

db.users.find()
```
* Mongo creates the _id field by default

## Querying Data
```
// find by single field
db.users.find({ name: 'Jon'})

// find by presence of field
db.users.find({ car: { $exists : true } })

// find by value in array
db.users.find({ friends: 'Henry' })

// field selection (only return name)
db.users.find({}, { name: true })
```


## Updating data 
```
// replaces friends array
db.users.update({name: "Jon"}, { $set: {friends: ["Phil"]}})

// adds to friends array
db.users.update({name: "Jon"}, { $push: {friends: "Susie"}})

// upsert
db.users.update({name: "Stevie"}, { $push: {friends: "Nicks"}}, true)

// multiple updates
db.users.update({}, { $set: { activated : false } }, false, true)
```

## Deleting Data
```
db.users.remove({})
```

# MongoDB Example
## PyMongo


In [None]:
# import MongoDB modules
from pymongo import MongoClient

In [None]:
# Connect to the MongoDB server listening on localhost, port 27017
client = MongoClient('mongodb://localhost:27017/')

In [None]:
# Select the database named `lb_tst1` on that server
db = client.lb_tst1

In [None]:
# Bind `users` to the collection named `lb_tst1` (same name as the database).
# NOTE(review): despite the variable name, the collection is not called "users" - confirm intent.
users = db.lb_tst1

In [None]:
# Add one document to the collection
users.insert_one({'name':'lekha', 'city':'seattle'})

In [None]:
# Add a second document to the same collection
users.insert_one({'name':'joe', 'city':'new york' })

In [None]:
# Count every document in the collection. Cursor.count() was deprecated in
# PyMongo 3.7 and removed in 4.0; count_documents({}) is the replacement.
users.count_documents({})

In [None]:
# Fetch a single document from the collection (no filter given)
users.find_one()

In [None]:
# Fetch one document whose `name` field equals 'lekha', then display it
t = users.find_one({'name': 'lekha'})
t

In [10]:
# Count every document in the collection. Cursor.count() was deprecated in
# PyMongo 3.7 and removed in 4.0; count_documents({}) is the replacement.
# This cell needs `users` to be bound by an earlier cell in the same kernel.
users.count_documents({})

NameError: name 'users' is not defined

# 7. HTML Concepts
* HyperText Markup Language
* A markup language that forms the building blocks of all websites
* Consists of tags enclosed in angle brackets (like `<html>`)

### Important Tags

```html
<div>Defines a division or section</div>
<a href="http://www.w3schools.com">Link to W3Schools.com!</a>
<table>Will contain a table</table>
<p>This is a paragraph</p>
<h1>This is a header!</h1>
<ul>
    <li>This is a list</li>
</ul>
```

# 8. CSS
(Cascading Style Sheets)
* Enable the separation of document content from document presentation
* Controls aspects such as the layout, colors, and fonts.
* "Cascading" refers to how conflicting rules are resolved: when several rules apply to the same element, the most specific one wins


## CSS Syntax

* A CSS rule-set consists of a selector and a declaration block:
* Example:
```
p {
    color: red;
    text-align: center;
}
```

* Learn more about CSS Syntax here: http://www.w3schools.com/css/css_syntax.asp

# Scraping a webpage.

 * Use [Requests](http://docs.python-requests.org/en/latest/).
 * Use [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/)
 * Or get fancy and try [lxml](http://lxml.de/)

In [None]:
import requests
# Fetch the page with a plain HTTP GET; `z` holds the response
z = requests.get('http://galvanize.com')


# I guess I can't scrape this site.
Can you see it?

* Basic auth. 

In [None]:
import requests
# Plain GET with no credentials supplied
z = requests.get('http://galvanizesf.roomzilla.net')


In [None]:
# Render the response body as HTML in the cell output
HTML(z.content)

In [None]:
import getpass
import os

import requests

# The basic-auth password is read from the ROOMZILLA_PASSWORD environment
# variable (or prompted for) instead of being stored in the notebook.
roomzilla_pwd = os.environ.get('ROOMZILLA_PASSWORD') or getpass.getpass('Roomzilla password: ')
# Same GET as before, now with HTTP basic auth: empty username plus the password.
z = requests.get('http://galvanizesf.roomzilla.net', auth=('', roomzilla_pwd))

In [None]:
# Render the body of the most recent response `z` as HTML
HTML(z.content)

In [None]:
# Fetch the account home page with a plain GET and render what comes back.
# Presumably the page needs a logged-in session, which this request does not have.
z = requests.get('https://accounts.craigslist.org/login/home')
HTML(z.content)

In [None]:
import getpass
import os

# Never keep the account password in the notebook: take it from the
# CRAIGSLIST_PASSWORD environment variable, or prompt for it interactively.
clist_pwd = os.environ.get('CRAIGSLIST_PASSWORD') or getpass.getpass('Craigslist password: ')

In [None]:
# One Session object is used for the whole login flow; later cells reuse `s`.
s = requests.Session()
#z_ = s.get('https://accounts.craigslist.org/login')
# Fields submitted with the login form; the password comes from `clist_pwd`.
form_data = {'step':'confirmation',
         'p': 0,
         'rt': '',
         'rp': '',
         'inputEmailHandle':'isaac.laughlin@gmail.com',
         'inputPassword':clist_pwd}
# Browser-like headers, attached to the session's default headers below.
headers = {"Host": "accounts.craigslist.org",
           "Origin": "https://accounts.craigslist.org",
           "Referer": "https://accounts.craigslist.org/login",
           'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36"}

s.headers.update(headers)
# POST the form to the login URL, then show the response object as the cell output.
z = s.post('https://accounts.craigslist.org/login', data=form_data)
z

In [None]:
# Render the text of the login response as HTML
HTML(z.text)

In [None]:
# Render the text of response `z` as HTML (same statement as the preceding cell)
HTML(z.text)

In [None]:
# Show the raw response body instead of rendering it
z.content

# Beautiful Soup (or getting data from your webpage).

In [None]:
#w = UnicodeDammit(z.content, is_html=True)
# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(z.content, 'html.parser', from_encoding='UTF-8')
# All <table> elements in the page; the listings are taken from the second one.
t = soup.findAll('table')
listings = t[1]

In [None]:
# Collect the listings table as a list of rows, each row being the list of
# the text found in its <td> cells.
rows = listings.findAll('tr')
data = []
for tr in rows:
    data.append([cell.text for cell in tr.findAll('td')])
data

In [None]:
# For each table row, list its <td> elements (the tags themselves, not their text)
[row.findAll('td') for row in rows]

In [None]:
# Display the second <table> element found on the page
t[1]

In [None]:
# Open 'web scraping.ipynb' (presumably this notebook itself - confirm) for reading.
# NOTE: the handle is left open because the following cells keep reading from `f`.
f = open('web scraping.ipynb', 'r')

In [None]:
# Read the next line of the file. The builtin next() works on Python 2.6+ and
# Python 3, whereas the file.next() method exists only on Python 2.
next(f)

In [None]:
# Read everything from the current file position onward into one string
filestr = f.read()

In [None]:
# Wrap the markup of the second table in a file-like object and pull its
# first line, mirroring what was just done with the real file.
table_markup = str(t[1])
sio = StringIO.StringIO(table_markup)
next(sio)

In [None]:
# Hand the second table, wrapped as a file-like object, to pandas.
# header=0 uses the first row as column names; read_html returns a list of
# DataFrames, so the first (and presumably only) one is kept.
df = pd.read_html(StringIO.StringIO(t[1]),
                header=0)[0]
df.head()
#WE'RE IN BUSINESS

# Baseball scores



In [None]:
# Fetch the scoreboard page and render it.
# NOTE: '#date=6/25/2015' is a URL fragment; fragments are handled client-side
# and are not sent to the server as part of the request.
scoreboard_html = requests.get('http://m.mlb.com/scoreboard#date=6/25/2015')
HTML(scoreboard_html.content)

# Where's the data?

AJAX. (YAY).

APIs and Javascript.

* A lot of webpages are actually built out of many small APIs (Application Programming Interface). 
  * http://m.mlb.com/scoreboard/
  * http://mlb.mlb.com/gdcross/components/game/mlb/year_2015/month_04/day_19/master_scoreboard.json
* AJAX: the page is updated using javascript.
* This might make scraping hard (requests doesn't execute JavaScript)
 * Or easy.

In [None]:
# Request the scoreboard JSON for 2015-06-25 directly from the data endpoint
sboard = requests.get('http://m.mlb.com/gdcross/components/game/mlb/year_2015/month_06/day_25/master_scoreboard.json')

In [None]:
import json
# Decode the JSON response body into Python objects
sb_dict = json.loads(sboard.text)

# Look up one top-level key of the decoded document
sb_dict['copyright']

# Logging in to galvanize roomzilla page.

In [None]:
# GET the login page through the existing session `s`.
# NOTE(review): the section heading mentions roomzilla but this URL is Craigslist - confirm.
gz = s.get('https://accounts.craigslist.org/login')

In [None]:
# IPython help lookup for the session's post method (a development aid, not part of the scrape)
s.post?

# Mongo DB

* NOSQL: Not Only SQL.
* Documents vs. Schema
* Good for:
* Bad for:

## Why for scraping?
Oftentimes, in scraping you don't know what you'll get!

In [None]:
import pymongo


In [None]:
# Connect to MongoDB on localhost, port 27017 (host and port passed separately)
mdb = pymongo.MongoClient('localhost', 27017)

In [None]:
# Select the database named `test_db`
test_db = mdb.test_db

In [None]:
# Select the `users` collection of `test_db`.
# NOTE: this rebinds `users`, which an earlier cell pointed at a collection in the `lb_tst1` database.
users = test_db.users

In [None]:
# Fetch a single document from the collection (no filter given)
users.find_one()

In [None]:
# Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0.
# insert_one() returns a result object whose `inserted_id` is the new
# document's _id, which is what `post_id` held before.
post_id = users.insert_one({'name':'Boaz'}).inserted_id
post_id

In [None]:
# Show the id returned by the insert
post_id

In [None]:
# Look the document up again using the id returned by the insert
users.find_one({'_id': post_id})

In [None]:
# Look up a document by an id given as a plain string.
# NOTE(review): ids generated on insert are presumably ObjectId values, so a string
# would not match them; wrap it in bson.ObjectId(...) if a match is intended - confirm.
users.find_one({'_id': '5591a3866291224d1d7afb40'})

In [None]:
# Display the collection object itself
users