In [None]:
# tell Python that you want to use a library with the import statement
import requests

In [None]:
# Get the HU Wikipedia page
req = requests.get("https://en.wikipedia.org/wiki/Harvard_University")

In [None]:
type(req)

In [None]:
dir(req)

In [None]:
page = req.text
page

In [None]:
from bs4 import BeautifulSoup

In [None]:
soup = BeautifulSoup(page, 'html.parser')

In [None]:
soup

In [None]:
type(soup)

In [None]:
dir(soup)

In [None]:
soup.title


In [None]:
len(soup.find_all('p'))

In [None]:
soup.table["class"]

In [None]:
[t['class'] for t in soup.find_all() if t.get('class')]

In [None]:
table_html = str(soup.find('table', 'wikitable'))

In [22]:
from IPython.core.display import HTML
HTML(table_html)

Unnamed: 0,Undergraduate,Graduate and Professional,U.S. Census
Asian/Pacific Islander,17%,11%,5%
Black/Non-Hispanic,6%,4%,12%
Hispanics of any race,9%,5%,16%
White/non-Hispanic,46%,43%,64%
Mixed Race/Other,10%,8%,9%
International students,11%,27%,


In [23]:
rows = [row for row in soup.find('table', 'wikitable').find_all('tr')]
rows

[<tr>
 <th></th>
 <th>Undergraduate</th>
 <th>Graduate<br/>
 and Professional</th>
 <th>U.S. Census</th>
 </tr>, <tr>
 <th>Asian/Pacific Islander</th>
 <td>17%</td>
 <td>11%</td>
 <td>5%</td>
 </tr>, <tr>
 <th>Black/Non-Hispanic</th>
 <td>6%</td>
 <td>4%</td>
 <td>12%</td>
 </tr>, <tr>
 <th>Hispanics of any race</th>
 <td>9%</td>
 <td>5%</td>
 <td>16%</td>
 </tr>, <tr>
 <th>White/non-Hispanic</th>
 <td>46%</td>
 <td>43%</td>
 <td>64%</td>
 </tr>, <tr>
 <th>Mixed Race/Other</th>
 <td>10%</td>
 <td>8%</td>
 <td>9%</td>
 </tr>, <tr>
 <th>International students</th>
 <td>11%</td>
 <td>27%</td>
 <td>N/A</td>
 </tr>]

In [42]:
rem_nl = lambda s: s.replace('\n', ' ')

In [26]:
print(rem_nl)

<function <lambda> at 0x000001A595AEC8C8>


### Functions

In [27]:
def power(x,y):
    return x**y
power(2,3)

8

In [28]:
def print_greeting():
    print("Hello!")
print_greeting()

Hello!


In [29]:
def get_multiple(x, y=1):
    return x*y
print("With x and y: ", get_multiple(10,2))
print("With x only: ", get_multiple(10))

With x and y:  20
With x only:  10


In [30]:
def print_special_greeting(name, leaving=False, condition="nice"):
    print("Hi ", name)
    print("How are you doting in this ", condition, " day?")
    if leaving:
        print("Please come back!")
    

In [31]:
print_special_greeting("John")

Hi  John
How are you doting in this  nice  day?


In [32]:
print_special_greeting("John", True, "rainy")

Hi  John
How are you doting in this  rainy  day?
Please come back!


In [33]:
print_special_greeting("John", True)

Hi  John
How are you doting in this  nice  day?
Please come back!


In [34]:
print_special_greeting("John", condition="horrible")

Hi  John
How are you doting in this  horrible  day?


In [37]:
def print_sibling(name, *siblings):
    print(name,"has the following sibling")
    for sibling in siblings:
        print(sibling)
    print()

print_sibling("John", 'Ashley', 'Lauren', "Authur")
print_sibling("Mike", "John")
print_sibling("Terry")

John has the following sibling
Ashley
Lauren
Authur

Mike has the following sibling
John

Terry has the following sibling



In [38]:
def print_brothers_sisters(name, **siblings):
    print(name, "has the following siblings")
    for sibling in siblings:
        print(sibling , ":", siblings[sibling])
    print()

print_brothers_sisters("John", Ashley="sister", Lauren="sister", Arthur="brother")

John has the following siblings
Arthur : brother
Ashley : sister
Lauren : sister



In [43]:
columns = [rem_nl(col.get_text()) for col in rows[0].find_all('th') if col.get_text()]
columns

['Undergraduate', 'Graduate and Professional', 'U.S. Census']

In [44]:
indexes = [row.find('th') for row in rows[1:]]
indexes

[<th>Asian/Pacific Islander</th>,
 <th>Black/Non-Hispanic</th>,
 <th>Hispanics of any race</th>,
 <th>White/non-Hispanic</th>,
 <th>Mixed Race/Other</th>,
 <th>International students</th>]

In [45]:
HTML(table_html)

Unnamed: 0,Undergraduate,Graduate and Professional,U.S. Census
Asian/Pacific Islander,17%,11%,5%
Black/Non-Hispanic,6%,4%,12%
Hispanics of any race,9%,5%,16%
White/non-Hispanic,46%,43%,64%
Mixed Race/Other,10%,8%,9%
International students,11%,27%,


In [46]:
to_num = lambda s: s[-1] == '%' and int(s[:-1]) or None

In [48]:
values = [to_num(value.get_text()) for row in rows[1:] for value in row.find_all('td')]
values

[17, 11, 5, 6, 4, 12, 9, 5, 16, 46, 43, 64, 10, 8, 9, 11, 27, None]

In [59]:
stacked_values = zip(*[values[i::3] for i in range(len(columns))])
list(stacked_values)

[(17, 11, 5), (6, 4, 12), (9, 5, 16), (46, 43, 64), (10, 8, 9), (11, 27, None)]

### Dictionary comprehension

In [60]:
{ind: value for ind, value in zip(indexes, stacked_values)}

{}

In [56]:
stacked_values

<zip at 0x1a595b0fac8>

In [82]:
import pandas as pd
import numpy as np

In [65]:
data_dicts = [{col: val for col, val in zip(columns, col_values)} for col_values in stacked_values]
data_dicts

[]

In [67]:
stacked_by_col = [values[i::3] for i in range(len(columns))]
stacked_by_col

[[17, 6, 9, 46, 10, 11], [11, 4, 5, 43, 8, 27], [5, 12, 16, 64, 9, None]]

In [69]:
data_lists = {col: val for col, val in zip(columns, stacked_by_col)}
data_lists

{'Graduate and Professional': [11, 4, 5, 43, 8, 27],
 'U.S. Census': [5, 12, 16, 64, 9, None],
 'Undergraduate': [17, 6, 9, 46, 10, 11]}

In [85]:
t = pd.DataFrame(data_lists)


KeyError: 0

KeyError: 0