In [33]:
import requests
import pandas as pd

In [2]:
req = requests.get('https://en.wikipedia.org/wiki/City_University_of_New_York')

In [3]:
req.status_code

200

In [4]:
from bs4 import BeautifulSoup as BS

In [10]:
soup = BS(req.text,'html.parser') #give a messy look of the HTML code
dir(soup) #give directory

In [9]:
soup.title #Give the title
soup.p #give all paragraphs

<p>The <b>City University of New York</b> (<b>CUNY</b> <span class="nowrap"><span class="IPA nopopups noexcerpt"><a href="/wiki/Help:IPA/English" title="Help:IPA/English">/<span style="border-bottom:1px dotted"><span title="/ˈ/: primary stress follows">ˈ</span><span title="'k' in 'kind'">k</span><span title="/juː/: 'u' in 'cute'">juː</span><span title="'n' in 'nigh'">n</span><span title="/i/: 'y' in 'happy'">i</span></span>/</a></span></span>) is the <a href="/wiki/Public_university" title="Public university">public</a> <a href="/wiki/University_system" title="University system">university system</a> of <a href="/wiki/Education_in_New_York_City" title="Education in New York City">New York City</a>, and the largest urban university system in the <a href="/wiki/United_States" title="United States">United States</a>. CUNY and the <a href="/wiki/State_University_of_New_York" title="State University of New York">State University of New York</a> (SUNY) are separate and independent university

In [12]:
all_tables = soup.find_all('table')
tables_class = [table['class'] for table in all_tables]
tables_class

[['infobox', 'vcard'],
 ['wikitable', 'sortable'],
 ['plainlinks', 'metadata', 'ambox', 'ambox-content', 'ambox-Unreferenced'],
 ['wikitable', 'sortable'],
 ['mbox-small', 'plainlinks', 'sistersitebox'],
 ['nowraplinks', 'collapsible', 'autocollapse', 'navbox-inner'],
 ['nowraplinks', 'collapsible', 'autocollapse', 'navbox-inner'],
 ['nowraplinks', 'navbox-subgroup'],
 ['nowraplinks', 'collapsible', 'autocollapse', 'navbox-inner']]

In [13]:
wikitables = [table for table in all_tables if 'wikitable' in table['class']] #List Comprehension

In [14]:
from IPython.display import HTML #display a piece of a notebook

In [16]:
HTML(str(wikitables[1]))

Name,Grad.,College,Notable for
Kenneth Arrow,1940,City,American economist and joint winner of the Nobel Prize in Economics
Robert Aumann,1950,City,mathematician and winner of the Nobel Prize in Economics
Herman Badillo,1951,City,civil rights activist and the first Puerto Rican elected to the U.S. Congress
Arlene Davila,1996,City,author and Anthropology and American Studies professor at New York University
Jesse Douglas,1916,City,mathematician and winner of one of the first two Fields Medals
Abraham Foxman,,City,"national director, Anti-Defamation League"
Felix Frankfurter,1902,City,U.S. Supreme Court Justice
Andy Grove,1960,City,"former chairman and CEO, Intel Corporation"
Herbert A. Hauptman,1937,City,mathematician and winner of the Nobel Prize in Chemistry
Leonard Kleinrock,1957,City,"computer scientist, Internet pioneer"


In [17]:
rows = [row for row in wikitables[1].find_all('tr')]

In [18]:
rows[0].find_all('th')

[<th scope="col" style="text-align: left;">Name
 </th>, <th scope="col" style="text-align: left;">Grad.
 </th>, <th scope="col" style="text-align: left;">College
 </th>, <th class="unsortable" scope="col" style="text-align: left;">Notable for
 </th>]

In [19]:
colheads = [col.get_text().strip().lower() for col in rows[0].find_all('th')]
colheads

['name', 'grad.', 'college', 'notable for']

In [20]:
colheads = [col.get_text().strip().lower().replace(' ', '_').replace('.','') for col in rows[0].find_all('th')]
colheads

['name', 'grad', 'college', 'notable_for']

In [21]:
data_rows = rows[1:] #Only want the TD
data_rows[0]

<tr style="vertical-align: top;">
<td data-sort-value="Arrow, Kenneth"><a href="/wiki/Kenneth_Arrow" title="Kenneth Arrow">Kenneth Arrow</a></td>
<td>1940</td>
<td>City</td>
<td>American economist and joint winner of the Nobel Prize in Economics
</td></tr>

In [28]:
value_rows =[ [ col.get_text() for col in row.find_all('td') ] for row in rows[1:]] #nested structure
value_rows

[['Kenneth Arrow',
  '1940',
  'City',
  'American economist and joint winner of the Nobel Prize in Economics\n'],
 ['Robert Aumann',
  '1950',
  'City',
  'mathematician and winner of the Nobel Prize in Economics\n'],
 ['Herman Badillo',
  '1951',
  'City',
  'civil rights activist and the first Puerto Rican elected to the U.S. Congress\n'],
 ['Arlene Davila',
  '1996',
  'City',
  'author and Anthropology and American Studies professor at New York University\n'],
 ['Jesse Douglas',
  '1916',
  'City',
  'mathematician and winner of one of the first two Fields Medals\n'],
 ['Abraham Foxman', '', 'City', 'national director, Anti-Defamation League\n'],
 ['Felix Frankfurter', '1902', 'City', 'U.S. Supreme Court Justice\n'],
 ['Andy Grove',
  '1960',
  'City',
  'former chairman and CEO, Intel Corporation\n'],
 ['Herbert A. Hauptman',
  '1937',
  'City',
  'mathematician and winner of the Nobel Prize in Chemistry\n'],
 ['Leonard Kleinrock',
  '1957',
  'City',
  'computer scientist, Inter

In [26]:
#we want a different kind of list where names are in 1 list

In [30]:
value_series = list(zip(*value_rows))  #the *: I have the list of list, line them up as argument -> trick to flip it
value_series

[('Kenneth Arrow',
  'Robert Aumann',
  'Herman Badillo',
  'Arlene Davila',
  'Jesse Douglas',
  'Abraham Foxman',
  'Felix Frankfurter',
  'Andy Grove',
  'Herbert A. Hauptman',
  'Leonard Kleinrock',
  'Guillermo Linares',
  'Lisa Nakamura',
  'Barnett Newman',
  "John O'Keefe",
  'Colin Powell',
  'Mario Puzo',
  'Faith Ringgold',
  'A. M. Rosenthal',
  'Jonas Salk',
  'Daniel Schorr',
  'Elliott Fitch Shepard',
  'Bernard Weinraub',
  'Egemen Bağış',
  'Abraham Beame',
  'Robin Byrd',
  'Fernando Ferrer',
  'Sidney Harman',
  'Marcia A. Karrow',
  'James Lam',
  'Ralph Lauren',
  'Dolly Lenz',
  'Dennis Levine',
  'Jennifer Lopez',
  'Craig A. Stanley',
  'Tarkan',
  'Bella Abzug',
  'Carmen Beauchamp Ciparick',
  'Robert R. Davila',
  'Ruby Dee',
  'Martin Garbus',
  'Florence Howe',
  'Audre Lorde',
  'Mohamed Mahmoud Ould Mohamedou',
  'Soia Mentschikoff',
  'Thomas J. Murphy, Jr.',
  'Pauli Murray',
  'Edward Thomas Brady',
  'Jennings Michael Burch',
  'Marcos Crespo',
  'Edw

In [32]:
data_dict = dict(zip(colheads,value_series))
data_dict

{'name': ('Kenneth Arrow',
  'Robert Aumann',
  'Herman Badillo',
  'Arlene Davila',
  'Jesse Douglas',
  'Abraham Foxman',
  'Felix Frankfurter',
  'Andy Grove',
  'Herbert A. Hauptman',
  'Leonard Kleinrock',
  'Guillermo Linares',
  'Lisa Nakamura',
  'Barnett Newman',
  "John O'Keefe",
  'Colin Powell',
  'Mario Puzo',
  'Faith Ringgold',
  'A. M. Rosenthal',
  'Jonas Salk',
  'Daniel Schorr',
  'Elliott Fitch Shepard',
  'Bernard Weinraub',
  'Egemen Bağış',
  'Abraham Beame',
  'Robin Byrd',
  'Fernando Ferrer',
  'Sidney Harman',
  'Marcia A. Karrow',
  'James Lam',
  'Ralph Lauren',
  'Dolly Lenz',
  'Dennis Levine',
  'Jennifer Lopez',
  'Craig A. Stanley',
  'Tarkan',
  'Bella Abzug',
  'Carmen Beauchamp Ciparick',
  'Robert R. Davila',
  'Ruby Dee',
  'Martin Garbus',
  'Florence Howe',
  'Audre Lorde',
  'Mohamed Mahmoud Ould Mohamedou',
  'Soia Mentschikoff',
  'Thomas J. Murphy, Jr.',
  'Pauli Murray',
  'Edward Thomas Brady',
  'Jennings Michael Burch',
  'Marcos Crespo'

In [37]:
df = pd.DataFrame(data_dict)
df.head()

Unnamed: 0,name,grad,college,notable_for
0,Kenneth Arrow,1940,City,American economist and joint winner of the Nob...
1,Robert Aumann,1950,City,mathematician and winner of the Nobel Prize in...
2,Herman Badillo,1951,City,civil rights activist and the first Puerto Ric...
3,Arlene Davila,1996,City,author and Anthropology and American Studies p...
4,Jesse Douglas,1916,City,mathematician and winner of one of the first t...


In [40]:
#Group everything by college
bycollege = df.groupby('college')

In [41]:
#
size_table = bycollege.size()
size_table

college
Baruch          13
Brooklyn        26
City            22
Hunter          11
John Jay        14
Medgar Evers     2
Queens          10
dtype: int64

In [42]:
size_table.sort_values(ascending=False)

college
Brooklyn        26
City            22
John Jay        14
Baruch          13
Hunter          11
Queens          10
Medgar Evers     2
dtype: int64