## Analyze Website

In [1]:
import requests
import os

In [2]:
def get_one_page(url):
    '''
    Get the html for one page
    '''
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None

In [3]:
url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=1'
html = get_one_page(url)
print(html)

outF = open('./FellowInfo.txt', 'w')
outF.write(html)
outF.write('\n')
outF.close()


<!DOCTYPE html><!-- Last Published: Thu Jun 20 2019 23:58:24 GMT+0000 (UTC) --><html data-wf-domain="www.insightdatascience.com" data-wf-page="5bd3f311a37dacd7c38d6cce" data-wf-site="575a31d2ce5d01dc7a20de45" lang="en"><head><meta charset="utf-8"/><title>Insight Data Science Fellows</title><meta content="Fellows from Insight Data Science are now at over 100 top companies like Facebook, Airbnb, Twitter, Pinterest, Uber, and many more." name="description"/><meta content="Insight Data Science Fellows" property="og:title"/><meta content="Fellows from Insight Data Science are now at over 100 top companies like Facebook, Airbnb, Twitter, Pinterest, Uber, and many more." property="og:description"/><meta content="summary" name="twitter:card"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link href="https://assets-global.website-files.com/575a31d2ce5d01dc7a20de45/css/data-science.cdd8beaaa.min.css" rel="stylesheet" type="text/css"/><script src="https://ajax.googleapis.co

## Dealing with Exceptions

In [4]:
import requests
from requests.exceptions import RequestException

In [5]:
def get_one_page(url):
    '''
    Get the html for one page
    With the exceptions
    '''
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

In [6]:
url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=1'
html = get_one_page(url)
print(html)

<!DOCTYPE html><!-- Last Published: Thu Jun 20 2019 23:58:24 GMT+0000 (UTC) --><html data-wf-domain="www.insightdatascience.com" data-wf-page="5bd3f311a37dacd7c38d6cce" data-wf-site="575a31d2ce5d01dc7a20de45" lang="en"><head><meta charset="utf-8"/><title>Insight Data Science Fellows</title><meta content="Fellows from Insight Data Science are now at over 100 top companies like Facebook, Airbnb, Twitter, Pinterest, Uber, and many more." name="description"/><meta content="Insight Data Science Fellows" property="og:title"/><meta content="Fellows from Insight Data Science are now at over 100 top companies like Facebook, Airbnb, Twitter, Pinterest, Uber, and many more." property="og:description"/><meta content="summary" name="twitter:card"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link href="https://assets-global.website-files.com/575a31d2ce5d01dc7a20de45/css/data-science.cdd8beaaa.min.css" rel="stylesheet" type="text/css"/><script src="https://ajax.googleapis.co

## Using re (regular expression) to identify information

In [7]:
import requests
from requests.exceptions import RequestException
import re

In [8]:
def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
    
def parse_one_page(html):
    pattern = re.compile('tooltip_name">(.*?)</div>.*?'
                         + 'toottip_title">(.*?)</div>.*?'
                         + 'tooltip_company">(.*?)</div>.*?'
                         + 'tooltip_project">(.*?)</div>.*?'
                         + 'tooltip_background">(.*?)</div>.*?'
                         + '&quot;(.*?)&quot;'
                         ,re.S)
    
    items = re.findall(pattern, html)
    print(items)

In [9]:
url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=1'
html = get_one_page(url)
# print(html)
parse_one_page(html)

[('JP Bida', 'Director of Risk', 'Capital One', 'Measuring the Impact of Open Source Coders on Github', 'Bio-X Games, Stanford, Postdoc', 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576976f76364789c4e58a0e8_JP-Pic.jpg'), ('Douglas Mason', 'Data Scientist', 'Twitter', 'Email Searcher: Search &amp; Visualize Your Mailbox', 'Physics, Harvard University,Â\xa0PhD\n', 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57697251e8628b5103c6b117_Doug-Pic.jpg'), ('Virot Ta Chiraphadhanakul', 'Data Scientist', 'Facebook', 'Tweet Timeline: Visualizing the Impact of Social Media', 'Operations Research, MIT, PhD', 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769760f6364789c4e589ce7_Ta-Pic.jpg'), ('Julia Viladomat', 'Data Scientist', 'Adobe', 'Sentiment Analysis Based on Social Media', 'Statistics, Stanford, Postdoc', 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57685673bc51c12e173d3ba2_Julia%20Pic.jpg'), ('David Freeman', 'Head of Anti-Abuse Engine

## Converting data into orgnized dictionary structure

In [10]:
def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
    
def parse_one_page(html):
    pattern = re.compile('tooltip_name">(.*?)</div>.*?'
                         + 'toottip_title">(.*?)</div>.*?'
                         + 'tooltip_company">(.*?)</div>.*?'
                         + 'tooltip_project">(.*?)</div>.*?'
                         + 'tooltip_background">(.*?)</div>.*?'
                         + '&quot;(.*?)&quot;'
                         ,re.S)
    
    items = re.findall(pattern, html)
    
    for item in items:
        yield{
            'name': item[0],
            'title': item[1],
            'company': item[2],
            'project': item[3],
            'background': item[4],
            'imagelink': item[5]
        }

In [11]:
url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=1'
html = get_one_page(url)
# print(html)
for idx, item in enumerate(parse_one_page(html)):
    print(idx)
    print(item)

0
{'name': 'JP Bida', 'title': 'Director of Risk', 'company': 'Capital One', 'project': 'Measuring the Impact of Open Source Coders on Github', 'background': 'Bio-X Games, Stanford, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576976f76364789c4e58a0e8_JP-Pic.jpg'}
1
{'name': 'Douglas Mason', 'title': 'Data Scientist', 'company': 'Twitter', 'project': 'Email Searcher: Search &amp; Visualize Your Mailbox', 'background': 'Physics, Harvard University,Â\xa0PhD\n', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57697251e8628b5103c6b117_Doug-Pic.jpg'}
2
{'name': 'Virot Ta Chiraphadhanakul', 'title': 'Data Scientist', 'company': 'Facebook', 'project': 'Tweet Timeline: Visualizing the Impact of Social Media', 'background': 'Operations Research, MIT, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769760f6364789c4e589ce7_Ta-Pic.jpg'}
3
{'name': 'Julia Viladomat', 'title': 'Data Scientist', 'company': 'Ad

## Storing data as json + Scraping multiple pages

In [12]:
import requests
from requests.exceptions import RequestException
import re
import json

from multiprocessing import Pool

In [13]:
def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
    
def parse_one_page(html):
    pattern = re.compile('tooltip_name">(.*?)</div>.*?'
                         + 'toottip_title">(.*?)</div>.*?'
                         + 'tooltip_company">(.*?)</div>.*?'
                         + 'tooltip_project">(.*?)</div>.*?'
                         + 'tooltip_background">(.*?)</div>.*?'
                         + '&quot;(.*?)&quot;'
                         ,re.S)
    
    items = re.findall(pattern, html)
    
    for item in items:
        yield{
            'name': item[0],
            'title': item[1],
            'company': item[2],
            'project': item[3],
            'background': item[4],
            'imagelink': item[5]
        }
        
def write_to_file(content):
    with open('./FellowInfo8Page_v1.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + ',\n')
        f.close()
        
def get_content(offset):
    print('page_' + str(offset))
    url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=' + str(offset)
    html = get_one_page(url)
    for idx, item in enumerate(parse_one_page(html)):
        print(idx)
        print(item)
        datalist.append(item)
        write_to_file(item)

In [14]:
Insight = {}
Insight['fellows'] = []
datalist = Insight['fellows']
    
for i in range(1,9):
    get_content(i)
    
with open('./FellowInfo8Page_v2.txt', 'w', encoding='utf-8') as f:
    json.dump(Insight, f)

page_1
0
{'name': 'JP Bida', 'title': 'Director of Risk', 'company': 'Capital One', 'project': 'Measuring the Impact of Open Source Coders on Github', 'background': 'Bio-X Games, Stanford, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576976f76364789c4e58a0e8_JP-Pic.jpg'}
1
{'name': 'Douglas Mason', 'title': 'Data Scientist', 'company': 'Twitter', 'project': 'Email Searcher: Search &amp; Visualize Your Mailbox', 'background': 'Physics, Harvard University,Â\xa0PhD\n', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57697251e8628b5103c6b117_Doug-Pic.jpg'}
2
{'name': 'Virot Ta Chiraphadhanakul', 'title': 'Data Scientist', 'company': 'Facebook', 'project': 'Tweet Timeline: Visualizing the Impact of Social Media', 'background': 'Operations Research, MIT, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769760f6364789c4e589ce7_Ta-Pic.jpg'}
3
{'name': 'Julia Viladomat', 'title': 'Data Scientist', 'compan

52
{'name': 'Jingjing Huang', 'title': 'Sr. Associate, Business Analytics', 'company': 'LinkedIn', 'project': 'NuTube: Classifying recent YouTube videosÂ\xa0', 'background': 'Mathematics/Electrical Engineering, California Institute of Technology, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57688265bc51c12e173d6ddf_Jingjing.jpg'}
53
{'name': 'Joshua Lande', 'title': 'Senior Data Scientist', 'company': 'Twitter', 'project': 'ReviewSkimmer: Find what&#x27;s good or bad in a movie, before it&#x27;s too late!', 'background': 'Astrophysics, Stanford, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5768832dbc51c12e173d6e18_Josh.jpg'}
54
{'name': 'Eli Bressert', 'title': 'Data Scientist ', 'company': 'Netflix', 'project': 'InspectorGit: Discover Github&#x27;s awesome repositories', 'background': 'Astrophysics, University of Exeter, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769733ce8628b5103

99
{'name': 'Jesus Martinez', 'title': 'Data Scientist', 'company': 'Capital One', 'project': 'EdXPredictor: Predict your performance on edX courses', 'background': 'Astrophysics, University of Florida, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769a5d041db86a74ee125aa_Jesus-0274.jpg'}
page_2
0
{'name': 'Matt George', 'title': 'Data Scientist', 'company': 'Square', 'project': 'StreetsAhead: Image recognition with deep learning in Street View', 'background': 'Astrophysics, UC Berkeley, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769ae8f6364789c4e5919c5_Matt-0288.jpg'}
1
{'name': 'Carlos Cunha', 'title': 'Senior Data Scientist', 'company': 'Bosch', 'project': 'Jaunt: Recommendations along the way!', 'background': 'Cosmology and Astrophysics, Stanford, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769af69e8628b5103c72979_Carlos-0332.jpg'}
2
{'name': 'Adam Morgan', 'title': 'Lead Dat

35
{'name': 'Yiping Yuan', 'title': 'Machine Learning Researcher', 'company': 'LinkedIn', 'project': 'Career Wizard: A career path recommender', 'background': 'Statistics, University of Minnesota, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576ad84330d44538603187af_Yiping.jpg'}
36
{'name': 'Pouria Fewzee', 'title': 'Senior Data Scientist', 'company': 'Zoom.ai', 'project': 'HOSPARENCIE: Transparency in hospitalization fees', 'background': 'Electrical &amp; Computer Engineering, University of Waterloo, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576c8279a2b1564932b1112a_Pouria.jpg'}
37
{'name': 'Genevieve Smith', 'title': 'Director of Product', 'company': 'Insight Data Science', 'project': 'Neutral_Opinion: Visualizing opinions on net neutrality', 'background': 'Ecology &amp; Evolution, The University of Texas at Austin, PostdocÂ\xa0', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576acef530d445

82
{'name': 'Yvonne Edmonds', 'title': 'Generics Analytics-Data Scientist', 'company': 'McKesson', 'project': 'FantasyFilm: Can you make a better film than Hollywood?', 'background': 'Physics, Stanford, Systems EngineerÂ\xa0', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576c808866169b0e32c1600f_Yvonne.jpg'}
83
{'name': 'Rachel Reddick', 'title': 'Data Scientist', 'company': 'Bosch', 'project': 'IndieRecommender: Find Indie games like video games you already love', 'background': 'Â\xa0Astrophysics, Stanford, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576d949e41b11cde12d56479_Rachel%20Reddick-0989.jpg'}
84
{'name': 'Alex Smolyanskaya', 'title': 'Data Scientist ', 'company': 'Stitch Fix', 'project': 'beautifulcity: Discover street art all around the world', 'background': 'Neuroscience, University of Pennsylvania, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57818ace9c205dc62cd22733_Aleksand

39
{'name': 'Thomas Moran', 'title': 'Senior Data Scientist', 'company': 'Enigma', 'project': 'Cli-Migration: A recommender for the climate exodus', 'background': 'Physics, UC Berkeley, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576c830b66169b0e32c1615c_Thomas-.jpg'}
40
{'name': 'Jonathan Eckel', 'title': 'Senior Data Scientist', 'company': 'Splash', 'project': 'beersomme: Find a bar, drink awesome beer!', 'background': 'Physics, University of Arizona, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576c82c79356cec31248f051_Jon-.jpg'}
41
{'name': 'Josiah Walton', 'title': 'Director of Data Science', 'company': 'Agentis', 'project': 'SmartSourcery: Data-driven job sourcing recommendations', 'background': 'Experimental Neutrino Physics, University of Illinois at Urbana-Champaign, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/59c1a20613b7f8000140eaf3_Josiah-.jpg'}
42
{'name': 'Alexander Mc

82
{'name': 'Luis Vargas', 'title': 'Senior Data Scientist', 'company': 'SecurityScorecard', 'project': 'underServed: Where to open a restaurant?', 'background': 'Astrophysics, Yale, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/577c195b435f4ca271e38c9c_Luis.jpg'}
83
{'name': 'Matthew Kretschmer', 'title': 'Senior Data Analyst', 'company': 'McKinsey &amp; Company', 'project': 'WillThereBeSpace: Find an estimate of future Citibike availability', 'background': 'Physics, University of Maryland, College Park, PhDÂ\xa0', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/577c1bf1abd560a3050d188e_Matt%20K.jpg'}
84
{'name': 'Olivia Dincia', 'title': 'Data Scientist', 'company': 'Simulmedia, Inc.', 'project': 'ImmerseSpace: Immerse yourself in music that fits your mood and preferences', 'background': 'Computational Chemistry, University of Texas at Austin, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/577c1f98

36
{'name': 'Kirstin Aschbacher', 'title': 'Data Scientist', 'company': 'Jawbone', 'project': 'Doc Topics: Find a good doctor by summarizing Yelp reviews', 'background': 'Psychology/Behavioral Medicine, UCSF, Assistant Professor, Clinical Health', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/577ed7b23b0a73a9072b0bae_Kirstin%20Aschbacher.jpg'}
37
{'name': 'Philip Hebda', 'title': 'Data Scientist', 'company': 'Netflix', 'project': 'GitWatch: Revealing where to contribute to open source projects', 'background': 'Particle Physics, Princeton, PhDÂ\xa0', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/577ff4773d6d3ed24adee394_PhilipHebda.jpg'}
38
{'name': 'Karen Hayrapetyan', 'title': 'Data Scientist', 'company': 'H20', 'project': 'buzzeat:Â\xa0Find the dish you crave', 'background': 'Physics,Â\xa0Purdue University, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/589b6b5e1276aa5d4893b0f9_Karen%20Hayrapetyan.jpg'

{'name': 'Barath Ezhilan', 'title': 'Applied Scientist', 'company': 'Amazon', 'project': 'ClickInsight: Increasing link clicks on Facebook news posts', 'background': 'Mechanical Engineering &amp; Computational Biophysics, UCSD, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5786cd80518bb3c0732c90f5_BarathEzhilan.jpg'}
80
{'name': 'Francois Charest', 'title': 'Data Science Associate', 'company': 'J.P. Morgan', 'project': 'WalkSafr: Finding safer walking routes', 'background': 'Mathematics, Columbia University, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5786cf137181d8cb735e405b_FrancoisCharest.jpg'}
81
{'name': 'Judith Li', 'title': 'Data Scientist', 'company': 'SAP', 'project': 'Smartan: A smart workout recommender', 'background': 'Computational Geoscience, Stanford University, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5786d6915704f4a0321b49a3_JudithYueLi.jpg'}
82
{'name': 'Katie Am

43
{'name': 'Taro Naoi', 'title': 'Senior Data Engineering Developer', 'company': 'NBCUniversal', 'project': 'Humor Me: Comedy analytics and recommendation.', 'background': 'Materials Science &amp; Engineering, Cornell University, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57c5b61d07d0b11913618f72_Taro-.jpg'}
44
{'name': 'Theodore Siu', 'title': 'Jr. Data Scientist', 'company': 'Playdots', 'project': 'Pearings: Intelligent ingredient combinations for home cooks.', 'background': 'Physics, Rutgers University, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57c5b6422171309b53b78af4_Theo-.jpg'}
45
{'name': 'Matthew Oberhardt  ', 'title': 'Program Director, Research Science', 'company': 'New York Presbyterian', 'project': 'memoryPower: Tracking Parkinsonâ\x80\x99s wellness using a mobile app memory game.', 'background': 'Computational Biology, Tel Aviv University / University of Maryland, Postdoc', 'imagelink': 'https://assets

86
{'name': 'Jamie Tolan', 'title': 'Ceres Imaging', 'company': 'Remote Sensing Scientist', 'project': 'Parking Predictor: Find available parking spots in San Francisco', 'background': 'Physics, Stanford University, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5895186a8e13d7bd205f13c2_Jamie%20Tolan.jpg'}
87
{'name': 'Jianbo Xiao', 'title': 'Data Scientist', 'company': 'Chegg', 'project': 'Automatic Email Sorting: Identifying crucial emails with natural language processing at 93% recall rate', 'background': 'Neuroscience, University of Wisconsin-Madison, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/589518aefb363ace20382b93_Jianbo%20Xiao.jpg'}
88
{'name': 'Katie Heineman', 'title': 'Database Research Coordinator', 'company': 'San Diego Zoo', 'project': 'Weeding Leads: Prioritizing sales calls for ezhome gardening', 'background': 'Ecology, University of Illinois, PhD', 'imagelink': 'https://assets.website-files.com/575a

46
{'name': 'Hugo Liberal Fernandes', 'title': 'Data Scientist', 'company': 'Rockets of Awesome', 'project': 'A.I.tendance: Attendance forecasting for art house movie screenings', 'background': 'Computational Neuroscience, Rehabilitation Institute of Chicago/Northwestern University, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/59c19767b8b7ef0001d51e2e_HugoLiberalFernandes.jpg'}
47
{'name': 'Keld Lundgaard', 'title': 'Data Scientist', 'company': 'Salesforce', 'project': 'Clever Forecasting: Predicting back-to-school user growth for Clever', 'background': 'Physics, Stanford University, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/59c197ac13b7f8000140e9a6_KeldLundgaard.jpg'}
48
{'name': 'Michael Lock', 'title': 'Research Fellow', 'company': 'Strat11', 'project': 'Tweet Off: Classify tweets and track hashtags over time', 'background': 'Differential Geometry, Mathematical Sciences Research Institute, Postdoc', 'imagel

96
{'name': 'Nick DiQuattro', 'title': 'Analytics Manager', 'company': 'eBay Advertising', 'project': 'RedCarpet', 'background': 'Astrophysics, Academia Sinica, Taiwan, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5930a579e916bc2d34691818_NicoleCzakon.jpg'}
97
{'name': 'Rich Winslow', 'title': 'Data Science Engineer', 'company': 'Sojern', 'project': 'LiveBeat', 'background': 'Mechanical Engineering, UC Berkeley, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5930a67074bc772290755af4_RichWinslow.jpg'}
98
{'name': 'Sean McCurdy', 'title': 'Product Analyst, Core Experience', 'company': 'Pinterest', 'project': 'beaut.ai', 'background': 'Bioinformatics, University of Toronto, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5930a7173a7f95407d147d9e_SeanMccurdy.jpg'}
page_7
0
{'name': 'Shu-Han Chao', 'title': 'Analytics Manager', 'company': 'eBay ', 'project': 'Fleetr', 'background': 'Physics, Un

46
{'name': 'Omoju Miller', 'title': 'Senior Data Scientist', 'company': 'Github', 'project': 'Receipt.id', 'background': 'Computer Science Education, UC Berkeley, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5935db7809fb222bc9aa339d_Omoju.jpg'}
47
{'name': 'Pablo Rosado', 'title': 'Senior Engineer', 'company': 'Amgen', 'project': 'Peace-of-Mind Parking', 'background': 'Mechanical Engineering, University of California, Berkeley, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5935dc0609fb222bc9aa33b3_Pablo.jpg'}
48
{'name': 'Qi Wen Li', 'title': 'Project Manager', 'company': 'IBM', 'project': 'Win the Bidding War', 'background': 'Biochemistry and Molecular Biophysics, Caltech, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5935dc5809fb222bc9aa33bd_Li.jpg'}
49
{'name': 'Sneha Ravi', 'title': 'Data Scientist', 'company': 'Uber', 'project': 'EduCare', 'background': 'Neurobiology, Duke University,

88
{'name': 'Changyao Chen', 'title': 'Quantitative Strategist', 'company': 'Neo Ivy Capital Management', 'project': 'Wise Wheels: Where will Citibike take you?', 'background': 'Mechanical Engineering, Argonne National Laboratory, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/59c1a286ab29300001ea1bbc_Changyao%20Chen.jpg'}
89
{'name': 'Jamison Galloway', 'title': 'Associate Director of Business Intelligence and Analytics', 'company': 'Columbia University', 'project': 'predictTix: Helping theatergoers anticipate Broadway bargains', 'background': 'Theoretical High Energy Physics, New York University Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/59c1a3afb8b7ef0001d51f7a_Jamison%20Galloway.jpg'}
90
{'name': 'Janelle Szary', 'title': 'Data Scientist', 'company': 'Pymetrics', 'project': 'IntuiShop: Intuitive shopping for styles', 'background': 'Cognitive and Information Sciences, Indiana University, Postdoc', 'imagelink':

51
{'name': 'Louis Antonelli', 'title': 'Data Scientist', 'company': 'Vizient', 'project': 'SilverTongue: Helping you nail that next big presentation', 'background': 'Experimental Particle Physics, The Ohio State University, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5ab94019b07e8222cfd31fc0_Jamie%20Antonelli%20headshot.jpg'}
52
{'name': 'Patrick Long', 'title': 'Data Scientist', 'company': 'IQVIA', 'project': 'enCard: Predicting credit card transaction failures', 'background': 'Neuroscience, University of Michigan, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5abb8d58de71f34d6de73e42_Patrick_Long.jpg'}
53
{'name': 'Peter Tsai', 'title': 'Data Engineer', 'company': 'MOCAP Analytics', 'project': 'NBATicketSavant: Buy tickets without regret!', 'background': 'Materials Science &amp; Engineering, Washington University in St. Louis, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de4a/5a

## Data Examination

In [15]:
import pandas as pd
import json

In [16]:
with open('./FellowInfo8Page_v2.txt') as json_file:
    insight = json.load(json_file)

In [17]:
insight['fellows']

[{'name': 'JP Bida',
  'title': 'Director of Risk',
  'company': 'Capital One',
  'project': 'Measuring the Impact of Open Source Coders on Github',
  'background': 'Bio-X Games, Stanford, Postdoc',
  'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576976f76364789c4e58a0e8_JP-Pic.jpg'},
 {'name': 'Douglas Mason',
  'title': 'Data Scientist',
  'company': 'Twitter',
  'project': 'Email Searcher: Search &amp; Visualize Your Mailbox',
  'background': 'Physics, Harvard University,Â\xa0PhD\n',
  'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57697251e8628b5103c6b117_Doug-Pic.jpg'},
 {'name': 'Virot Ta Chiraphadhanakul',
  'title': 'Data Scientist',
  'company': 'Facebook',
  'project': 'Tweet Timeline: Visualizing the Impact of Social Media',
  'background': 'Operations Research, MIT, PhD',
  'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769760f6364789c4e589ce7_Ta-Pic.jpg'},
 {'name': 'Julia Viladomat',
  'title': '

In [18]:
fellowInfo = pd.DataFrame(insight['fellows'], columns=['name', 'title', 'company', 
                                   'project', 'background', 'imagelink'])

In [19]:
fellowInfo.head()

Unnamed: 0,name,title,company,project,background,imagelink
0,JP Bida,Director of Risk,Capital One,Measuring the Impact of Open Source Coders on ...,"Bio-X Games, Stanford, Postdoc",https://assets.website-files.com/575a31d2ce5d0...
1,Douglas Mason,Data Scientist,Twitter,Email Searcher: Search &amp; Visualize Your Ma...,"Physics, Harvard University,Â PhD\n",https://assets.website-files.com/575a31d2ce5d0...
2,Virot Ta Chiraphadhanakul,Data Scientist,Facebook,Tweet Timeline: Visualizing the Impact of Soci...,"Operations Research, MIT, PhD",https://assets.website-files.com/575a31d2ce5d0...
3,Julia Viladomat,Data Scientist,Adobe,Sentiment Analysis Based on Social Media,"Statistics, Stanford, Postdoc",https://assets.website-files.com/575a31d2ce5d0...
4,David Freeman,Head of Anti-Abuse Engineering,LinkedIn,Flight Delay Predictor,"Mathematics, Stanford, Postdoc",https://assets.website-files.com/575a31d2ce5d0...


In [20]:
fellowInfo.shape

(791, 6)

In [21]:
fellowInfo.describe()

Unnamed: 0,name,title,company,project,background,imagelink
count,791,791,791,791,791,791
unique,791,170,409,790,731,791
top,Jolene Mork,Data Scientist,Facebook,IndieRecommender: Find Indie games like video ...,"Neuroscience, New York University, Postdoc",https://assets.website-files.com/575a31d2ce5d0...
freq,1,435,54,2,5,1


In [22]:
fellowInfo['project'].values

array(['Measuring the Impact of Open Source Coders on Github',
       'Email Searcher: Search &amp; Visualize Your Mailbox',
       'Tweet Timeline: Visualizing the Impact of Social Media',
       'Sentiment Analysis Based on Social Media',
       'Flight Delay Predictor',
       'Email Contacts Automatically Grouped &amp; Ranked',
       'Analysis of New York Stop-and-Frisk Data',
       'CouchTube: YouTube TV shows in one click',
       'SchoolGeo: Empowering Parents to Make Informed Decisions',
       'Event Map: Get Local News from Twitter',
       'Sort My Friends: Automatically Organize your Connections',
       'Flat Finder: Find Your Perfect Apartment', 'Kiva Loan Checker',
       'Arrive in Time: Airport Arrival Time Predictor',
       'STARtrack: Helping Educators Track Student Performance',
       'Diverse Recipe Recommendations', 'Multi-Domain Recommendations',
       'Stock Volatility PredictorÂ\xa0', 'Lower Your Electricity Bill',
       'Find the Expert on Stack Overflow