## Analyze Website

In [1]:
import requests
import os

In [2]:
def get_one_page(url):
    '''
    Get the html for one page
    '''
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None

In [3]:
url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=1'
html = get_one_page(url)
# print(html)

outF = open('./FellowInfo.txt', 'w')
outF.write(html)
outF.write('\n')
outF.close()


## Dealing with Exceptions

In [4]:
import requests
from requests.exceptions import RequestException

In [5]:
def get_one_page(url):
    '''
    Get the html for one page
    With the exceptions
    '''
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

In [6]:
url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=1'
html = get_one_page(url)
# print(html)

## Using re (regular expression) to identify information

In [7]:
import requests
from requests.exceptions import RequestException
import re

In [8]:
def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
    
def parse_one_page(html):
    pattern = re.compile('tooltip_name">(.*?)</div>.*?'
                         + 'toottip_title">(.*?)</div>.*?'
                         + 'tooltip_company">(.*?)</div>.*?'
                         + 'tooltip_project">(.*?)</div>.*?'
                         + 'tooltip_background">(.*?)</div>.*?'
                         + '&quot;(.*?)&quot;'
                         ,re.S)
    
    items = re.findall(pattern, html)
    print(items)

In [9]:
url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=1'
html = get_one_page(url)
# print(html)
parse_one_page(html)

[('JP Bida', 'Director of Risk', 'Capital One', 'Measuring the Impact of Open Source Coders on Github', 'Bio-X Games, Stanford, Postdoc', 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576976f76364789c4e58a0e8_JP-Pic.jpg'), ('Douglas Mason', 'Data Scientist', 'Twitter', 'Email Searcher: Search &amp; Visualize Your Mailbox', 'Physics, Harvard University,Â\xa0PhD\n', 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57697251e8628b5103c6b117_Doug-Pic.jpg'), ('Virot Ta Chiraphadhanakul', 'Data Scientist', 'Facebook', 'Tweet Timeline: Visualizing the Impact of Social Media', 'Operations Research, MIT, PhD', 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/5769760f6364789c4e589ce7_Ta-Pic.jpg'), ('Julia Viladomat', 'Data Scientist', 'Adobe', 'Sentiment Analysis Based on Social Media', 'Statistics, Stanford, Postdoc', 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57685673bc51c12e173d3ba2_Julia%20Pic.jpg'), ('David Freeman', 'Head of Anti-Abuse Engine

## Converting data into orgnized dictionary structure

In [10]:
def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
    
def parse_one_page(html):
    pattern = re.compile('tooltip_name">(.*?)</div>.*?'
                         + 'toottip_title">(.*?)</div>.*?'
                         + 'tooltip_company">(.*?)</div>.*?'
                         + 'tooltip_project">(.*?)</div>.*?'
                         + 'tooltip_background">(.*?)</div>.*?'
                         + '&quot;(.*?)&quot;'
                         ,re.S)
    
    items = re.findall(pattern, html)
    
    for item in items:
        yield{
            'name': item[0],
            'title': item[1],
            'company': item[2],
            'project': item[3],
            'background': item[4],
            'imagelink': item[5]
        }

In [11]:
url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=1'
html = get_one_page(url)
# print(html)
for idx, item in enumerate(parse_one_page(html)):
    if idx % 10 == 0:
        print(idx)
        print(item)

0
{'name': 'JP Bida', 'title': 'Director of Risk', 'company': 'Capital One', 'project': 'Measuring the Impact of Open Source Coders on Github', 'background': 'Bio-X Games, Stanford, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576976f76364789c4e58a0e8_JP-Pic.jpg'}
10
{'name': 'Cinna Julie Wu', 'title': 'Data Scientist', 'company': 'Facebook', 'project': 'Sort My Friends: Automatically Organize your Connections', 'background': 'Applied Mathematics, UC Berkeley, PhD', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/57685e33cf2f2b4d31a02788_CinnaWu.jpg'}
20
{'name': 'Joe Gallagher', 'title': 'Data Lead', 'company': 'Reddit', 'project': 'RewindX: Rediscover Music From Your Youth', 'background': 'Computational Neuroscience, Stanford, Postdoc', 'imagelink': 'https://assets.website-files.com/575a31d2ce5d01dc7a20de45/576982c4e8628b5103c6dfc2_Joe-1318.jpg'}
30
{'name': 'Brian Boates', 'title': 'Data Science Lead', 'company': 'Square',

## Storing data as json + Scraping multiple pages

In [12]:
import requests
from requests.exceptions import RequestException
import re
import json

from multiprocessing import Pool

In [13]:
def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
    
def parse_one_page(html):
    pattern = re.compile('tooltip_name">(.*?)</div>.*?'
                         + 'toottip_title">(.*?)</div>.*?'
                         + 'tooltip_company">(.*?)</div>.*?'
                         + 'tooltip_project">(.*?)</div>.*?'
                         + 'tooltip_background">(.*?)</div>.*?'
                         + '&quot;(.*?)&quot;'
                         ,re.S)
    
    items = re.findall(pattern, html)
    
    for item in items:
        yield{
            'name': item[0],
            'title': item[1],
            'company': item[2],
            'project': item[3],
            'background': item[4],
            'imagelink': item[5]
        }
        
def write_to_file(content):
    with open('./FellowInfo8Page_v1.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + ',\n')
        f.close()
        
def get_content(offset):
    print('page_' + str(offset))
    url = 'https://www.insightdatascience.com/fellows?61ea5d1b_page=' + str(offset)
    html = get_one_page(url)
    for idx, item in enumerate(parse_one_page(html)):
#         print(idx)
#         print(item)
        datalist.append(item)
        write_to_file(item)

In [14]:
Insight = {}
Insight['fellows'] = []
datalist = Insight['fellows']
    
for i in range(1,9):
    get_content(i)
    
with open('./FellowInfo8Page_v2.txt', 'w', encoding='utf-8') as f:
    json.dump(Insight, f)

page_1
page_2
page_3
page_4
page_5
page_6
page_7
page_8


## Data Examination

In [15]:
import pandas as pd
import json

In [16]:
with open('./FellowInfo8Page_v2.txt') as json_file:
    insight = json.load(json_file)

In [17]:
# insight['fellows']

In [18]:
fellowInfo = pd.DataFrame(insight['fellows'], columns=['name', 'title', 'company', 
                                   'project', 'background', 'imagelink'])

In [19]:
fellowInfo.head()

Unnamed: 0,name,title,company,project,background,imagelink
0,JP Bida,Director of Risk,Capital One,Measuring the Impact of Open Source Coders on ...,"Bio-X Games, Stanford, Postdoc",https://assets.website-files.com/575a31d2ce5d0...
1,Douglas Mason,Data Scientist,Twitter,Email Searcher: Search &amp; Visualize Your Ma...,"Physics, Harvard University,Â PhD\n",https://assets.website-files.com/575a31d2ce5d0...
2,Virot Ta Chiraphadhanakul,Data Scientist,Facebook,Tweet Timeline: Visualizing the Impact of Soci...,"Operations Research, MIT, PhD",https://assets.website-files.com/575a31d2ce5d0...
3,Julia Viladomat,Data Scientist,Adobe,Sentiment Analysis Based on Social Media,"Statistics, Stanford, Postdoc",https://assets.website-files.com/575a31d2ce5d0...
4,David Freeman,Head of Anti-Abuse Engineering,LinkedIn,Flight Delay Predictor,"Mathematics, Stanford, Postdoc",https://assets.website-files.com/575a31d2ce5d0...


In [20]:
fellowInfo.shape

(791, 6)

In [21]:
fellowInfo.describe()

Unnamed: 0,name,title,company,project,background,imagelink
count,791,791,791,791,791,791
unique,791,170,409,790,731,791
top,Keoki Seu,Data Scientist,Facebook,IndieRecommender: Find Indie games like video ...,"Neuroscience, New York University, Postdoc",https://assets.website-files.com/575a31d2ce5d0...
freq,1,435,54,2,5,1


In [22]:
fellowInfo['project'].values

array(['Measuring the Impact of Open Source Coders on Github',
       'Email Searcher: Search &amp; Visualize Your Mailbox',
       'Tweet Timeline: Visualizing the Impact of Social Media',
       'Sentiment Analysis Based on Social Media',
       'Flight Delay Predictor',
       'Email Contacts Automatically Grouped &amp; Ranked',
       'Analysis of New York Stop-and-Frisk Data',
       'CouchTube: YouTube TV shows in one click',
       'SchoolGeo: Empowering Parents to Make Informed Decisions',
       'Event Map: Get Local News from Twitter',
       'Sort My Friends: Automatically Organize your Connections',
       'Flat Finder: Find Your Perfect Apartment', 'Kiva Loan Checker',
       'Arrive in Time: Airport Arrival Time Predictor',
       'STARtrack: Helping Educators Track Student Performance',
       'Diverse Recipe Recommendations', 'Multi-Domain Recommendations',
       'Stock Volatility PredictorÂ\xa0', 'Lower Your Electricity Bill',
       'Find the Expert on Stack Overflow