In [34]:
import requests
from bs4 import BeautifulSoup

In [19]:
def getPage(url):
    """
    Utilty function used to get a Beautiful Soup object from a given URL
    """

    session = requests.Session()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    try:
        req = session.get(url, headers=headers)
    except requests.exceptions.RequestException:
        return None
    bs = BeautifulSoup(req.text, 'html.parser')
    return bs

## Dealing with different website layouts

In [20]:
import requests

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body


def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')


def scrapeNYTimes(url):
    bs = getPage(url)
    title = bs.find('h1').text
    lines = bs.select('div.StoryBodyCompanionColumn div p')
    body = '\n'.join([line.text for line in lines])
    return Content(url, title, body)

def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find('h1').text
    body = bs.find('div', {'class', 'post-body'}).text
    return Content(url, title, body)


url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'
content = scrapeNYTimes(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: Delivering inclusive urban access: 3 uncomfortable truths
URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/


The past few decades have been filled with a deep optimism about the role of cities and suburbs across the world. These engines of economic growth host a majority of world population, are major drivers of economic innovation, and have created pathways to opportunities for untold amounts of people.	
Authors






Jeffrey Gutman
Senior Fellow - Global Economy and Development







Adie Tomer
Fellow - Metropolitan Policy Program

 Twitter
AdieTomer






But all is not well within our so-called Urban Century. Rapid urbanization, rising gentrification, concentrated poverty, and shortages of basic infrastructure have combined to create spatial inequity in cities and suburbs across the globe. The challenges of housing, moving, and employing so many people have led to longer travel times, rising housing 

Title: The Men Who Want to Live Forever
URL: https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html

Would you like to live forever? Some billionaires, already invincible in every other way, have decided that they also deserve not to die. Today several biotech companies, fueled by Silicon Valley fortunes, are devoted to “life extension” — or as some put it, to solving “the problem of death.”
It’s a cause championed by the tech billionaire Peter Thiel, the TED Talk darling Aubrey de Gray, Google’s billion-dollar Calico longevity lab and investment by Amazon’s Jeff Bezos. The National Academy of Medicine, an independent group, recently dedicated funding to “end aging forever.”
As the longevity entrepreneur Arram Sabeti told The New Yorker: “The proposition that we can live forever is obvious. It doesn’t violate the laws of physics, so we can achieve it.” Of all the slightly creepy aspects to this trend, the strangest is the least noticed: The people publicly ch

In [21]:
class Content:
    """
    Common base class for all articles/pages
    """
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """
        Flexible printing function controls output
        """
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

class Website:
    """ 
    Contains information about website structure
    """

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [22]:
import requests
from bs4 import BeautifulSoup


class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        Utilty function used to get a content string from a Beautiful Soup
        object and a selector. Returns an empty string if no object
        is found for the given selector
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, site, url):
        """
        Extract content from a given page URL
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

In [23]:
crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'div.StoryBodyCompanionColumn div p']
]
websites = []
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')
crawler.parse(
    websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(
    websites[2],
    'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(
    websites[3], 
    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')

URL: http://shop.oreilly.com/product/0636920028154.do
TITLE: Learning Python, 5th Edition 
BODY:

Get a comprehensive, in-depth introduction to the core Python language with this hands-on book. Based on author Mark Lutz’s popular training course, this updated fifth edition will help you quickly write efficient, high-quality code with Python. It’s an ideal way to begin, whether you’re new to programming or a professional developer versed in other languages. 

Complete with quizzes, exercises, and helpful illustrations,  this easy-to-follow, self-paced tutorial gets you started with both Python 2.7 and 3.3— the latest releases in the 3.X  and 2.X lines—plus all other releases in common use today. You’ll also learn some advanced language features that recently have become more common in Python code.

Explore Python’s major built-in object types such as numbers, lists, and dictionaries 
Create and process objects with Python statements, and learn Python’s general syntax model
Use functions

URL: https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html
TITLE: Oil Boom Gives the U.S. a New Edge in Energy and Diplomacy
BODY:
HOUSTON — A substantial rise in oil prices in recent months has led to a resurgence in American oil production, enabling the country to challenge the dominance of Saudi Arabia and dampen price pressures at the pump.
The success has come in the face of efforts by Saudi Arabia and its oil allies to undercut the shale drilling spree in the United States. Those strategies backfired and ultimately ended up benefiting the oil industry.
Overcoming three years of slumping prices proved the resiliency of the shale boom. Energy companies and their financial backers were able to weather market turmoil — and the maneuvers of the global oil cartel — by adjusting exploration and extraction techniques.
After a painful shakeout in the industry that included scores of bankruptcies and a significant loss of jobs, a steadier shale-drilling industry is a

## Crawling through sites with search

In [24]:
class Content:
    """Common base class for all articles/pages"""

    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.title = title
        self.body = body
        self.url = url

    def print(self):
        """
        Flexible printing function controls output
        """
        print('New article found for topic: {}'.format(self.topic))
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

In [25]:
class Website:
    """Contains information about website structure"""

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [26]:
import requests
from bs4 import BeautifulSoup

class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        """
        Searches a given website for a given topic and records all pages found
        """
        bs = self.getPage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs['href']
            # Check to see whether it's a relative or an absolute URL
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print('Something was wrong with that page or URL. Skipping!')
                return
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, title, body, url)
                content.print()


crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]
sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2],
                         row[3], row[4], row[5], row[6], row[7]))

topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

GETTING INFO ABOUT: python
New article found for topic: python
URL: Learning Python, 5th Edition 
TITLE: 
Get a comprehensive, in-depth introduction to the core Python language with this hands-on book. Based on author Mark Lutz’s popular training course, this updated fifth edition will help you quickly write efficient, high-quality code with Python. It’s an ideal way to begin, whether you’re new to programming or a professional developer versed in other languages. 

Complete with quizzes, exercises, and helpful illustrations,  this easy-to-follow, self-paced tutorial gets you started with both Python 2.7 and 3.3— the latest releases in the 3.X  and 2.X lines—plus all other releases in common use today. You’ll also learn some advanced language features that recently have become more common in Python code.

Explore Python’s major built-in object types such as numbers, lists, and dictionaries 
Create and process objects with Python statements, and learn Python’s general syntax model
Use f

KeyboardInterrupt: 

## Crawling Sites through Links

In [27]:
class Website:

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.targetPattern = targetPattern
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag


class Content:

    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

In [30]:
import re


class Crawler:
    def __init__(self, site):
        self.site = site
        self.visited = []

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """
        Get pages from website home page
        """
        bs = self.getPage(self.site.url)
        targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)


reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)',
                  False, 'h1', 'div.StandardArticleBody_body')
crawler = Crawler(reuters)
crawler.crawl()

URL: https://www.reuters.com/article/wrapup-fed-powell-0103-thur-idCNKCS1OY092
TITLE: 焦点：鲍威尔推崇讲话简明 但还是需要字斟句酌--联储前副主席
BODY:
路透亚特兰大1月3日 - 美国联邦储备委员会(美联储)一位前官员周四表示，联储主席鲍威尔希望在工作中更加简单明了地讲话，这已经让他自己深陷困境，未来他讲话需要更加字斟句酌。 2018年12月19日，美国华盛顿，美联储主席鲍威尔在政策会议后的记者会上讲话。REUTERS/Yuri Gripas正当美联储开始对自己的操作进行全面评估之际，前美联储副主席科恩在一份他所称的”备忘录”当中表示，鲍威尔以及其它美联储官员需要仔细评估他们的沟通方式，以便投资者和公众都能够更好地了解联储的工作和政策。 科恩在乔治梅森大学莫卡特斯中心(Mercatus Center)举办的一个有关美联储政策的会议上称，”正如我们近来看到的，很难”用简单的英语来描述美联储管理利率和通胀的工作，这是一个科学、艺术和公众心理学兼而有之的过程。 “鲍威尔主席需要小心，并不能因为自己使用简单明了的英文，就无需必要的字斟句酌。” 他表示，鲍威尔领导下的美联储特别强调”数据依赖性”，或者是根据一连串经济统计数据来设定政策，减少对经济模型指引的依赖，这有可能”导致所传达信息的频繁变动以及政策反覆”。    鲍威尔最近公开露面中的即兴讲话方式导致美联储政策面呈现不确定性，而且似乎在2018年秋季的市场抛盘中发挥了作用。他今年公开露面的机会将增加，从本月开始，美联储年内八次政策会议的每次会后，他都将召开新闻发布会。 他还在日程安排中放上了公开问答环节，包括周五上午在美国经济学会的一次会议上，并强调向公众证明美联储行动合理性的重要性。 目前正是特别动荡的时期，美国总统特朗普不满于美联储升息，全球经济增长放缓，而且投资者对于美国经济复苏是否还能继续产生疑虑。 债市近来给出令人不安的信号，两年期美债收益率周四一度跌破美联储短期目标政策利率。这传统上是投资者预期美联储近期将被迫降息而不是继续升息的迹象。 科恩于2002年成为美联储理事。2006-2010年间，他作为美联储副主席帮助制定了抵御深度金融危机和经济衰退的政策。 他在讲话中重新提到之前的部分策略，举例说，在美国CBS公司著名的

URL: https://www.reuters.com/article/apple-cargillchina-wrapup0103-thur-idCNKCS1OY06L
TITLE: 焦点：苹果与嘉吉中国市场表现失色 特朗普贸易战火烧到自家人
BODY:
路透亚特兰大/纽约1月3日 - 美国科技业与农业两大龙头苹果(AAPL.O)与嘉吉疲弱的销售表现，或许是迄今最明显的信号表明美国总统特朗普追求重置全球贸易版图，让国内付出了代价；美国或将更显孤立，其做为全球经济增长发动机的力道将愈加脆弱。 2018年11月5日，中国上海，中国国际进口博览会上的嘉吉企业标识。REUTERS/Aly Song苹果周三罕见地调降季度营收预估，执行长库克将之归咎于中国市场的iPhone销售放缓。谷物贸易商嘉吉周四公布季度净利锐减20%，因全球贸易关系紧张及在中国生猪行业面临严峻挑战。 中国去年经济增长率可能高于6%，将是中国自十年前金融危机深谷以来经济增长最为温吞的表现，反映出较过去几年放缓以及近几个月内经济减速的态势。 企业和经济领导人10年来一直寄望，中国消费者购买力日益提升将支撑全球同步成长的时代。但美中贸易战令这样的期待受到威胁。 中国经济急剧放缓和其他地区表现疲弱，也可能令美国消费者得充当最重要的堡垒，来抵御更广泛的全球经济滑坡。美国消费者支出占该国经济活动的比重超过三分之二，且在家庭收入及薪资增长之际，消费者迄今为止仍乐于支出。 “美国做为世界火车头，与特朗普减少贸易逆差政策的目标之间存在矛盾。这也是美国消费者要充当世界火车头将充满挑战的另一个原因，”花旗全球首席经济学者暨经济合作暨发展组织(OECD)前首席经济学家曼恩说。 她表示，”我们正在仔细研究(美国、德国和其他地方在)国内活动强劲及外部活动趋弱之间的平衡”，以及”中国扭转该国经济轨道的政策效果”。 美国经济增长的其他推动因素也都在消退，或者在接下来的几个月料会消退，其中包括政府和企业支出以及净出口。 美国2018年非常强劲的经济增长势头预计将降温，但应会保持强劲直到2020年中期。路透调查分析师预计届时经济增长率将降至1.8%。国际货币基金组织(IMF)去年10月时将全球2019年经济增长预估调降至3.7%，指因贸易战；花旗则在12月将其预估调降至3.1%。 就在一年前还曾受到欢呼的全球主要经济体同步成长时代，如今已

URL: https://www.reuters.com/article/us-house-deal-border-wall-0104-idCNKCS1OY08Y?il=0
TITLE: 美国众议院通过议案结束政府关门 但不包含美边境筑墙拨款
BODY:
路透华盛顿1月3日 - 美国众议院周四批准一项议案以结束联邦政府部分关门状态，批准向国土安全部拨款至2月8日。美国多个联邦机构在接近两周前开始关门。 根据议案，国务院、商务部、农业部、劳工部、财政部和其他联邦机构的拨款将持续到9月3日当前财年结束。 在表决前几个小时，白宫称，总统特朗普的顾问将建议，如果国会通过这项没有向美墨边境筑墙额外拨款的议案，总统应该将其否决。(完) 编译 汪红英；审校 张涛 Our Standards:The Thomson Reuters Trust Principles.
URL: https://www.reuters.com/article/brexit-referendum-0104-fri-idCNKCS1OY08C?il=0
TITLE: 英国脱欧大臣称二度脱欧公投只会加剧民众意见分歧
BODY:
路透柏林1月4日 - 英国脱欧事务大臣巴克利(Stephen Barclay)对德国世界报表示，再次进行英国退欧公投，只会让英国人民分化的情况愈加恶化。 “举行第二次退欧公投只会加深分歧，”他对德国世界报说，”如果与第二次公投可能引发的紧张形势相较，目前英国民众的分歧程度较小。二次公投只会让我们的国家更加分裂。” 巴克利表示，不能在欧洲议会5月选举前举办第二次英国退欧公投。 “不过之后欧洲选举将必须在英国举行，”他表示，”这意味着对民主的重大伤害，因为大家已经投票决定退欧，但必须在5月底再次投票。” “我们的欧洲同事对(二度退欧公投)也不会有丝毫兴趣，因为那将招致非常民粹主义的反应，”他表示。(完) 编译 陈宗琦；审校 徐文焰 Our Standards:The Thomson Reuters Trust Principles.


KeyboardInterrupt: 

## Crawling multiple page types

In [35]:
class Website:
    """Common base class for all articles/pages"""

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        

In [36]:
class Product(Website):
    """Contains information for scraping a product page"""

    def __init__(self, name, url, titleTag, productNumber, price):
        Website.__init__(self, name, url, TitleTag)
        self.productNumberTag = productNumberTag
        self.priceTag = priceTag

class Article(Website):
    """Contains information for scraping an article page"""

    def __init__(self, name, url, titleTag, bodyTag, dateTag):
        Website.__init__(self, name, url, titleTag)
        self.bodyTag = bodyTag
        self.dateTag = dateTag

In [37]:

def parsePage(url):
    
    if '/ideas/' in url:
        

oreilly = Website('O\'Reilly', 'https://oreilly.com', 'h1' '')        

IndentationError: expected an indented block (<ipython-input-37-3b55228eaa62>, line 7)