In [1]:
import feedparser
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup 
import urllib

### URL of the news website's RSS feed

In [2]:
# RSS feed URL for the New York Times "World" section (read by parseRSS below)
rss_url = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml"

### Functions used in this notebook

In [3]:
# download the RSS feed at `url` and hand back feedparser's parsed result
def parseRSS(url):
    """Fetch and parse the feed located at ``url`` via ``feedparser.parse``.

    Returns whatever ``feedparser.parse`` returns for that URL.
    """
    parsed_feed = feedparser.parse(url)
    return parsed_feed

# return list of news' info in specific area
def getInfo(data, area):
    """Collect one field from every entry of a parsed feed.

    Parameters:
        data: parsed feed; its entries are read from ``data['entries']``.
        area: key to look up in each entry (e.g. "title", "link").

    Returns:
        A list with one item per entry, in feed order. Entries that do not
        have the ``area`` key contribute an empty string, so the result
        always has the same length as ``data['entries']``.
    """
    info = []
    for news in data['entries']:
        try:
            info.append(news[area])
        except KeyError:
            # Only a missing field is expected here; anything else (including
            # a kernel interrupt) should surface instead of being swallowed.
            info.append("")
    return info

# download the page behind a link and parse its HTML
def parseHTML(url):
    """Fetch ``url`` and return its content parsed with BeautifulSoup.

    Parameters:
        url: address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        "html.parser" backend.

    Raises:
        Whatever ``urllib.request.urlopen`` raises when the page cannot be
        fetched; the caller decides how to handle it.
    """
    # A bare `import urllib` does not load the `request` submodule, so import
    # it explicitly instead of relying on another library having done so.
    import urllib.request

    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        content = response.read()
    soup = BeautifulSoup(content, "html.parser")
    return soup

### Fetch and parse the RSS feed

In [4]:
# Fetch and parse the feed at rss_url; later cells read its entries from data['entries'] (via getInfo)
data = parseRSS(rss_url)

### create DataFrame

In [5]:
# Feed-entry fields to collect, one DataFrame column per field
areas = [
    "title",
    "published",
    "author",
    "summary",
    "link",
]
# Start from an empty frame; the columns are added in the next cell
df = pd.DataFrame()

In [6]:
# Add one column per requested field; getInfo returns one value per feed entry
# (an empty string where the entry lacks that field), so all columns line up.
for area in areas:
    df[area] = getInfo(data, area)

In [7]:
# Rename columns. The assignment is positional, so this list must stay in the
# same order as `areas`: "published" becomes "date" and "summary" becomes "description".
df.columns = ["title", "date", "author", "description", "link"]

In [8]:
# Preview the first rows of the feed metadata
df.head()

Unnamed: 0,title,date,author,description,link
0,White House Bars Iranian Officials From U.S. a...,"Thu, 26 Sep 2019 01:14:07 +0000","Farnaz Fassihi, Lara Jakes and Edward Wong",The move was extraordinary given that dozens o...,https://www.nytimes.com/2019/09/25/world/middl...
1,"In a Twist, Netanyahu Wins a Chance to Keep Hi...","Thu, 26 Sep 2019 01:10:58 +0000",Isabel Kershner,"The Israeli prime minister, Benjamin Netanyahu...",https://www.nytimes.com/2019/09/25/world/middl...
2,China Wants the World to Stay Silent on Muslim...,"Wed, 25 Sep 2019 17:48:49 +0000",Jane Perlez,Beijing is using economic and diplomatic press...,https://www.nytimes.com/2019/09/25/world/asia/...
3,Boris Johnson Back Home. Parliament Back in Se...,"Wed, 25 Sep 2019 22:47:23 +0000",Stephen Castle,"After a stinging Supreme Court defeat, the pri...",https://www.nytimes.com/2019/09/25/world/europ...
4,Indonesia’s Leader Faces Student Protests and ...,"Thu, 26 Sep 2019 01:52:46 +0000",Richard C. Paddock,President Joko Widodo’s decision to limit the ...,https://www.nytimes.com/2019/09/25/world/asia/...


### extract news full text from "link"

In [9]:
# soup = parseHTML("https://www.nytimes.com/2019/09/25/world/middleeast/us-iran-rouhani-trump.html?emc=rss&partner=rss") #get website content from link
# content = soup.findAll("p") #get content of <p>
# content

In [10]:
# Download every article and store its body text in a new "full text" column.
# The column starts out empty so that articles which fail to download simply
# keep an empty string.
df["full text"] = ""
for index, link in df["link"].items():
    try:
        soup = parseHTML(link)  # get website content from link
        # Only <p> tags carrying these CSS classes are collected; the class
        # string is copied from the site's markup and will need updating if
        # the site changes it.
        paragraphs = soup.find_all("p", {"class": "css-exrw3m evys1bk0"})
        text = ' '.join(paragraph.text for paragraph in paragraphs)
        # Write through the frame itself: the row objects yielded by
        # iterrows() may be copies, so assigning to them is not guaranteed
        # to change df.
        df.at[index, "full text"] = text
    except Exception as error:
        # Best effort: one unreachable or malformed page must not abort the
        # whole run, but report it instead of hiding the failure. Catching
        # Exception (not a bare except) keeps kernel interrupts working.
        print("Skipped {}: {}".format(link, error))

In [11]:
# Preview the first rows again, now including the "full text" column
df.head()

Unnamed: 0,title,date,author,description,link,full text
0,White House Bars Iranian Officials From U.S. a...,"Thu, 26 Sep 2019 01:14:07 +0000","Farnaz Fassihi, Lara Jakes and Edward Wong",The move was extraordinary given that dozens o...,https://www.nytimes.com/2019/09/25/world/middl...,UNITED NATIONS — The Trump administration barr...
1,"In a Twist, Netanyahu Wins a Chance to Keep Hi...","Thu, 26 Sep 2019 01:10:58 +0000",Isabel Kershner,"The Israeli prime minister, Benjamin Netanyahu...",https://www.nytimes.com/2019/09/25/world/middl...,After the polls closed in the Israeli election...
2,China Wants the World to Stay Silent on Muslim...,"Wed, 25 Sep 2019 17:48:49 +0000",Jane Perlez,Beijing is using economic and diplomatic press...,https://www.nytimes.com/2019/09/25/world/asia/...,"BEIJING — When Turkey’s leader, Recep Tayyip E..."
3,Boris Johnson Back Home. Parliament Back in Se...,"Wed, 25 Sep 2019 22:47:23 +0000",Stephen Castle,"After a stinging Supreme Court defeat, the pri...",https://www.nytimes.com/2019/09/25/world/europ...,LONDON — A day after suffering a crushing rebu...
4,Indonesia’s Leader Faces Student Protests and ...,"Thu, 26 Sep 2019 01:52:46 +0000",Richard C. Paddock,President Joko Widodo’s decision to limit the ...,https://www.nytimes.com/2019/09/25/world/asia/...,"JAKARTA, Indonesia — Indonesia’s president, Jo..."


### output dataframe to csv

In [12]:
# Save the frame as UTF-8 CSV in the working directory. index=False is not
# passed, so the row index is written as an extra, unnamed first column.
df.to_csv("new_york_times_news.csv", encoding='utf-8')