In [1]:
import lxml
import re
import requests
import tweepy
import csv, json
import matplotlib
import numpy as np
import pandas as pd
import string
from bs4 import BeautifulSoup
from tweepy import OAuthHandler
from nltk import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [2]:
### First, scrape links from Breitbart ###
# Fills two module-level lists from the paginated listing pages:
#   article_id   - the `id` attribute of every <article> tag found
#   article_link - the first href inside every div.article-content found
# The two lists are built from different tags and are not guaranteed to be
# the same length or row-aligned.
article_id = []
article_link = []

print("Scraping:")
# About 55 articles per page; range(1, 118) walks listing pages 1 through 117.
for i in range(1, 118):
    url = 'http://www.breitbart.com/news/source/breitbart-news/page/' + str(i)
    try:
        # The timeout keeps one stalled request from hanging the whole crawl.
        r = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        # Best effort: report the failed page and carry on with the next one.
        print(e)
        print("Connection Error - No links scraped for:", url)
        continue
    data = r.text
    soup = BeautifulSoup(data, 'lxml')
    article_list = soup.find('div', { 'class' : 'articles-list' })

    # soup.find returns None when the container is absent (error page, changed
    # layout); skip the page instead of crashing on None.find_all.
    if article_list is None:
        print("No article list found for:", url)
        continue

    # Get article ID
    for item in article_list.find_all('article'):
        article_id.append(item.get('id'))

    # Get article content (with and without thumb content)
    article_content = list(article_list.find_all('div', { 'class' : 'article-content' }))

    for item in article_content:
        anchor = item.find('a')
        # Skip entries with no anchor or no href so article_link only ever
        # holds URLs that the later requests.get calls can fetch.
        if anchor is not None and anchor.get('href'):
            article_link.append(anchor.get('href'))
    print(len(article_link), end = '...')

Scraping:
55...110...165...220...275...330...385...440...495...550...605...660...715...770...825...880...935...990...1045...1100...1155...1210...1265...1320...1375...1430...1485...1540...1595...1650...1705...1760...1815...1870...1925...1980...2035...2090...2145...2200...2255...2310...2365...2420...2475...2530...2585...2640...2695...2750...2805...2860...2915...2970...3025...3080...3135...3190...3245...3300...3355...3410...3465...3520...3575...3630...3685...3740...3795...3850...3905...3960...4015...4070...4125...4180...4235...4290...4345...4400...4455...4510...4565...4620...4675...4730...4785...4840...4895...4950...5005...5060...5115...5170...5225...5280...5335...5390...5445...5500...5555...5610...5665...5720...5775...5830...5885...5940...5995...6050...6105...6160...6215...6270...6325...6380...6435...

In [3]:
# Loop through Week 1 (links 0-1499 of article_link)
# Builds four parallel lists - one entry per successfully scraped article -
# that the next cell turns into a DataFrame.
X_articles = []
X_links = []
X_author = []
X_date = []

for link in article_link[0:1500]:
    try:
        # Fetch inside the try so a network failure skips this link instead of
        # aborting the run; the timeout stops a stalled request from hanging.
        r = requests.get(link, timeout=30)
        data = r.text
        soup = BeautifulSoup(data, 'lxml')
        entry = soup.find('div', {'class':'entry-content'})

        # Save article text: keep every <p> unless it has class 'sh2',
        # contains an <em>, or carries dir="ltr".
        content = []
        for item in entry.find_all('p'):
            if item.get('class') != ['sh2'] and item.find('em') is None and item.get('dir') != 'ltr':
                content.append(item.get_text())
        # Combine paragraphs within article
        article_text = ' '.join(content)

        # Author and date: find() returns None when the tag is missing, so
        # .get_text() raises AttributeError and the article is skipped below.
        author = soup.find('a', {'class':'byauthor'})
        author_name = author.get_text()
        date = soup.find('span', {'class':'bydate'})
        date_text = date.get_text()

    except AttributeError as a:
        print(a)
        print("Attribute Error - No content scraped for:", link)
    except (requests.exceptions.RequestException, ConnectionError) as e:
        # requests raises its own exception hierarchy, which does not derive
        # from the builtin ConnectionError; catch both.
        print(e)
        print("Connection Error - No content scraped for:", link)
    else:
        # Append all four fields together, only once every one was extracted,
        # so the lists stay equal-length and row-aligned for pd.DataFrame.
        X_links.append(link)
        X_articles.append(article_text)
        X_author.append(author_name)
        X_date.append(date_text)

    print(len(X_articles), end='...')


1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...16...17...18...19...20...21...22...23...24...25...26...27...28...29...30...31...32...33...34...35...36...37...38...39...40...41...42...43...44...45...46...47...48...49...50...51...52...53...54...55...56...57...58...59...60...61...62...63...64...65...66...67...68...69...70...71...72...73...74...75...76...77...78...79...80...81...82...83...84...85...86...87...88...89...90...91...92...93...94...95...96...97...98...99...100...101...102...103...104...105...106...107...108...109...110...111...112...113...114...115...116...117...118...119...120...121...122...123...124...125...126...127...128...129...130...131...132...133...134...135...136...137...138...139...140...141...142...143...144...145...146...147...148...149...150...151...152...153...154...155...156...157...158...159...160...161...162...163...164...165...166...167...168...169...170...171...172...173...174...175...176...177...178...179...180...181...182...183...184...185.

In [4]:
# Persist the Week 1 scrape: one row per article, columns link/text/author/date,
# written to CSV with the default integer index.
breitbart_data_1 = pd.DataFrame(
    dict(link=X_links, text=X_articles, author=X_author, date=X_date)
)
breitbart_data_1.to_csv('breitbart_articles-1.csv')

In [5]:
# Loop through Week 2 (links 1500-2999 of article_link)
# Builds four parallel lists - one entry per successfully scraped article -
# that the next cell turns into a DataFrame.
X_articles = []
X_links = []
X_author = []
X_date = []

for link in article_link[1500:3000]:
    try:
        # Fetch inside the try so a network failure skips this link instead of
        # aborting the run; the timeout stops a stalled request from hanging.
        r = requests.get(link, timeout=30)
        data = r.text
        soup = BeautifulSoup(data, 'lxml')
        entry = soup.find('div', {'class':'entry-content'})

        # Save article text: keep every <p> unless it has class 'sh2',
        # contains an <em>, or carries dir="ltr".
        content = []
        for item in entry.find_all('p'):
            if item.get('class') != ['sh2'] and item.find('em') is None and item.get('dir') != 'ltr':
                content.append(item.get_text())
        # Combine paragraphs within article
        article_text = ' '.join(content)

        # Author and date: find() returns None when the tag is missing, so
        # .get_text() raises AttributeError and the article is skipped below.
        author = soup.find('a', {'class':'byauthor'})
        author_name = author.get_text()
        date = soup.find('span', {'class':'bydate'})
        date_text = date.get_text()

    except AttributeError as a:
        print(a)
        print("Attribute Error - No content scraped for:", link)
    except (requests.exceptions.RequestException, ConnectionError) as e:
        # requests raises its own exception hierarchy, which does not derive
        # from the builtin ConnectionError; catch both. The builtin
        # ConnectionError also covers ConnectionResetError.
        print(e)
        print("Connection Error - No content scraped for:", link)
    else:
        # Append all four fields together, only once every one was extracted,
        # so the lists stay equal-length and row-aligned for pd.DataFrame.
        X_links.append(link)
        X_articles.append(article_text)
        X_author.append(author_name)
        X_date.append(date_text)

    print(len(X_articles), end='...')

1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...16...17...18...19...20...21...22...23...24...25...26...27...28...29...30...31...32...33...34...35...36...37...38...39...40...41...42...43...44...45...46...47...48...49...50...51...52...53...54...55...56...57...58...59...60...61...62...63...64...65...66...67...68...69...70...71...72...73...74...75...76...77...78...79...80...81...82...83...84...85...86...87...88...89...90...91...92...93...94...95...96...97...98...99...100...101...102...103...104...105...106...107...108...109...110...111...112...113...114...115...116...117...118...119...120...121...122...123...124...125...126...127...128...129...130...131...132...133...134...135...136...137...138...139...140...141...142...143...144...145...146...147...148...149...150...151...152...153...154...155...156...157...158...159...160...161...162...163...164...165...166...167...168...169...170...171...172...173...174...175...176...177...178...179...180...181...182...183...184...185.

In [6]:
# Persist the Week 2 scrape: one row per article, columns link/text/author/date,
# written to CSV with the default integer index.
breitbart_data_2 = pd.DataFrame(
    dict(link=X_links, text=X_articles, author=X_author, date=X_date)
)
breitbart_data_2.to_csv('breitbart_articles-2.csv')

In [7]:
# Loop through Week 3 (links 3000-4499 of article_link)
# Builds four parallel lists - one entry per successfully scraped article -
# that the next cell turns into a DataFrame.
X_articles = []
X_links = []
X_author = []
X_date = []

for link in article_link[3000:4500]:
    try:
        # Fetch inside the try so a network failure skips this link instead of
        # aborting the run; the timeout stops a stalled request from hanging.
        r = requests.get(link, timeout=30)
        data = r.text
        soup = BeautifulSoup(data, 'lxml')
        entry = soup.find('div', {'class':'entry-content'})

        # Save article text: keep every <p> unless it has class 'sh2',
        # contains an <em>, or carries dir="ltr".
        content = []
        for item in entry.find_all('p'):
            if item.get('class') != ['sh2'] and item.find('em') is None and item.get('dir') != 'ltr':
                content.append(item.get_text())
        # Combine paragraphs within article
        article_text = ' '.join(content)

        # Author and date: find() returns None when the tag is missing, so
        # .get_text() raises AttributeError and the article is skipped below.
        author = soup.find('a', {'class':'byauthor'})
        author_name = author.get_text()
        date = soup.find('span', {'class':'bydate'})
        date_text = date.get_text()

    except AttributeError as a:
        print(a)
        print("Attribute Error - No content scraped for:", link)
    except (requests.exceptions.RequestException, ConnectionError) as e:
        # requests raises its own exception hierarchy, which does not derive
        # from the builtin ConnectionError; catch both.
        print(e)
        print("Connection Error - No content scraped for:", link)
    else:
        # Append all four fields together, only once every one was extracted,
        # so the lists stay equal-length and row-aligned for pd.DataFrame.
        X_links.append(link)
        X_articles.append(article_text)
        X_author.append(author_name)
        X_date.append(date_text)

    print(len(X_articles), end='...')

1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...16...17...18...19...20...21...22...23...24...25...26...27...28...29...30...31...32...33...34...35...36...37...38...39...40...41...42...43...44...45...46...47...48...49...50...51...52...53...54...55...56...57...58...59...60...61...62...63...64...65...66...67...68...69...70...71...72...73...74...75...76...77...78...79...80...81...82...83...84...85...86...87...88...89...90...91...92...93...94...95...96...97...98...99...100...101...102...103...104...105...106...107...108...109...110...111...112...113...114...115...116...117...118...119...120...121...122...123...124...125...126...127...128...129...130...131...132...133...134...135...136...137...138...139...140...141...142...143...144...145...146...147...148...149...150...151...152...153...154...155...156...157...158...159...160...161...162...163...164...165...166...167...168...169...170...171...172...173...174...175...176...177...178...179...180...181...182...183...184...185.

In [8]:
# Persist the Week 3 scrape: one row per article, columns link/text/author/date,
# written to CSV with the default integer index.
breitbart_data_3 = pd.DataFrame(
    dict(link=X_links, text=X_articles, author=X_author, date=X_date)
)
breitbart_data_3.to_csv('breitbart_articles-3.csv')

In [29]:
# Loop through Week 4 (links 4500 through the end of article_link)
# Builds four parallel lists - one entry per successfully scraped article -
# that the next cell turns into a DataFrame.
X_articles = []
X_links = []
X_author = []
X_date = []

for link in article_link[4500:]:
    try:
        # Fetch inside the try so a network failure skips this link instead of
        # aborting the run; the timeout stops a stalled request from hanging.
        r = requests.get(link, timeout=30)
        data = r.text
        soup = BeautifulSoup(data, 'lxml')
        entry = soup.find('div', {'class':'entry-content'})

        # Save article text: keep every <p> unless it has class 'sh2',
        # contains an <em>, or carries dir="ltr".
        content = []
        for item in entry.find_all('p'):
            if item.get('class') != ['sh2'] and item.find('em') is None and item.get('dir') != 'ltr':
                content.append(item.get_text())
        # Combine paragraphs within article
        article_text = ' '.join(content)

        # Author and date: find() returns None when the tag is missing, so
        # .get_text() raises AttributeError and the article is skipped below.
        author = soup.find('a', {'class':'byauthor'})
        author_name = author.get_text()
        date = soup.find('span', {'class':'bydate'})
        date_text = date.get_text()

    except AttributeError as a:
        print(a)
        print("Attribute Error - No content scraped for:", link)
    except (requests.exceptions.RequestException, ConnectionError) as e:
        # requests raises its own exception hierarchy, which does not derive
        # from the builtin ConnectionError; catch both.
        print(e)
        print("Connection Error - No content scraped for:", link)
    else:
        # Append all four fields together, only once every one was extracted,
        # so the lists stay equal-length and row-aligned for pd.DataFrame.
        X_links.append(link)
        X_articles.append(article_text)
        X_author.append(author_name)
        X_date.append(date_text)

    print(len(X_articles), end='...')

1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...16...17...18...19...20...21...22...23...24...25...26...27...28...29...30...31...32...33...34...35...36...37...38...39...40...41...42...43...44...45...46...47...48...49...50...51...52...53...54...55...56...57...58...59...60...61...62...63...64...65...66...67...68...69...70...71...72...73...74...75...76...77...78...79...80...81...82...83...84...85...86...87...88...89...90...91...92...93...94...95...96...97...98...99...100...101...102...103...104...105...106...107...108...109...110...111...112...113...114...115...116...117...118...119...120...121...122...123...124...125...126...127...128...129...130...131...132...133...134...135...136...137...138...139...140...141...142...143...144...145...146...147...148...149...150...151...152...153...154...155...156...157...158...159...160...161...162...163...164...165...166...167...168...169...170...171...172...173...174...175...176...177...178...179...180...181...182...183...184...185.

In [30]:
# Persist the Week 4 scrape: one row per article, columns link/text/author/date,
# written to CSV with the default integer index.
breitbart_data_4 = pd.DataFrame(
    dict(link=X_links, text=X_articles, author=X_author, date=X_date)
)
breitbart_data_4.to_csv('breitbart_articles-4.csv')

In [31]:
# Stack the four weekly frames top to bottom; ignore_index=True discards each
# frame's own row labels and renumbers the combined rows from 0.
breitbart_all = pd.concat(
    objs=[
        breitbart_data_1,
        breitbart_data_2,
        breitbart_data_3,
        breitbart_data_4,
    ],
    ignore_index=True,
)

In [32]:
# Write the combined dataset to disk first, then leave head() as the cell's
# final expression: a notebook only renders the value of the last expression,
# so a head() call placed before to_csv() is computed and silently discarded.
breitbart_all.to_csv('breitbart_all.csv')
breitbart_all.head()

In [11]:
# Position of the first occurrence of `link` within article_link.
# `link` is not set in this cell: it is whatever value the most recently
# executed `for link in article_link[...]` scraping loop left behind, so the
# result depends on which scraping cell ran last (and how far it got).
article_link.index(link)

5169

In [13]:
# Show the `text` column of the scraped frame.
# NOTE(review): `breitbart_data` is not assigned in any cell visible here (only
# breitbart_data_1..4 and breitbart_all are) - presumably left over from an
# earlier kernel session; confirm which frame is intended before re-running.
breitbart_data.text

0      Monday in Miami after attending a Martin Luthe...
1      Monday in Miami after attending a Martin Luthe...
2      BMW AG responded to Trump’s claims by saying t...
3      Judd Apatow is stress-eating Oreos by the slee...
4      The distance is roughly 1.5 miles down Pennsyl...
5      The two-year program, set to launch this summe...
6      Vanity Fair reports that multiple people close...
7      On Monday’s broadcast of “MSNBC Live,” Represe...
8      Aside from his disrespect for the national ant...
9      Monday on CNN International’s “Amanpour,” comm...
10     The report, titled “Degrees of Racism”, highli...
11     Along with migrants from Central America, Afri...
12     “This is a real scandal,” said Habib, who is a...
13     The incident occurred around 3:30 a.m. on Janu...
14     In a video that accompanies the “Troubled Time...
15     Turkey claims that Gulen’s network of charter ...
16      Student protesters and “anti-fascists” can be...
17     It is that policy that D

In [14]:
# Echo the author list as left by the most recently executed scraping cell
# (each scraping cell resets X_author to an empty list before filling it).
# NOTE(review): this prints the entire list into the notebook output.
X_author

['Pam Key',
 'Pam Key',
 'Katherine Rodriguez',
 'Daniel Nussbaum',
 'Alex Swoyer',
 'Jerome Hudson',
 'Lucas Nolan',
 'Ian Hanchett',
 'Robert J. Marlow',
 'Pam Key',
 'Jack Montgomery',
 'John Binder',
 'Aaron Klein',
 'AWR Hawkins',
 'Jeff Poor',
 'Frances Martel',
 'Charlie Nash',
 'John Binder',
 'Trent Baker',
 'Katherine Rodriguez',
 'Warner Todd Huston',
 'Warner Todd Huston',
 'Charlie Spiering',
 'Pam Key',
 'Jerome Hudson',
 'Michelle Moons',
 'Frances Martel',
 'Jerome Hudson',
 'AWR Hawkins',
 'Pam Key',
 'Tom Ciccotta',
 'Breitbart London',
 'Katie McHugh',
 'Trent Baker',
 'Pam Key',
 'Breitbart London',
 'AWR Hawkins',
 'AWR Hawkins',
 'Dr. Susan Berry',
 'Breitbart London',
 'Dr. Susan Berry',
 'Breitbart London',
 'Jack Montgomery',
 'Lee Stranahan',
 'Liam Deacon',
 'John Hayward',
 'Breitbart Jerusalem',
 'Ildefonso Ortiz',
 'Joel B. Pollak',
 'Liam Deacon',
 'Donna Rachel Edmunds',
 'Breitbart Jerusalem',
 'Matthew Boyle',
 'Bob Price',
 'Trent Baker',
 'Amanda Hou