In [1]:
import re
import pymongo
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from os.path import basename
from os.path import isfile

# ReadMe

This notebook reads newspaper articles from text files and stores them in a MongoDB database. Each article is cleaned first, so that bylines and similar meta information are removed. The cleaned text is then merged with the article's entry in the codebook, which records, among other things, which debate the article is about.

In [2]:
# Connect to the MongoDB server using the driver defaults (localhost:27017).
# `pymongo.Connection` was deprecated in PyMongo 2.4 and removed in 3.0;
# `MongoClient` is its replacement. Unlike `Connection`, it acknowledges writes
# by default, so a failed insert raises instead of passing silently.
client = pymongo.MongoClient()
# Debate articles are stored in the `debates` collection of the `politics` database.
collection = client.politics.debates

In [3]:
def cleanText(filename, encoding=None):
    """
    Cleans the article from unnecessary information, such as bylines and copyright information.

    A line is dropped when it starts with a ``Word:`` style meta tag or when it
    starts with whitespace (which also covers blank lines). Every digit is
    removed from the lines that are kept.

    @param filename: filename of the article to read.
    @param encoding: optional text encoding passed to ``open``; ``None`` keeps the platform default.
    returns a string containing the article text stripped of unnecessary information.
    """
    # Raw strings keep the regex escapes (\s, \d) from being parsed as (invalid) string escapes.
    meta_line = re.compile(r"[A-Za-z-]+:")  # meta information we don't need, e.g. "BYLINE:"
    indented_line = re.compile(r"\s+")  # copyright etc. information (starting with whitespace)
    digits = re.compile(r"\d")

    with open(filename, encoding=encoding) as fh:
        kept = [
            digits.sub("", line)
            for line in fh
            if not meta_line.match(line) and not indented_line.match(line)
        ]
    return "".join(kept)

In [4]:
def convertTypes(value):
    """
    Converts numpy scalars to the equivalent built-in Python type, in order to store values in MongoDB database.

    Handles every numpy integer width (not only ``np.int64``) as well as numpy
    floats and booleans. Any other value is returned unchanged.

    @param value: the value to convert.
    returns a plain ``int``, ``float`` or ``bool`` for numpy scalars, otherwise ``value`` itself.
    """
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    return value

# Data in 2012 Folder

Articles about the debate.

Additional information about each article is kept in a separate codebook.
The next cell reads the codebook and renames its columns.

In [14]:
# Read the 2012 codebook, give its columns short names and index the rows by
# article ID (the ID also stays available as a regular column).
df = pd.read_excel("data/2012/DebateCodeBook5.2015.xlsx")
df.columns = [
    "_id",
    "title",
    "debate",
    "days_after",
    "year",
    "newspaper",
    "word_count",
]
df = df.set_index("_id", drop=False)

In [15]:
# Display the first rows of the codebook as a quick check of the renamed columns.
df.head()

Unnamed: 0_level_0,_id,title,debate,days_after,year,newspaper,word_count
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,EIGHT Questions,1,0,2012,Washington Post,2215
2,2,The Denver Presidential Debate: Panel Verdict,1,1,2012,The Guardian,2548
3,3,Battling on the Home Front,1,0,2012,The Globe and Mail,2044
4,4,"After Debate, a Torrent of Criticism for Obama",1,2,2012,The New York Times,1476
5,5,REBOUNDING ROMNEY Mitt recharged after debate win,1,2,2012,Daily News,1448


The next cell collects the paths of all article text files to be stored.

In [4]:
# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes
# the processing order reproducible across machines and runs.
files = sorted(glob("data/2012/*.TXT"))
files[:5]

['data/2012/1.TXT',
 'data/2012/10.TXT',
 'data/2012/100.TXT',
 'data/2012/101.TXT',
 'data/2012/102.TXT']

Loops through the files, processes them and inserts them into the database.

In [19]:
# Store every 2012 article together with its codebook metadata.
for f in tqdm(files):
    # The article ID is the filename without its extension.
    i = int(basename(f).split(".")[0])
    # Skip articles that are already stored so the cell can be re-run safely,
    # mirroring the existence check used by the later import cells.
    if collection.find_one({"_id": i}, {"_id": 1}) is not None:
        continue
    d = df.loc[i].to_dict()  # matching codebook row; raises KeyError if the ID is missing
    d['text'] = cleanText(f)  # adds the cleaned text to the dict
    # convertTypes turns numpy scalars into plain Python values before storing.
    d = {key: convertTypes(value) for key, value in d.items()}
    collection.insert(d)  # inserts into database



# 2nd Batch: Articles about other topics

In [11]:
# Switch the target collection: from here on documents go into `politics.articles`.
collection = client.politics.articles

In [9]:
# The second positional argument selects the sheet at index 1 of the workbook.
df = pd.read_excel("data/2ndBatch/DebateCodeBook5.2015.xlsx", 1)
df.columns = [
    "_id",
    "topic",
    "title",
    "date",
    "newspaper",
    "word_count",
]
# Index rows by article ID while keeping the ID as a regular column.
df = df.set_index("_id", drop=False)
df.head()

Unnamed: 0_level_0,_id,topic,title,date,newspaper,word_count
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
151,151,Drought,Drought stable in ND but worsens again in SD,18,Bismurk Tribune,94
152,152,Drought,Rains slightly reduce drought severity in Arka...,4,The Associated Press State & Local Wire,86
153,153,Drought,Drought conditions worsen in the Dakotas over ...,4,The Associated Press State & Local Wire,98
154,154,Drought,Nearly one-third of SD now in exceptional drought,11,The Associated Press State & Local Wire,83
155,155,Drought,Va. updates drought advisories,10,The Associated Press State & Local Wire,90


In [10]:
# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes
# the processing order reproducible across machines and runs.
files = sorted(glob("data/2ndBatch/*.TXT"))
files[:5]

['data/2ndBatch/151.TXT',
 'data/2ndBatch/152.TXT',
 'data/2ndBatch/153.TXT',
 'data/2ndBatch/154.TXT',
 'data/2ndBatch/155.TXT']

In [15]:
# Store every second-batch article together with its codebook metadata.
for f in tqdm(files):
    # The article ID is the filename without its extension.
    i = int(basename(f).split(".")[0])
    # Skip articles that are already stored so the cell can be re-run safely,
    # mirroring the existence check used by the later import cells.
    if collection.find_one({"_id": i}, {"_id": 1}) is not None:
        continue
    d = df.loc[i].to_dict()  # matching codebook row; raises KeyError if the ID is missing
    d['text'] = cleanText(f)  # adds the cleaned text to the dict
    # convertTypes turns numpy scalars into plain Python values before storing.
    d = {key: convertTypes(value) for key, value in d.items()}
    collection.insert(d)  # inserts into database



# New Data

In [5]:
# Target collection for the cells below: `politics.articles`.
collection = client.politics.articles

In [7]:
# Read the comparison-article codebook, rename its columns and index the rows
# by article ID (the ID also stays available as a regular column).
df = pd.read_excel("data/comparison_articles.xlsx")
df.columns = [
    "_id",
    "topic",
    "title",
    "date",
    "newspaper",
    "word_count",
]
df = df.set_index("_id", drop=False)

In [8]:
# Import every codebook entry that is not in the collection yet.
for i in df["_id"]:
    # find_one with an _id-only projection is a single-round-trip existence
    # check; it replaces find(...).count(), which newer PyMongo releases deprecate.
    if collection.find_one({"_id": int(i)}, {"_id": 1}) is not None:
        continue
    f = "data/articles/{}.TXT".format(i)
    if not isfile(f):
        print(f)  # report codebook entries that have no article file
        continue
    d = df.loc[i].to_dict()  # gets the matching row from the codebook
    d['text'] = cleanText(f)  # adds the cleaned text to the dict
    # convertTypes turns numpy scalars into plain Python values before storing.
    d = {key: convertTypes(value) for key, value in d.items()}
    collection.insert(d)  # inserts into database

In [31]:
# Debate articles of this batch are stored in `politics.debates2`.
collection = client.politics.debates2

# Read the debate codebook, rename its columns and index the rows by article
# ID (the ID also stays available as a regular column).
df = pd.read_excel("data/debates.xlsx")
df.columns = [
    "_id",
    "title",
    "debate",
    "days_after",
    "year",
    "newspaper",
    "word_count",
]
df = df.set_index("_id", drop=False)

In [32]:
# Display the first rows of the debate codebook as a quick check of the renamed columns.
df.head()

Unnamed: 0_level_0,_id,title,debate,days_after,year,newspaper,word_count
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
241,241,EIGHT Questions,1,0,2012,The Washington Post,2215
242,242,"After Debate, a Torrent of Criticism for Obama",1,2,2012,The New York Times,1476
243,243,REBOUNDING ROMNEY Mitt recharged after debate ...,1,2,2012,Daily News,1448
244,244,Obama looks past debate,1,2,2012,The Washington Post,1490
245,245,America's Future No debating it: Mitt knockout...,1,1,2012,The New York Post,952


In [33]:
# Import every codebook entry that is not in the collection yet.
for i in df["_id"]:
    # find_one with an _id-only projection is a single-round-trip existence
    # check; it replaces find(...).count(), which newer PyMongo releases deprecate.
    if collection.find_one({"_id": int(i)}, {"_id": 1}) is not None:
        continue
    f = "data/articles/{}.TXT".format(i)
    if not isfile(f):
        # Previously a missing file was skipped silently; report it instead,
        # as the comparison-article import loop does.
        print(f)
        continue
    d = df.loc[i].to_dict()  # gets the matching row from the codebook
    d['text'] = cleanText(f)  # adds the cleaned text to the dict
    # convertTypes turns numpy scalars into plain Python values before storing.
    d = {key: convertTypes(value) for key, value in d.items()}
    collection.insert(d)  # inserts into database

# Stem words

In this section, all articles are tokenized and stemmed. The stemmed words are stored in the database.

In [9]:
import string
import re

from nltk.stem import SnowballStemmer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords

In [10]:
# English Snowball stemmer shared by all stemming calls.
sb = SnowballStemmer("english")
# Stored as a set: the stop-word check runs once per token, and membership
# tests on a set are constant time instead of a linear scan through a list.
stops = set(stopwords.words("english"))

In [11]:
def stemWords(text, t = True):
    """
    Normalizes and stems the words of a text.

    @param text: a raw string when t is True, otherwise an iterable of tokens.
    @param t: whether text still has to be split into tokens with word_tokenize.
    returns a list of stemmed tokens. Tokens are lower-cased and stripped of
    punctuation first; tokens of two or fewer characters and stop words are dropped.
    """
    tokens = word_tokenize(text) if t else text
    # Lazily normalize each token, so filtering and stemming happen token by token.
    normalized = (remove_punctuation(token.lower()) for token in tokens)
    return [
        sb.stem(token)
        for token in normalized
        if len(token) > 2 and token not in stops
    ]

In [12]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(s):
    """Returns s with every character from string.punctuation removed.

    see http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    """
    return s.translate(_PUNCTUATION_TABLE)

In [13]:
# Select the documents that have no "stemmed" field yet.
cursor = collection.find({"stemmed": {"$exists": False}})
pending = cursor.count()  # number of matches, used as the progress-bar total

for article in tqdm(cursor, total = pending):
    # Stem the article text and store the token list on the same document.
    selector = {"_id": article["_id"]}
    change = {"$set": {"stemmed": stemWords(article["text"])}}
    collection.find_and_modify(selector, change)



# Storing Brown corpus data in database

In [62]:
from nltk.corpus import brown

In [63]:
# Brown corpus documents are stored in their own collection, `politics.brown`.
collection = client.politics.brown

In [64]:
# File IDs of the Brown corpus files in the "reviews" and "news" categories.
ids = brown.fileids(categories = ["reviews","news"])

In [74]:
# Stem each selected Brown file and store it under its file ID.
for fileid in tqdm(ids):
    collection.insert({
        "_id": fileid,
        # brown.words already yields tokens, so stemWords must not tokenize again (t=False).
        "stemmed": stemWords(brown.words(fileids=fileid), t=False),
        "category": brown.categories(fileid),
    })

