In [9]:
# some essentials

import pandas
import os
from os import listdir
from os.path import isfile, join
import re
import lucem_illud
import nltk


# original article regex = r'(DOCUMENTS)(.*?)(The New York Times Company)'
# original date regex = r'^(DOCUMENTS)(.*?)(,)(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$'

In [2]:
# from my personal machine
targetDir = "C:/Users/Alex/contentAnalysis/vietnam/data/NYT"

In [3]:
# helper functions for getting article text and article date

def getText(x):
    reResults = re.findall(r'(DOCUMENTS)(.*?)(The New York Times Company)', x.replace('\n', ' '), flags = re.M)
    return[t[1] for t in reResults]

def getDate(x) :
    reResults = re.findall(r'(DOCUMENTS)(.*?)(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)', x.replace('\n', ' '))
    return[t[1] for t in reResults]

In [4]:
# Below, I make multiple runs at cleaning.
# First run: Get everything into a dataframe by date and article text

nytText = []
nytDate = []


for file in (file for file in os.scandir(targetDir) if file.is_file() and not file.name.startswith('.')):
    with open(file.path, encoding='utf-8') as f:
        workingText = f.read()
        workingText = str(workingText)
        goodText = getText(workingText)
        date = getDate(workingText)
        nytText += goodText
        nytDate += date
        
        
# for some reason I get one more date than I should. Should probably figure out why at some point.
# NB - 25825 might be the problem row - seems to still have full date attached
nytDF = pandas.DataFrame({'date' : nytDate[:-1], 'article' : nytText})

In [5]:
# some helper functions for my second run - these are made ad hoc, so... 
# probably will not be useful as-is for other text cleaning projects


# strip out NYT title upper case
def cleanerUpper(x):
    return re.sub(r'\s+NEW YORK TIMES\s+', '', x)

# strip out NYT title lower case
def cleanerLower(x):
    return re.sub(r'\s+The New York Times\s+', '', x)

#strip out useless text
def cleanerText(x):
    return re.sub(r'(January|February|March|April|May|June|July|August|September|October|November|December)(.*?)(LENGTH:)', '', x)

# strip out copyright and year
def cleanerCopy(x):
    return re.sub(r'(Copyright)\s+(\d{4})', '', x)

# simplify the date
def cleanerYear(x):
    return re.sub(r'(\D+)\s(\d{1,2})\W', '', x)

# behaves as advertised - destroys first comma found
def destroyComma(x):
    return re.sub(r',', '', x)

# strip word count
def cleanWordCount(x):
    return re.sub(r'(\d+).(words)', '', x)

# death to datelines
# not that clean. What does 25835 look like before this function?
def cleanDateline(x):
    return re.sub(r'(DATELINE:)\s(\w+)(,\s(\w+).\s(\d+))?', '', x)

In [6]:
# second run at cleaning - clean up article and date columns

# clean dates
nytDF['date'] = nytDF['date'].apply(cleanerUpper)
nytDF['date'] = nytDF['date'].apply(cleanerLower)
#nytDF['date'] = nytDF['date'].apply(cleanerYear)
nytDF['date'] = nytDF['date'].apply(destroyComma)

#clean articles
nytDF['article'] = nytDF['article'].apply(cleanerUpper)
nytDF['article'] = nytDF['article'].apply(cleanerLower)
nytDF['article'] = nytDF['article'].apply(cleanerText)
nytDF['article'] = nytDF['article'].apply(cleanerCopy)
nytDF['article'] = nytDF['article'].apply(cleanWordCount)
nytDF['article'] = nytDF['article'].apply(cleanDateline)

In [7]:
print(nytDF)

                                                 article                date
0         S Vietnamese House of Reprs, by narrow majo...     January 1 1970 
1         US rept (as of Dec 20, 1969) of 9 yrs of Vi...     January 1 1970 
2         Unusual methods used by US brigade commande...     January 1 1970 
3         Internatl Press Inst '69 press freedom surv...     January 1 1970 
4         Vice Pres Agnew on Far East tour to explain...     January 1 1970 
5         Sen H Williams makes unannounced visit to S...     January 1 1970 
6         Army announces it will try Sgt D Mitchell o...     January 1 1970 
7         C L Sulzberger holds Laos is most significa...     January 2 1970 
8         Lr, commenting on James Reston Dec 17 artic...     January 2 1970 
9         George C Wallace, who headed 3d party movem...     January 2 1970 
10        Vietcong issues communique describing mil s...     January 2 1970 
11        H Ross Perot expresses disappointment over ...     January 2 1970 

In [10]:
# that's a pretty big corpus - this one is more manageable. 
# filtered for every article 1980-1989 - about ~11k

nyt80s = nytDF[nytDF['date'].str.contains('198\d', na = False)]
nyt80sClean = nyt80s

# also tokenize and normalize
nyt80sClean['tokenizedArticle'] = nyt80sClean['article'].apply(lambda x: nltk.word_tokenize(x))
nyt80sClean['normalizedArticle'] = nyt80sClean['tokenizedArticle'].apply(lambda x: lucem_illud.normalizeTokens(x, stopwordLst = lucem_illud.stop_words_basic, stemmer = lucem_illud.stemmer_basic))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [11]:
print(nyt80sClean)

                                                 article                date  \
7369      '79 food production in Vietnam was only 13....     January 2 1980    
7370      Karleton Armstrong, convicted of '70 bombin...     January 2 1980    
7371      Militants holding US diplomatic personnel h...     January 5 1980    
7372      Militants holding US diplomatic personnel h...     January 5 1980    
7373      China apparently concludes that best strate...     January 5 1980    
7374      5 US chemical companies that produced milit...     January 7 1980    
7375      Phnom Penh, Cambodia, celebrates 1st annive...     January 8 1980    
7376      Australian Government agrees to form indepe...     January 8 1980    
7377      Warrant Officer Francis Anton, former Army ...     January 9 1980    
7378      Author Richard J Walton article on Soviet i...    January 10 1980    
7379      Former Vietnam war prisoner Luis Antonio Or...    January 10 1980    
7380      Vietnam accuses China of condu

In [12]:
# save dataframes as csv files 

#nytDF.to_csv('nytFull', index=False, header=True)

#nyt80s.to_csv('nyt80s.csv', index=False, header=True)

nyt80sClean.to_csv('nyt80sClean.csv', index=False, header=True)


