In [1]:
import xml.etree.cElementTree as et
from lxml import etree
import pandas as pd
import os
import re
import time
import datetime
import gc

In [2]:
# Multiprocessing Module
import multiprocessing as mp
from multiprocessing import Pool

In [3]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [4]:
# Check core count
mp.cpu_count()

8

## 1. Parse XML

In [6]:
# Import updated data
filePath='/home/ec2-user/SageMaker/data/RegNews-Jan1985Dec2021/'
files=[]
for file in os.listdir(filePath):
    files.append(file)
print(len(files))
#print(files[0:5])

for file in files:
    if file.endswith('.xml'):
        pass
    else:
        print(file)

1040337


In [5]:
# Clean archived datasets to free up some storage
# filePath_old='/home/ec2-user/SageMaker/data/corpus/'
# files=[]
# for file in os.listdir(filePath_old):
#     files.append(file)
print(len(files))
# for f in os.listdir(filePath_old):
#     os.remove(os.path.join(filePath_old, f))

853284


In [7]:
# Function to print one XML example
def print_xml(file):
    tree = etree.parse(file)
    xml = etree.tostring(tree, encoding="unicode", pretty_print=True)
    print(xml)

In [8]:
print_xml(filePath+files[100])

<RECORD>
   <GOID>426572138</GOID>
   
   <Obj>
      <SourceRollupType>Newspapers</SourceRollupType>
      <ObjectTypes>
         <other ObjectTypeOrigin="Publication">NEWSPAPER</other>
         <mstar>News</mstar>
      </ObjectTypes>
      <ObjectRollupType>Articles - All Types</ObjectRollupType>
      <TitleAtt>
         <Title>Town Is Split Over Ending 129-Year Era</Title>
      </TitleAtt>
      <NumericDate>1987-08-08</NumericDate>
      <StartDate>1987-08-08</StartDate>
      <EndDate>1987-08-08</EndDate>
      <AlphaDate>Aug 8, 1987</AlphaDate>
      <Language>       
         <RawLang>English</RawLang>
     </Language>
      <Language IsPrimary="true">       
         <ISO>         
            <ISOCode>ENG</ISOCode>
            <ISOExpansion ISOCode="ENG">English</ISOExpansion>
         </ISO>
     </Language>
      <Copyright>
         <CopyrightData>Copyright New York Times Company Aug 8, 1987</CopyrightData>
      </Copyright>
      <PrintLocation>       
         <StartP

In [8]:
# Function to remove html tags from a string
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

# Function to remove multiple spaces
def remove_spaces(text):
    text=re.sub(' +',' ',text).strip()
    return text

In [27]:
# Function to parse XML
def import_xml(filename):
    ID=filename.split('.xml')[0]
    file=filePath+filename
    
    xmlp = et.XMLParser(encoding="UTF-8")
    parsed_xml = et.parse(file,parser=xmlp)
    root = parsed_xml.getroot()
    
    try:
        for child in root.findall('Obj'):
            lang=child.find('Language').find('RawLang').text
        if lang=='English':
            for child in root.findall('Obj'):
                type=child.find('ObjectTypes').find('mstar').text
                title=child.find('TitleAtt').find('Title').text
                try:
                    startdate=child.find('StartDate').text
                    enddate=child.find('EndDate').text
                except:
                    startdate=child.find('NumericDate').text
                    enddate=child.find('NumericDate').text

            if root.find('TextInfo')!=None:
                for node in root.iter('Text'):
                    text=node.text
                    text=remove_spaces(remove_html_tags(text))
                    wordcount=node.get('WordCount')
            else:
                text=''
                wordcount=0

            for child in root.findall('DFS'):
                pubtitle=child.find('PubFrosting').find('Title').text
                sourcetype=child.find('PubFrosting').find('SourceType').text

            return ID,title,type,startdate,enddate,text,wordcount,pubtitle,sourcetype
        
        else:
            print(filename, ": non-English article")
    
    except:
        not_parsed.append(filename)
        print('Could not parse:',filename)

In [28]:
# Define a thread Pool to process multiple XML files simultaneously
# Default set to 3, but may change number of processes depending on instance
p = Pool(processes=8)

2513320605.xml : non-English article
2535925933.xml : non-English article
2562308056.xml : non-English article
2597553077.xml : non-English article
2615158963.xml : non-English article
2526267996.xml : non-English article
2610101974.xml : non-English article
2504008199.xml : non-English article
2494917725.xml : non-English article
2574833128.xml : non-English article
2494574343.xml : non-English article
2540573529.xml : non-English article
2569777007.xml : non-English article
2476160416.xml : non-English article
2580094400.xml : non-English article
2599112888.xml : non-English article
2518800245.xml : non-English article
2587137023.xml : non-English article
2605583882.xml : non-English article
2466047613.xml : non-English article


In [29]:
# Apply function with Pool to corpus, may limit number of articles by using split
start_time = time.time()

not_parsed=[]
processed_lists=p.map(import_xml, files)

print("--- %s seconds ---" % (time.time() - start_time))

--- 107.37254595756531 seconds ---


In [30]:
# Transform processed data into a dataframe
df = pd.DataFrame(processed_lists, columns=['ID','Title','Type','StartDate','EndDate','Text',
            'TextWordCount','PubTitle', 'SourceType'])
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040337 entries, 0 to 1040336
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   ID             1040317 non-null  object
 1   Title          1040317 non-null  object
 2   Type           1040317 non-null  object
 3   StartDate      1040317 non-null  object
 4   EndDate        1040317 non-null  object
 5   Text           1040317 non-null  object
 6   TextWordCount  1040317 non-null  object
 7   PubTitle       1040317 non-null  object
 8   SourceType     1040317 non-null  object
dtypes: object(9)
memory usage: 71.4+ MB
None


In [31]:
print(df.head())

           ID                                              Title  Type  \
0   433947392  Federal Officials Seek to Relax Rules for Dump...  News   
1   292714709  Connecticut Seacoast Village Says `Enough' to ...  News   
2  2227336674  In Battle to Confirm a New Justice, Both Sides...  News   
3  1828076074               Toyota, Suzuki agree to explore deal  News   
4   421725277   Los Angeles; Limits on Solicitors at LAX Stalled  News   

    StartDate     EndDate                                               Text  \
0  2008-10-19  2008-10-19  The Interior Department has advanced a proposa...   
1  1988-01-31  1988-01-31  This picturesque river town has adopted rules ...   
2  2005-07-03  2005-07-03  WASHINGTON, July 2 - The last time Ralph G. Ne...   
3  2016-10-12  2016-10-12  Japanese automakers Toyota and Suzuki said Wed...   
4  2002-05-22  2002-05-22  Citing a need for additional study, the city's...   

  TextWordCount                               PubTitle  \
0           313 

In [33]:
if len(not_parsed)>0:
    print(len(not_parsed))
    print(not_parsed)

In [34]:
df.to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/parsed_xml.pkl')

## 2. Clean Data

In [5]:
df=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/parsed_xml.pkl')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040337 entries, 0 to 1040336
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   ID             1040317 non-null  object
 1   Title          1040317 non-null  object
 2   Type           1040317 non-null  object
 3   StartDate      1040317 non-null  object
 4   EndDate        1040317 non-null  object
 5   Text           1040317 non-null  object
 6   TextWordCount  1040317 non-null  object
 7   PubTitle       1040317 non-null  object
 8   SourceType     1040317 non-null  object
dtypes: object(9)
memory usage: 71.4+ MB
None


In [6]:
# Check article type
print(df['SourceType'].value_counts())
print(df['Type'].value_counts())

Newspapers                     826064
Blogs, Podcasts, & Websites    214253
Name: SourceType, dtype: int64
News                      1035464
Feature                      4051
Undefined                     669
Commentary                     35
General Information            32
Article                        29
Statistics/Data Report         10
Obituary                        8
Review                          7
Correction/Retraction           5
Recipe                          3
Interview                       3
Editorial                       1
Name: Type, dtype: int64


In [7]:
# Include only Type==News
df=df[df['Type']=='News'].sort_values(['PubTitle','StartDate']).reset_index(drop=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035464 entries, 0 to 1035463
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   ID             1035464 non-null  object
 1   Title          1035464 non-null  object
 2   Type           1035464 non-null  object
 3   StartDate      1035464 non-null  object
 4   EndDate        1035464 non-null  object
 5   Text           1035464 non-null  object
 6   TextWordCount  1035464 non-null  object
 7   PubTitle       1035464 non-null  object
 8   SourceType     1035464 non-null  object
dtypes: object(9)
memory usage: 71.1+ MB
None


In [8]:
# Convert dates
df['StartDate']=df['StartDate'].astype('datetime64[ns]')
df['Year']=df['StartDate'].astype('datetime64[ns]').dt.year
df['Month']=df['StartDate'].astype('datetime64[ns]').dt.month

In [9]:
# Check start and end dates for each pub title
for title in df.sort_values('PubTitle')['PubTitle'].unique():
    print(title,min(df[df['PubTitle']==title].sort_values('StartDate')['StartDate'].dt.date),
         max(df[df['PubTitle']==title].sort_values('StartDate')['StartDate'].dt.date))

Boston Globe 1987-01-12 2021-12-31
Boston Globe (Online) 2002-11-17 2021-12-31
Boston Globe (pre-1997 Fulltext) 1985-01-01 1996-12-31
Chicago Tribune 1985-08-26 2021-12-31
Chicago Tribune (Online) 2017-02-27 2021-12-31
Chicago Tribune (pre-1997 Fulltext) 1985-01-01 1996-12-03
Los Angeles Times 1988-02-02 2021-12-31
Los Angeles Times (Online) 2017-02-26 2021-12-31
Los Angeles Times (pre-1997 Fulltext) 1985-01-01 1996-12-03
New York Times 1985-01-01 2021-12-31
New York Times (Online) 1996-01-01 2021-12-31
The Washington Post 1996-12-04 2021-12-31
The Washington Post (Online) 2011-05-09 2021-12-31
The Washington Post (pre-1997 Fulltext) 1987-01-01 1996-12-03
USA TODAY 1997-02-17 2021-12-30
USA TODAY (pre-1997 Fulltext) 1987-04-01 1997-02-14
USA Today (Online) 2016-05-10 2021-12-30
Wall Street Journal 1985-01-02 2021-12-31
Wall Street Journal (Online) 2010-01-08 2021-12-31


In [10]:
# Clean duplicated news articles due to overlapped databases (referring to ProQuest publication coverage)
df=df[~(((df['PubTitle']=='Boston Globe') & (df['StartDate']<datetime.datetime(1997,1,1)))
                  | ((df['PubTitle']=='Boston Globe (Online)') & (df['StartDate']<datetime.datetime(2015,9,8))))]
df=df[~(((df['PubTitle']=='Chicago Tribune') & (df['StartDate']<datetime.datetime(1996,12,4)))
                  | ((df['PubTitle']=='Chicago Tribune (Online)') & (df['StartDate']<datetime.datetime(2017,2,23))))]
df=df[~(((df['PubTitle']=='Los Angeles Times') & (df['StartDate']<datetime.datetime(1996,12,4)))
                  | ((df['PubTitle']=='Los Angeles Times (Online)') & (df['StartDate']<datetime.datetime(2017,2,21))))]
df=df[~((df['PubTitle']=='New York Times (Online)') & (df['StartDate']<datetime.datetime(1996,1,1)))]
df=df[~(((df['PubTitle']=='The Washington Post') & (df['StartDate']<datetime.datetime(1996,12,4)))
                  | ((df['PubTitle']=='The Washington Post (Online)') & (df['StartDate']<datetime.datetime(2016,5,21))))]
df=df[~(((df['PubTitle']=='USA TODAY') & (df['StartDate']<datetime.datetime(1997,2,17)))
                  | ((df['PubTitle']=='USA Today (Online)') & (df['StartDate']<datetime.datetime(2012,12,5))))]
df=df[~((df['PubTitle']=='Wall Street Journal (Online)') & (df['StartDate']<datetime.datetime(2010,1,8)))]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1013048 entries, 1527 to 1035463
Data columns (total 11 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   ID             1013048 non-null  object        
 1   Title          1013048 non-null  object        
 2   Type           1013048 non-null  object        
 3   StartDate      1013048 non-null  datetime64[ns]
 4   EndDate        1013048 non-null  object        
 5   Text           1013048 non-null  object        
 6   TextWordCount  1013048 non-null  object        
 7   PubTitle       1013048 non-null  object        
 8   SourceType     1013048 non-null  object        
 9   Year           1013048 non-null  int64         
 10  Month          1013048 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 92.7+ MB
None


In [11]:
# Consolidate newspaper names
df.loc[(df['PubTitle']=='Boston Globe (pre-1997 Fulltext)') | (df['PubTitle']=='Boston Globe') | 
       (df['PubTitle']=='Boston Globe (Online)'),'Newspaper']='Boston Globe'
df.loc[(df['PubTitle']=='Wall Street Journal') | (df['PubTitle']=='Wall Street Journal (Online)'),
    'Newspaper']='Wall Street Journal'
df.loc[(df['PubTitle']=='USA TODAY (pre-1997 Fulltext)') | (df['PubTitle']=='USA TODAY') | 
       (df['PubTitle']=='USA Today (Online)'),'Newspaper']='USA Today'
df.loc[(df['PubTitle']=='Chicago Tribune (pre-1997 Fulltext)') | (df['PubTitle']=='Chicago Tribune') | 
       (df['PubTitle']=='Chicago Tribune (Online)'),'Newspaper']='Chicago Tribune'
df.loc[(df['PubTitle']=='Los Angeles Times') | (df['PubTitle']=='Los Angeles Times (pre-1997 Fulltext)') | 
        (df['PubTitle']=='Los Angeles Times (Online)'),'Newspaper']='Los Angeles Times'
df.loc[(df['PubTitle']=='New York Times') | (df['PubTitle']=='New York Times (Online)'),'Newspaper']='New York Times'
df.loc[(df['PubTitle']=='The Washington Post') | (df['PubTitle']=='The Washington Post (pre-1997 Fulltext)') | 
       (df['PubTitle']=='The Washington Post (Online)'),'Newspaper']='The Washington Post'

In [12]:
df=df.sort_values(['Newspaper','StartDate','Title']).reset_index(drop=True)

In [13]:
# Article count by pub title
for title in df.sort_values('PubTitle')['PubTitle'].unique():
    print(title,min(df[df['PubTitle']==title].sort_values('StartDate')['StartDate'].dt.date),
         max(df[df['PubTitle']==title].sort_values('StartDate')['StartDate'].dt.date),
         len(df[df['PubTitle']==title]))

Boston Globe 1997-01-01 2021-12-31 48235
Boston Globe (Online) 2015-09-08 2021-12-31 8727
Boston Globe (pre-1997 Fulltext) 1985-01-01 1996-12-31 20396
Chicago Tribune 1996-12-04 2021-12-31 52513
Chicago Tribune (Online) 2017-02-27 2021-12-31 8679
Chicago Tribune (pre-1997 Fulltext) 1985-01-01 1996-12-03 38857
Los Angeles Times 1996-12-04 2021-12-31 63124
Los Angeles Times (Online) 2017-02-26 2021-12-31 8151
Los Angeles Times (pre-1997 Fulltext) 1985-01-01 1996-12-03 59422
New York Times 1985-01-01 2021-12-31 123287
New York Times (Online) 1996-01-01 2021-12-31 160486
The Washington Post 1996-12-04 2021-12-31 70900
The Washington Post (Online) 2016-05-21 2021-12-31 18567
The Washington Post (pre-1997 Fulltext) 1987-01-01 1996-12-03 31039
USA TODAY 1997-02-17 2021-12-30 19558
USA TODAY (pre-1997 Fulltext) 1987-04-01 1997-02-14 11304
USA Today (Online) 2016-05-10 2021-12-30 10089
Wall Street Journal 1985-01-02 2021-12-31 152114
Wall Street Journal (Online) 2010-01-08 2021-12-31 107600


In [14]:
# Article count by newspaper
for title in df.sort_values('Newspaper')['Newspaper'].unique():
    print(title,min(df[df['Newspaper']==title].sort_values('StartDate')['StartDate'].dt.date),
         max(df[df['Newspaper']==title].sort_values('StartDate')['StartDate'].dt.date),
         len(df[df['Newspaper']==title]))

Boston Globe 1985-01-01 2021-12-31 77358
Chicago Tribune 1985-01-01 2021-12-31 100049
Los Angeles Times 1985-01-01 2021-12-31 130697
New York Times 1985-01-01 2021-12-31 283773
The Washington Post 1987-01-01 2021-12-31 120506
USA Today 1987-04-01 2021-12-30 40951
Wall Street Journal 1985-01-02 2021-12-31 259714


## 3. Identify and Remove Duplicated Articles

In [47]:
# Full text for certain articles is not available due to copyright restrictions
print("Number of empty full texts:",df[df['Text']==""]['ID'].nunique())
print(df[df['Text']==""]['Newspaper'].value_counts())
# # Examples
# print(df[df['Text']==""]['ID'][-10:])
# print(df[df['Text']==""]['Title'][-10:])

Number of empty full texts: 0
Series([], Name: Newspaper, dtype: int64)


In [48]:
# Define a text preprocessor (lemmatizer)
def my_preprocessor(text):
    doc=nlp(text)
    lemmas=[token.lemma_ for token in doc if not token.is_punct | token.is_space]
    text_out=" ".join(lemmas)
    return text_out

In [49]:
# Convert ID and text to list
id_list=df['ID'].tolist()
text_list=df['Text'].tolist()
print(len(text_list), len(id_list))

1013048 1013048


In [52]:
# Examples
print(my_preprocessor(text_list[0]))

state regulator say yesterday -PRON- would crack down on illegal mail order liquor sale to minor after a 19-year old woman work for -PRON- be able to buy beer wine and Scotch on five occasion by phone without be require to prove -PRON- be of drinking age the regulator warn that the internet and toll free phone shopping represent giant loophole in state drinking law say the liquor be purchase from four different firm and deliver by United Parcel Service Airborne Express a Newton package store and a Canton wine merchant the investigator at the Alcohol Beverage Control Commission be shock at how easy -PRON- be to do this say Michael T. Duffy the state 's consumer affair director -PRON- think this practice be very widespread Massachusetts law which even out of state firm must comply with bar the sale of liquor to anyone under 21 and mandate that only licensed company may deliver alcohol to customer Duffy say -PRON- have no idea how much liquor be be sell by phone or over the internet altho

In [53]:
# Define a function to preprocess text by list index
def preprocess_text(i):
    id=id_list[i]
    text_out=my_preprocessor(text_list[i])
    return id,text_out

In [54]:
# Use multipleprocessing to preprocess all text
start_time = time.time()
with Pool(8) as p:
    text_lemmatized=p.map(preprocess_text, list(range(len(id_list))))
print("--- %s seconds ---" % (time.time() - start_time))

--- 4863.837230682373 seconds ---


In [55]:
# Transform processed data into a dataframe
df_lemmatized = pd.DataFrame(text_lemmatized, columns=['ID','TextLemmatized'])
print(df_lemmatized.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1013048 entries, 0 to 1013047
Data columns (total 2 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1013048 non-null  object
 1   TextLemmatized  1013048 non-null  object
dtypes: object(2)
memory usage: 15.5+ MB
None


In [56]:
# Merge
df=df.merge(df_lemmatized, on='ID', how='left')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1013048 entries, 0 to 1013047
Data columns (total 13 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   ID              1013048 non-null  object        
 1   Title           1013048 non-null  object        
 2   Type            1013048 non-null  object        
 3   StartDate       1013048 non-null  datetime64[ns]
 4   EndDate         1013048 non-null  object        
 5   Text            1013048 non-null  object        
 6   TextWordCount   1013048 non-null  object        
 7   PubTitle        1013048 non-null  object        
 8   SourceType      1013048 non-null  object        
 9   Year            1013048 non-null  float64       
 10  Month           1013048 non-null  float64       
 11  Newspaper       1013048 non-null  object        
 12  TextLemmatized  1013048 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(10)
memory usage: 108.2+ MB
Non

In [57]:
# Check duplicates
df['GroupNo']=df.groupby('TextLemmatized').cumcount()+1
print("Number of duplicated articles:",df[df['GroupNo']>1]['ID'].nunique())

Number of duplicated articles: 22786


In [58]:
# Keep the earliest article if duplicated
df_nodup=df.groupby('TextLemmatized').nth(0).reset_index()
print(df_nodup.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990262 entries, 0 to 990261
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   TextLemmatized  990262 non-null  object        
 1   ID              990262 non-null  object        
 2   Title           990262 non-null  object        
 3   Type            990262 non-null  object        
 4   StartDate       990262 non-null  datetime64[ns]
 5   EndDate         990262 non-null  object        
 6   Text            990262 non-null  object        
 7   TextWordCount   990262 non-null  object        
 8   PubTitle        990262 non-null  object        
 9   SourceType      990262 non-null  object        
 10  Year            990262 non-null  float64       
 11  Month           990262 non-null  float64       
 12  Newspaper       990262 non-null  object        
 13  GroupNo         990262 non-null  int64         
dtypes: datetime64[ns](1), float64(2), in

In [59]:
df_nodup['GroupNo']=df_nodup.groupby('TextLemmatized').cumcount()+1
print("Number of duplicated articles:", df_nodup[df_nodup['GroupNo']>1]['ID'].nunique())
print("Number of unavailable articles:",df_nodup[df_nodup['TextLemmatized']==""]['ID'].nunique())

Number of duplicated articles: 0
Number of unavailable articles: 0


In [None]:
# Remove dataframes to release memory
del df
del df_lemmatized

In [63]:
gc.collect()

18

In [66]:
# Check start and end dates for each newspaper
for title in df_nodup.sort_values('Newspaper')['Newspaper'].unique():
    print(title,min(df_nodup[df_nodup['Newspaper']==title].sort_values('StartDate')['StartDate'].dt.date),
         max(df_nodup[df_nodup['Newspaper']==title].sort_values('StartDate')['StartDate'].dt.date),
         len(df_nodup[df_nodup['Newspaper']==title]))

Boston Globe 1985-01-01 2021-12-31 75946
Chicago Tribune 1985-01-01 2021-12-31 99327
Los Angeles Times 1985-01-01 2021-12-31 129998
New York Times 1985-01-01 2021-12-31 273223
The Washington Post 1987-01-01 2021-12-31 117519
USA Today 1987-04-01 2021-12-30 40387
Wall Street Journal 1985-01-02 2021-12-31 253862


In [65]:
# Check start and end dates for each pub title
for title in df_nodup.sort_values('PubTitle')['PubTitle'].unique():
    print(title,min(df_nodup[df_nodup['PubTitle']==title].sort_values('StartDate')['StartDate'].dt.date),
         max(df_nodup[df_nodup['PubTitle']==title].sort_values('StartDate')['StartDate'].dt.date),
         len(df_nodup[df_nodup['PubTitle']==title]))

Boston Globe 1997-01-01 2021-12-31 47343
Boston Globe (Online) 2015-09-08 2021-12-31 8210
Boston Globe (pre-1997 Fulltext) 1985-01-01 1996-12-31 20393
Chicago Tribune 1996-12-04 2021-12-31 52237
Chicago Tribune (Online) 2017-02-27 2021-12-31 8410
Chicago Tribune (pre-1997 Fulltext) 1985-01-01 1996-12-03 38680
Los Angeles Times 1996-12-04 2021-12-31 62640
Los Angeles Times (Online) 2017-02-26 2021-12-31 8030
Los Angeles Times (pre-1997 Fulltext) 1985-01-01 1996-12-03 59328
New York Times 1985-01-01 2021-12-31 122489
New York Times (Online) 1996-01-01 2021-12-31 150734
The Washington Post 1996-12-04 2021-12-31 68673
The Washington Post (Online) 2016-05-21 2021-12-31 17842
The Washington Post (pre-1997 Fulltext) 1987-01-01 1996-12-03 31004
USA TODAY 1997-02-17 2021-12-30 19508
USA TODAY (pre-1997 Fulltext) 1987-04-01 1997-02-14 11297
USA Today (Online) 2016-05-10 2021-12-30 9582
Wall Street Journal 1985-01-02 2021-12-31 151874
Wall Street Journal (Online) 2010-01-08 2021-12-31 101988


In [67]:
df_nodup.to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/parsed_xml_clean.pkl')