#  Import and explore

In [1]:
# import libraries

import pandas as pd
import re
import numpy as np
from collections import Counter, OrderedDict
from operator import itemgetter

In [4]:
# import data

#raw = pd.read()

fields = ['Body', 'Descriptors', 'General Online Descriptors', 'Lead Paragraph',
          'News Desk', 'Online Section', 'Types Of Material','Word Count']

df1 = pd.read_csv('/data/nyt-all/nyt1996.csv',encoding='latin-1',usecols=fields)
df2 = pd.read_csv('/data/nyt-all/nyt2005.csv',encoding='latin-1',usecols=fields)
df3 = pd.read_csv('/data/nyt-all/nyt1986.csv',encoding='latin-1',usecols=fields)

raw = pd.concat([df1,df2,df3])

In [3]:
# delete the Lead Paragraph from the Body 
raw['Body'] = raw.apply(lambda row : str(row['Body']).replace(str(row['Lead Paragraph']), ''), axis=1)

In [4]:
len(raw)

195209

In [5]:
# selecting columns

filtered_df = raw[['Body', 'Descriptors', 'General Online Descriptors'#, 'News Desk', 'Online Section'
                  ,'Types Of Material','Word Count']]
filtered_df = filtered_df.drop_duplicates(subset=['Body'])

In [6]:
# lowercase columns
for col in filtered_df.columns:
    if col not in ['Body', 'Word Count']:
        filtered_df[col] = filtered_df[col].str.lower()

In [7]:
# filter Word Count
filtered_df = filtered_df[filtered_df['Word Count'] > 99]

#### do we also have to filter to limit the Word Count upperbound to 512 according to BERT, or not?

In [8]:
filtered_df.isna().sum()

Body                               0
Descriptors                    33680
General Online Descriptors     19420
Types Of Material             102010
Word Count                         0
dtype: int64

In [9]:
# fill NaN value with 'None'
filtered_df = filtered_df.fillna('None')

# filter Types Of Material
material_list = ['editorial','op-ed','letter', 'None']
filtered_df = filtered_df[filtered_df['Types Of Material'].isin(material_list)]

# Exploring Types of Material

In [10]:
print ('This df contains {} rows'.format(len(filtered_df)))

for m in material_list:
    temp = filtered_df[filtered_df['Types Of Material']==m].sort_values('Body')
    print('For {}, there are {} articles in total. {} articles lack Descriptors. {} articles lack General Online Descriptors.\
          {} articles does not have any descriptors at all.'.format(\
        m, len(temp), 
        len(temp[temp['Descriptors']=='None']), len(temp[temp['General Online Descriptors']=='None']),
        len(temp[(temp['Descriptors']=='None') & (temp['General Online Descriptors']=='None')]))
         )

# many editorials lack topic labels

This df contains 120631 rows
For editorial, there are 3168 articles in total. 581 articles lack Descriptors. 227 articles lack General Online Descriptors.          174 articles does not have any descriptors at all.
For op-ed, there are 2820 articles in total. 636 articles lack Descriptors. 232 articles lack General Online Descriptors.          222 articles does not have any descriptors at all.
For letter, there are 12633 articles in total. 4599 articles lack Descriptors. 4644 articles lack General Online Descriptors.          4593 articles does not have any descriptors at all.
For None, there are 102010 articles in total. 24353 articles lack Descriptors. 9297 articles lack General Online Descriptors.          8020 articles does not have any descriptors at all.


# Explore keywords in Descriptors/General Online Descriptors & Editorials

In [11]:
def check_keywords(df,top):
    des = []
    genondes = []
    for row in df.iterrows():
        s_des = row[1]['Descriptors']#.astype(str)
        s_genondes = row[1]['General Online Descriptors']#.astype(str)
        if type(s_des) == str:
            des.extend(s_des.split(sep='|'))
        if type(s_genondes) == str:
            genondes.extend(s_genondes.split(sep='|'))
    
    d_des = OrderedDict(sorted(Counter(des).items(), key = itemgetter(1), reverse = True))
    d_genondes = OrderedDict(sorted(Counter(genondes).items(), key = itemgetter(1), reverse = True))
    
    print ('No. of Descriptors:',len(d_des))
    print ('Top 20 Descriptors:\n',list(d_des)[:top])
    
    print ('\nNo. of General Online Descriptors:',len(d_genondes))
    print ('Top 20 General Online Descriptors:\n',list(d_genondes)[:top])
    
    return d_des, d_genondes

In [12]:
descript,gen_descript=check_keywords(filtered_df,30)

No. of Descriptors: 7405
Top 20 Descriptors:
 ['None', 'united states international relations', 'weddings and engagements', 'finances', 'biographical information', 'elections', 'politics and government', 'baseball', 'basketball', 'law and legislation', 'television', 'editorials', 'mergers, acquisitions and divestitures', 'football', 'united states armament and defense', 'ethics', 'education and schools', 'terrorism', 'presidential election of 1996', 'labor', 'travel and vacations', 'children and youth', 'housing', 'stocks and bonds', 'airlines and airplanes', 'suits and litigation', 'murders and attempted murders', 'motion pictures', 'budgets and budgeting', 'music']

No. of General Online Descriptors: 1572
Top 20 General Online Descriptors:
 ['politics and government', 'None', 'finances', 'united states politics and government', 'united states international relations', 'elections', 'presidential elections (us)', 'medicine and health', 'international relations', 'presidential election 

In [13]:
print (descript)



In [14]:
print (gen_descript)



# Filter by keywords

In [15]:
# filter for interested topics
key_dict = {
    'law' : ['law','right','court'],
    'pol' : ['politics','relation','international','regional'],
    'med' : ['medicine','health','disease'],
    'fin' : ['finances','business'],
    'mil' : ['defense','armament','military'],
    'edu' : ['education','school','teacher']
}

# select lists of keywords
l = [key_dict.get(k) for k in list(key_dict)]
flatten_l = [item for sublist in l for item in sublist]

# column used for filtering
filter_col = 'Descriptors'
###filter_col = 'General Online Descriptors'

s = '|'.join([item for item in flatten_l])
filtered_topics = filtered_df[filtered_df[filter_col].str.contains(s)==True]

#### have to think about thr articles that have keywords in more than one topic

In [16]:
# clean Body

for row in filtered_topics.iterrows():
    body = row[1]['Body'].lstrip()
    
    #print ('\n', body)
    
    #print (re.findall(r'\((.*?)\, [A-Za-z]{3}\. \d\{2}\)', body))
    #get = re.findall('\(+(.*?\.\s\d{2})\)', body)
    #get = re.findall('\(.*\.\s\d{2}\)', body)
    
    #get = re.findall('\(.{1,200}\.\s\d{2}\)', body)
    
    get = re.findall("\([^\(]*\.\s\d{2}\)", body)
    
    if get:        
        body = re.sub("\([^\(]*\.\s\d{2}\)",'', body)
    
    body_after = re.sub('To the Editor:','', body)    
    #filtered_topics.loc[row[0],'Body'] = body_after
    
    filtered_topics._set_value(row[0],'Body',body_after)

In [17]:
# split Descriptors into list
filtered_topics['Descriptors'] = filtered_topics['Descriptors'].str.split('|')
filtered_topics['General Online Descriptors'] = filtered_topics['General Online Descriptors'].str.split('|')

# add empty column to df
###filtered_topics.insert(1,"Topic", '')

# at some point have to change Types of Material to 'news' and 'editorials'
###filtered_topics.loc[filtered_topics["Types Of Material"] == 'letter', "Types Of Material"] = 'editorial'
###filtered_topics.loc[filtered_topics["Types Of Material"] == 'op-ed', "Types Of Material"] = 'editorial'
###filtered_topics.loc[filtered_topics["Types Of Material"] == 'None', "Types Of Material"] = 'news'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_topics['Descriptors'] = filtered_topics['Descriptors'].str.split('|')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_topics['General Online Descriptors'] = filtered_topics['General Online Descriptors'].str.split('|')


In [18]:
filtered_topics

Unnamed: 0,Body,Descriptors,General Online Descriptors,Types Of Material,Word Count
4,"In an Aug. 9, 1994, letter to judges and prose...","[crime and criminals, law and legislation, cap...","[capital punishment, crime and criminals, law ...",letter,169
21,"Randy is an official nobody now, but is one of...","[united states international relations, electi...","[united states international relations, united...",op-ed,760
32,"Now into the mix comes Martin Luther King Jr.,...","[copyrights, licensing agreements]",[copyrights],,811
46,Medical specialists serving on F.D.A. advisory...,"[meat, law and legislation, food contamination...","[standards and standardization, meat, consumer...",,1158
53,The only explanation the police gave to the ho...,"[orphans and orphanages, freedom and human rig...","[freedom and human rights, children and youth,...",,713
...,...,...,...,...,...
26087,Key legislators said tonight that they were re...,"[roads and traffic, finances, government bonds...","[government bonds, budgets and budgeting, road...",,832
26088,Two state legislators submitted bills today to...,"[politics and government, elections, referendu...","[referendums, local government, elections, pol...",,518
26101,More than 470 children were sent home when the...,"[education and schools, lead, paints and paint...","[lead, environment, art, air pollution, educat...",,137
26103,The city discharged 23 more provisional Emerge...,"[medicine and health, suspensions, dismissals ...","[suspensions, dismissals and resignations, wag...",,144


# Assign topic to articles

In [19]:
# define the main topic by the keyword in descriptors and add Topic column

def match_string(list_string,search_string):
    result = [re.search(i, search_string).group() for i in list_string if re.search(i, search_string) is not None]
    
    if len(result) > 0:
        return result[0]

def match_key(dictionary, search_string):
    match_list = [key for key,val in dictionary.items() if any(search_string in s for s in val)]
    
    if len(match_list) > 0:
        return match_list[0]

def match_topic(df, key_dict):
    
    # add empty column to df
    filtered_topics.insert(0,"Topic", "None")
    
    for row in df.iterrows():
        
                
#        print (row[0])
#        print (row[1]['Topic'])
#        print (row[1]['Descriptors'])
        
        topic = ''        
        
        #print(df.loc[row[0], 'Topic'])
        #print(type(df.loc[row[0], 'Topic']))
        
        if row[1]['Topic'] in list(key_dict):
            pass
        
        else:
            for des in row[1]['Descriptors']:
            
            #print (des)
            
                match_res = match_string(flatten_l,des)
            
            #print (match_res)
            
                if match_res:
                    topic = match_key(key_dict,match_res)
                
                    if topic:
                        #print (topic)
                        df._set_value(row[0],'Topic', topic)
    return df

In [20]:
return_df = match_topic(filtered_topics, key_dict)

In [21]:
len(return_df)==len(return_df.drop_duplicates(subset='Body'))

False

In [22]:
# once again drop duplicated bodies
return_df = return_df.drop_duplicates(subset='Body')

In [23]:
# check dataset size
print('current data size:',len(return_df))
print('topics size:\n{}'.format(return_df['Topic'].value_counts()))

current data size: 19295
topics size:
pol    6886
law    3553
fin    3100
mil    2132
edu    1881
med    1743
Name: Topic, dtype: int64


In [24]:
def count_result(df):
    
    # at some point have to change Types of Material to 'news' and 'editorials'
    df.loc[df["Types Of Material"] == 'letter', "Types Of Material"] = 'editorial'
    df.loc[df["Types Of Material"] == 'op-ed', "Types Of Material"] = 'editorial'
    df.loc[df["Types Of Material"] == 'None', "Types Of Material"] = 'news'
    #print ('No. of editorials',len(df[df['Types Of Material'].isin(['editorial','letter','op-ed'])]))
    
    count_df = df.groupby(["Topic", "Types Of Material"]).size().reset_index(name="Count")
    count_df = count_df.pivot('Topic','Types Of Material','Count').reset_index()
    count_df['ratio'] = count_df['editorial']/(count_df['editorial']+count_df['news'])
    
    for row in count_df.iterrows():
        print ('On the topic {}, we have {} news articles and {} editorials, or {} % editorial'\
               .format(row[1]['Topic'], row[1]['news'], row[1]['editorial'], round(row[1]['ratio'],4)*100))

In [25]:
count_result(return_df)

On the topic edu, we have 1377 news articles and 504 editorials, or 26.790000000000003 % editorial
On the topic fin, we have 2560 news articles and 540 editorials, or 17.419999999999998 % editorial
On the topic law, we have 2622 news articles and 931 editorials, or 26.200000000000003 % editorial
On the topic med, we have 1292 news articles and 451 editorials, or 25.869999999999997 % editorial
On the topic mil, we have 1700 news articles and 432 editorials, or 20.26 % editorial
On the topic pol, we have 5366 news articles and 1520 editorials, or 22.07 % editorial


  count_df = count_df.pivot('Topic','Types Of Material','Count').reset_index()


# Recheck the Body

In [26]:
# recheck the data in df

for i in return_df.sort_values(by=['Body']).iterrows():
    print(i[1]['Body'][:200],'\n')

 ''A Cheap Dollar Won't Cure the Deficit'' by Michael Hudson  was an important and intelligent article. The fallacy that the deliberate debasement of the dollar would stimulate exports and inhibit imp 

 ''A Treaty is Not a Christmas Tree''  wisely calls upon the Senate to abjure using the intermediate nuclear force treaty ''to settle old scores or push new agendas.'' In that spirit, I would like to  

 ''Circumcision Under Criticism as Unnecessary to Newborn'' (front page, Feb. 1) is both symbolically and substantively distorted, starting with the headline. Circumcision is back in the news in 1988  

 ''Congressional Pork, White House Pork'' (editorial, Feb. 6) left me somewhat confused. The implication was clearly that it would be better to continue a Congressional expenditure system in which no  

 ''Defer the Contra Decision''  says that Congress cannot renew aid to the contras ''that is explicitly labeled military without flagrantly violating the letter and spirit of the peace pro

''It would be so foolish to pass up something like this,'' Manfred Krug, a folksy German television star, advises in ads blanketing the airwaves. On billboards and in glossy magazine ads, young execut 

''It's O.K. if we evolved from primates and other creatures,'' said Toni M. Kovach, a 38-year-old social worker at St. Patrick's Senior Center. ''There's too much evidence to support it.'' But at the  

''It's a dispute between the trustees,'' Mr. Caputi said. ''Usually the sale is routine.'' The property, including several buildings, represents the remaining assets of the college, founded by Quakers 

''It's a distortion of facts, it's a complete distortion of the historic picture,'' said a senior official at the Russian Embassy in Washington who asked not be identified because he did not have auth 

''It's a fait accompli, it's over: Dean's going to be it,'' said Gerald McEntee, head of the American Federation of State, County and Municipal Employees, who runs the umbrella political o

''What we are seeing is the collapse of the Iraqi opposition,'' said Kanan Makiya, a leading Iraqi dissident living in the United States. ''They were unable to transcend their petty factional alliance 

''What we enable with the passage of this bill is the creation of an independent regulator with all the tools necessary to protect the taxpayer,'' said Representative Richard H. Baker, a Louisiana Rep 

''What we have now is just not working,'' said Mr. Rockefeller, a former Governor. ''We've got to make a change, a radical change. There are parts of the bill that I'm not happy about. The block grant 

''What we're seeing today is exactly the sort of incident that will start the next war,'' said a senior United Nations official who spoke on condition of anonymity. The peacekeepers rushed to the area 

''What's happening now is not a coincidence, or like some kind of flower that has blossomed overnight,'' argues VÃÂ­ctor Abramovich of the Center for Legal and Social Studies here, one of

A week later, your front-page article on regionalization ("Can Our Schools Be Merged?," June 2) pointed out that the Whitman education plan does little to encourage the formation of more efficient con 

A week later, your front-page article on regionalization (''Can Our Schools Be Merged?,'' June 2) pointed out that the Whitman education plan does little to encourage the formation of more efficient c 

A welcome court ruling that strikes down a dissonant provision of New York City's cabaret law could encourage a jazz revival in the city. The cabaret law requires restaurants and bars that feature mus 

A white teen-ager convicted of manslaughter and assault for attacking three black men in Howard Beach, Queens, was sentenced yesterday to 6 to 18 years in prison by a judge who said the teen-ager had  

A white teen-ager was acquitted yesterday of assault charges stemming from what the police said was a bias-related attack by a group of white men on two Hispanic teen-agers in Queens. He w

Among the most controversial changes was a redefinition of science itself, so that it would not be explicitly limited to natural explanations. The vote was a watershed victory for the emerging movemen 

Among the most divisive issues are the rights of women, the role of Islam and the scope and reach of Kurdish self-rule. Sticking to the Aug. 15 deadline is favored by the Americans, but it runs the ri 

Among the new initiatives are a vocational program to help high school dropouts and failing students get equivalency diplomas and learn new skills, and a commission to promote diversity in the constru 

Among the officials who testified in closed sessions in recent days were Stephen A. Cambone, the under secretary of defense for intelligence, and Lt. Gen. William G. Boykin, his deputy, Congressional  

Among the recommendations in what is said to be a sweeping critique of the government's performance is the creation of a new interagency center on proliferation, to assess efforts by other

At American insistence -- and consistent with pledges President Clinton made to Congress when he sought support for the peacekeeping mission here -- the NATO forces are to leave at the end of the year 

At Christopher Columbus High School, where I am assistant principal, every student has a chance to participate in instrumental ensembles, choruses, dance and art studio classes and musical-theater pro 

At Columbine, the killers, Eric Harris and Dylan Klebold, belonged to the ''Trench Coat Mafia'' and loved all things Goth. They sometimes did a Nazi salute while bowling and planned their attack for H 

At E. O. Smith High School in Storrs, there are no study halls and juniors and seniors are allowed to leave campus for lunch, a privilege that attempts to address how the school's 940 students could h 

At Matsu island, five miles from the mainland, the Prime Minister, Lien Chan, told soldiers to "step up war readiness and handle a possible outbreak of military crisis with a nonprovocativ

But even as Republicans rejoiced about the quick passage of the measure, an interesting showdown appeared to be developing between Senator Arlen Specter, the Pennsylvania Republican who recently becam 

But even as sales were expanding for the industry, they were shrinking at G.M. and Ford. The decline was the sharpest at General Motors, the world's largest automaker. G.M.'s sales fell 7.7 percent fr 

But even as the ''sponsorship scandal'' has unfolded, one unseemly chapter after another, Prime Minister Paul Martin has held fast, in the face of overwhelming evidence to the contrary, to a cherished 

But even as the drilling ploy of Senator Ted Stevens of Alaska was rebuffed, President Bush was employing a yet more apocalyptic argument to press the Senate to renew the Patriot Act and override conc 

But even as this clash of views is being played out, El Alto is quietly benefiting from trade preferences provided by Washington, spawning hundreds of small businesses and thousands of job

By themselves, the governors cannot alter Medicaid or welfare policy now set by the Federal Government. But both the White House and Republican leaders in Congress have been looking to them for sugges 

By using Ward as a front, Mrs. Clinton and her law partner helped the infamous Madison S.& L. as it ''evaded regulations designed to protect the safety and soundness of the institution and violated th 

By using the latest information about genetic characteristics of the population as a whole, and much-improved genetic profiles of racial and ethnic groups, experts have greatly enhanced their ability  

By week's end, a worried Clinton Administration was again trying to get the parties back to the peace table. An emissary, former Senator George J. Mitchell, left to talk to the leaders of Britain and  

By working out clear understandings with Beijing on regional issues today, Washington could help avoid unnecessary and highly destructive future conflicts. Here's what Washington should be

Dr. Jack Kevorkian has continued to make plain throughout 27 physician-assisted deaths that the relief of suffering has been his motive. When you define motive (relieve suffering) in the context of th 

Dr. Jafari's party, Dawa, and Mr. Mahdi's, the Supreme Council for the Islamic Revolution in Iraq, known as Sciri, are the two largest groups in the Shiite alliance, which captured a slim majority of  

Dr. Jeffrey B. Smith, a senior resident in dermatology at the University of South Florida in Tampa, recalled this poignant diagnosis in a review of the effects of smoking on the skin published last mo 

Dr. Jeffrey Schwartz, an expert on obsessive-compulsive disorder at the institute, said the actor's success in exposing Hughes's inner demons was an achievement worthy of a great writer. ''You think o 

Dr. John Fitzpatrick, director of the Cornell Lab of Ornithology, who led the effort to confirm the sightings, said at a news conference in Washington, ''This is really the most spectacula

For the hordes of music lovers who traveled from places as distant as Maine and Switzerland for this year's International Country Music Fan Fair, the star search was only part of the allure. The six-d 

For the last century and a half, the most fundamental driving force behind all Chinese political events has been the yearning to make China strong again, to rid it of the humiliation of foreign interv 

For the last few decades, the share of Medicare costs incurred by patients in their last year of life has stayed at about 28 percent, said Dr. Gail R. Wilensky, a senior fellow at Project HOPE who pre 

For the last few years Justice William A. Bablitch of the Wisconsin Supreme Court has traveled around the state lecturing law students and lawyers on the need to write clearly and crisply. To illustra 

For the last few years, student loan companies have taken great advantage of subsidies that Congress thought it had done away with more than a decade ago. Through creative use of a loophol

House Democrats and Republicans joined forces today and approved a $47.9 million aid package for the Nicaraguan rebels. The vote was 345 to 70. The overwhelming vote in favor of aid did not reflect an 

House Democrats unveiled a $25 million plan today to provide food, clothing and medical aid to the Nicaraguan rebels over the next four months. The plan, intended as a alternative to a request by Pres 

House Republicans gathered within hours of the indictment's becoming public, and chose Representative Roy Blunt of Missouri, the No.3 House Republican, to assume Mr. DeLay's duties temporarily. They a 

House Republicans heard a report on Saturday from the National Republican Congressional Committee on the potential politics of changing the tax system, saying that there was broad support for ''simpli 

House Republicans, who provided a copy of the report, have been demanding an overhaul of the program in recent years and narrowly passed a bill in 2003 to allow eight governors to take ove

In a city with a distinguished political tradition of grudges and vendettas, some people are asking what is really behind Senator Edward M. Kennedy's hot battle with the publisher Rupert Murdoch. Some 

In a civil trial that began Monday in federal court here, 13 plaintiffs are seeking up to $250,000 each in damages, charging that agents from the Immigration and Naturalization Service unnecessarily s 

In a combative opening statement, Evan Mecham's lawyers today began defending the Governor in his impeachment trial here by charging that he was the victim of political persecution. The defense team a 

In a compromise with Council leaders 10 days ago, the administration agreed that 25 percent of the proposed 13,600 new apartments would be for low- and moderate-income New Yorkers. The Council also ag 

In a concession to the rebels, President Yeltsin ordered that the troops be removed from Chechnya and stationed elsewhere in the North Caucasus, just across the border. Before Mr. Yeltsin'

In the meantime, the governments passed immediate measures that offer some benefits to passengers. As of the first of this month, for example, the number of weekly frequencies between the two countrie 

In the memorandum, apparently intended for Thomas F. McLarty, who was the White House chief of staff, Mr. Watkins wrote that "we both know that there would be hell to pay" if "we failed to take swift  

In the middle was thrust the moderator, former Mayor Edward I. Koch (who confided proudly earlier in the day that he had been told that he had been picked after the two sides had considered 350 other  

In the minds of current party leaders, Mr. Wang, one of the last surviving leaders of the Cultural Revolution, personifies some of the most violent excesses of the onset of China's most tumultuous pol 

In the moments before the final countdown, the launching director, Michael D. Leinbach, told the crew, ''Good luck, godspeed -- and have a little fun up there.'' Minutes later, the shuttle

It was a strange choice, politicians here say. Until a year ago, Mr. Farah was living an obscure and mundane life in a Los Angeles suburb, going to school part time and working as a clerk in the West  

It was a surprise, therefore, when the mayor of Mr. Osso's district visited him at home two weeks ago and began to ask probing questions about his family. ''He asked how many children I had and about  

It was a year ago today that the stories and photos of the shocking abuses at Abu Ghraib prison first came to the public's attention. It was a scandal that undermined the military's reputation and dim 

It was an inauspicious start to the National Guard's hurricane response, which fell so short that it has set off a national debate about whether in the future the Pentagon should take charge immediate 

It was an unbridled celebration, a victory lap, not just for President Bush and Mr. Cheney, but also for the thousands upon thousands of Republicans from around the country who had descend

Let the students hear recordings of Paul Robeson's renditions of Shakespeare's ''Othello,'' and be told that at Rutgers University, where he was the sole black student when he entered, he won first pr 

Let us not forget the sorrow that must be concealed behind the smile of Ms. Menchu as she extends her hand to military aides from the same army that killed three family members. According to her book, 

Let's be direct: PBS is hardly the Mount Olympus it once was. With the cable explosion, public television is no longer the only source of what was once labeled ''educational television,'' nor of more  

Let's contrast the raptures of the after-dinner smoker to the agonies of the smoker dying of lung cancer or emphysema. My guess is that most people stricken by cigarette-induced disease would, in retr 

Let's discuss the decree that will change your lives," said Aleksandr V. Ignatenko, director of the Center for Agricultural Reform in this Russian farming center about 100 miles south of M

Mr. Bush plans to hail the improvement at a cabinet meeting and to cite it as validation of his argument that tax cuts would stimulate the economy and ultimately help pay for themselves. Based on reve 

Mr. Bush severely restricted federal financing and opposed therapeutic cloning, the most promising research avenue, in a policy hailed by religious conservatives opposed to embryonic stem cell researc 

Mr. Bush signaled yesterday that we are in for more of the same when he sneered and said, ''One of the things that people want us to do here is to play a blame game.'' This is not a game. It is critic 

Mr. Bush spoke before Brazilian business leaders, diplomats and students at the luxury Blue Tree Park Hotel here in the capital, and did not mention Mr. ChÃÂ¡vez by name. But his barbs at Mr. ChÃÂ¡v 

Mr. Bush strongly hinted that the government was beginning a leak investigation into how the existence of the program was disclosed. It was first revealed in an article published on The Ne

Mr. Zyuganov's promises to safeguard private property and to continue a market economy in Russia if he is elected in June were vague, and he refused to say today whether he supported ratification of t 

Mr. de Klerk and his National Party colleauges in the Cabinet will retain their posts until the end of next month. "Our decision should be seen as an important step in the growing maturity and normali 

Mr. de Klerk said he has already met with representatives of other parties and with prominent South Africans outside party politics to talk about creating a new political movement. He said the Nationa 

Mr. de Kock's life sentences, the maximum under South African law, could give pause to many others who participated in atrocities but, emboldened by the acquittal this month of former Defense Minister 

Mr. du Pont, 57, is accused of murdering David Schultz, a 1984 Olympic gold medalist, by shooting him three times on Friday on the du Pont estate, which has a wrestling training center. Fr

Officials view the attack as a challenge to the authority of the government, which has faced resistance from loggers and land speculators in the region over new land-use and ownership regulations. Imm 

Oil contracts for August delivery rose $1.33, or 2.3 percent, to close at $59.42 a barrel on the New York Mercantile Exchange, representing an increase of 58 percent in the last year and more than dou 

Oil prices are high, there is little infighting, and a comfortable premium is likely to fill OPEC coffers for a while. Perhaps the producing nations can afford to be a bit generous to the world's cons 

Oil prices continued to rise yesterday amid conflicting reports on OPEC's plans to bolster the battered oil market. On the New York Mercantile Exchange, contracts for April delivery of West Texas Inte 

Oil prices plummeted on Sunday as damage to refineries and oil facilities along the coast also turned out to be less than had been feared. In New Orleans, the slow slog toward recovery fro

President Chaim Herzog challenged American Jewish critics today to suggest an alternative to Israel's tough policies against Palestinian protests. ''The question I must ask you is, what do you see as  

President Chun Doo Hwan has proposed talks with the opposition leader, Kim Dae Jung, ''for national reconciliation,'' a presidential aide said today. The President's chief secretary, Kim Yoon Whan, sa 

President Clinton and President Kim Young Sam of South Korea proposed last month that their two nations join with China and North Korea in peace talks aimed at securing a peace agreement to replace th 

President Clinton and Senator Dole seem to agree that the denial of normal trade privileges is not the way to influence Chinese behavior on a host of diplomatic, arms control, human rights and commerc 

President Clinton calls the files flap a "snafu" -- a military acronym politely rendered as "situation normal: all fouled up" -- an honest mistake. Republicans call it a deliberate dirty t

Senators may find their switchboards lighting up with more calls than usual today. The message: ''Don't bug me!'' Communications industry workers from around the country are holding a ''Call in to Con 

Senators of both parties quickly indicated that they intended to stick with their higher number when Congress returned after a week's break. ''The House, instead of agreeing with us and putting the do 

Senators will be sorely tempted to put their stamp on the treaty to eliminate medium- and shorter-range missiles. The itch seemed irresistible as the hearings got under way this week. But an array of  

Sending the work abroad was only an interim measure, and Blue Cross/Blue Shield has created 400 union positions in New Jersey to handle it, Ms. McMahon said. A spokesman said the insurer was trying to 

Senior Administration officials said today that the coup attempt against Gen. Manuel Antonio Noriega demonstrated deepening division within the Panamanian military, his principal base of s

Supporters of ousted opposition party leader, Megawati Sukarnoputri, had held a vigil for over a month at the single-story building in a busy residential district of Jakarta in an effort to prevent th 

Supporters of the legislation had contended that City Hall often focuses its economic development efforts on Manhattan, neglecting to nurture companies in the other boroughs. They said job-creation ag 

Supporters of the measure contend that it would put a cap on multimillion-dollar jury awards that they say inhibit manufacturers and add to consumer costs. But opponents say that limiting such awards, 

Supporters of the resolution said Turkey was repressing dissent and mistreating political prisoners. Others called on Turkey to drop its plan to create a ''security zone'' in northern Iraq and to chan 

Supporters of what some lawmakers refer to as the ''DeLay ultra-deep-water provision'' say it is crucial to developing new technology to prevent the nation from becoming as dependent on fo

The Constitutional Court and the Parliament would be virtually powerless to reverse such a measure, even if they wanted to, which is doubtful. Besides, how can anyone be sure that Mr. Yeltsin would ho 

The Constitutional question was raised last week by staff members for the Senate Foreign Relations Committee. Mr. Burns said the Clinton Administration concluded that the office would not formally com 

The Consumer Price Index rose three-tenths of 1 percent in July, higher than the two-tenths of 1 percent economists had anticipated, with inflation still being restrained. At the same time, the Govern 

The Consumer Product Safety Commission and a Federal district judge are expected to decide soon whether to give final approval to an agreement on the future sale and production of all-terrain vehicles 

The Consumers Power Company of Jackson, Mich., offered $250 million worth of 8 3/4 percent noncallable first mortgage bonds yesterday at 99.72, to yield 8.82 percent to 1993, through under

The Sandinista Government today canceled a negotiating session with contra representatives that was planned for Wednesday. The contras had earlier announced they would not attend. No effort had been m 

The Saudi government, itself under assault from Al Qaeda, is not in the business of directly financing terrorism, and since 9/11 it has responded to American pressure to control the flow of charitable 

The Saudi kingdom remains the 600-pound gorilla of the global oil market. Given its vast reserves, Saudi Arabia can keep pumping oil for the next 70 years. Oil, along with Islam's holy cities, Mecca a 

The Schools Chancellor is right on the mark. The reforms he proposes will come to naught unless Mayor Rudolph Giuliani and Gov. George Pataki work harder to find room in admittedly strapped budgets to 

The Scottish police identified the gunman as Thomas Hamilton, 43, a loner and avid gun enthusiast who lived in a housing project in nearby Stirling. Mr. Hamilton arrived at the school, in 

The bill would tighten restrictions that are already imposed by the Supreme Court on Federal court review of state criminal trials. Innocent people convicted in unfair trials would be imprisoned and i 

The bill would, in fact, do little to insure "public safety," but would severely threaten civil liberties. We would add, however, that Congress is not necessarily forced to choose between fighting ter 

The bill's sponsor, Assemblyman Joseph Crowley, has noted that the bill would not have passed without support from Jewish legislators. The famine was not a ''natural disaster.'' Beginning in 1494, the 

The bill's sponsor, Senator John P. Scott, a Republican of Lyndhurst, said he opposes legalization of same-sex marriages for moral reasons and because it would mean higher insurance costs for employer 

The bill's supporters, for example, assume that child molesters are more likely than other criminals to repeat the crime. Assemblyman Bill Hoge of Pasadena, the chief sponsor of the bill, 

The employees experienced a variety of emotional and physical symptoms, including insomnia and other sleep disorders, weight gain, headaches, hypertension, heart trouble and other trauma," the suit co 

The employers will not subsidize the coverage, but their participation created a pool of potential participants sufficiently large to justify lower insurance rates than individuals would have to pay o 

The encouraging news is that Bates, Bowdoin, Dickinson, Mount Holyoke and Sarah Lawrence have joined the growing list of colleges that have taken the bold step of not relying on SAT scores to create a 

The energy ministry, along with the national oil company, PetrÃÂ³leos de Venezuela, ''will begin a process of discussion with the 32 operating agreements so that we can reach the objective of the mig 

The engineer of Train 1254, John J. DeCurtis, had been on duty for 14 1/2 hours on an overnight shift, and investigators are looking into the possibility that fatigue contributed to the ac

The missioaries, Carl and Eleanor Johnson, who run a medical center near the capital, Bujumbura, were not harmed, the spokesman said. Tutsi soldiers have driven most Hutu from the capital in a methodi 

The mistakes that led us to this point came in waves. Republican blunders: Republicans often argue that Democrats are out of touch with mainstream Americans, but this time it was the Republicans who w 

The mixed messages in the two votes reflect public ambivalence towards gays and lesbians. Opinion polls indicate that about 85 percent of Americans support equal rights for gay people in job opportuni 

The modifications were required by the McCain-Feingold campaign finance law of 2002, which called for the commission to tie contribution limits to inflation every two years. ''I like to call it electi 

The money -- from $26 billion to $28 billion this year -- is hidden in false accounts within the public budget of the Pentagon, despite a clause in the Constitution demanding a full accoun

The question resurfaces in light of events like a West London bomb explosion (news item, April 18) attributed to the Irish Republican Army. To me and other Unionists, it is incredible that the Preside 

The question to which we don't know the answer is, what assessment the pilots made," he said. "Given the unsophisticated navigation, given the bad weather, why go in at all?" The pilot and co-pilot of 

The question, of course, is just how many people will want to relive that fight. While it lacks the partisanship and personality of Michael Moore's ''Fahrenheit 9/11,'' the as-yet-untitled picture rel 

The questions (Did we have to drop the bomb on Japan? Did it hasten Japan's surrender? Is fair to use the knowledge of 1996 to judge decisions made 50 years ago?) invite us to view history quite diffe 

The questions are coming at a sensitive time for Mr. Miller, just as he is trying to get traction in the campaign for the Democratic nomination. And though Mr. Miller's primary opponents h

The vise tightens on the $450 billion health-care industry, which runs the gamut from drug and medical-supply houses to hospital chains and nursing homes. Many costs are soaring, and skilled workers a 

The visit, a pointed reminder of American military capacity on the peninsula, came just hours after a speech in Tokyo in which Ms. Rice repeated that the United States had no intention of attacking No 

The vital United States interests are to keep Saddam boxed in, to prevent him from threatening the oilfields in Kuwait and Saudi Arabia, to prevent him from savaging the Kurds and to support all reaso 

The vote also likely represents the lone opportunity that House members will have to express their sentiments on Mr. McCain's legislation. The Senate approved the measure in October, 90 to 9, as part  

The vote by the Cotton Exchange to remain in the city is a tremendous vote of confidence in the city as the financial capital of the world," Mayor Rudolph W. Giuliani said. "We're ecstatic


Those whose survival is put in jeopardy by this election-year posturing will find moral justification for theft and seek to self-medicate anxiety and depression caused by their situation. Projected sa 

Those words were not ignored. When President Boris N. Yeltsin's officials said the Government would be making an example of a select number of tax-dodgers, Gazprom was not on the list. The company's i 

Though Governor Whitman's proposed cigarette tax sounds nice, it is now time that people other than the shareholders and C.E.O.'s of managed-care companies benefit from health care profits. In return  

Though House Republicans have not finished preparing the legislation, they said they hoped to push it through the House before lawmakers take a spring break at the end of this month. But advocates of  

Though I have no statistics, I suspect that the South loses more than it gains in the exchange of profits for Federal spending. This process cheats the South and the North alike. We are f

Vice President Bush's televised quarrel with Dan Rather neither helped nor hurt his image, a New York Times Poll has found, but the persistent controversy over his role in the Iran-contra affair appea 

Vice President Bush, on the defensive over his role in the Iran-contra affair, insisted in a debate here tonight that he had answered every question, save one, about his role in the Iran-contra affair 

Vice President Dick Cheney told troops at Fort Drum, N.Y., on Tuesday that in the event of a swift withdrawal of American troops, Iraq ''would return to the rule of tyrants, become a massive source of 

Vicki Saporta, executive director of the federation, said that as a result of the 1994 law, ''there has been significantly less violence outside clinics and a significant opening of access, with fewer 

Victims Assistance Services is working on its application to the State Division of Criminal Services for a grant to help finance the new program. The agency estimated that it would cost ro

White House officials said Mr. Clinton might hold a news conference later this week at which he would discuss his Cabinet shuffle and name some replacements. Others who have told Mr. Clinton that they 

White House officials said today that Terence McAuliffe, the national finance chairman for the Clinton-Gore campaign this year, and Ann Jordan, the wife of Vernon Jordan, the Clinton confidant, will b 

White House officials said tonight that they were uncertain whether they could comply with the request. The Whitewater independent counsel, Kenneth W. Starr, has been asked by Attorney General Janet R 

White House spokesmen have said the acquisition of the files was an innocent, though highly regrettable, mistake, and President Clinton has apologized to those whose privacy was violated. But the Whit 

White phosphorus, which dates to World War II, should have been banned generations ago. Packed into an artillery shell, it explodes over a battlefield in a white glare that can illuminate 

In [27]:
return_df.sort_values(by=['Body']).tail(10)

Unnamed: 0,Topic,Body,Descriptors,General Online Descriptors,Types Of Material,Word Count
23074,pol,Zarqawi and his followers do oppose democracy ...,"[terrorism, politics and government, islam]","[terrorism, islam, religion and churches, poli...",news,835
3552,pol,"Zhao Ziyang, the head of the Chinese Communist...",[international relations],"[international relations, politics and governm...",news,594
3005,med,"Zyprexa and Symbyax from Eli Lilly, Risperdal ...","[mental health and disorders, accidents and sa...","[labeling and labels, consumer protection, acc...",news,483
46633,med,"Zyprexa will remain on the market, the company...","[mental health and disorders, suits and litiga...","[liability for products, diabetes, suits and l...",news,680
4816,pol,"Zyuganov said he respected private property, w...","[elections, public opinion, politics and gover...","[public opinion, elections, politics and gover...",news,1009
30408,med,[Based on new figures released Friday by Indon...,"[earthquakes, foreign aid, tsunamis, water, fo...","[foreign aid, water, earthquakes, tidal waves,...",news,1230
52012,edu,"audience The collection of intended readers, l...","[education and schools, english language, teac...","[teachers and school employees, english langua...",news,390
41607,law,borne Division were beginning a run. A court-m...,"[murders and attempted murders, united states ...","[courts, courts-martial, armies, murders and a...",news,156
53188,fin,had charged that excess money from the private...,"[politics and government, inaugurations, gover...","[governors (us), finances, elections, politics...",news,461
56949,pol,"signed] Bob Dole ""Dear Dole: Nothing short of ...",[united states politics and government],"[united states politics and government, politi...",editorial,714


In [28]:
raw.loc[56949,'Body']

56949     signed] Bob Dole "Dear Dole: Nothing short of...
56949                                                     
Name: Body, dtype: object

In [None]:
# to do

# furthur cleaning # check if Alhindi take any other steps in preprocessing
    # take out (ART, MONTH DATE)
    # In his Editorial Notebook critical of lawyers (''Jury by Trial,'' Feb. 7)
    # A three-column advertisement for the Newport Centre Shopping Mall in New Jersey  announces in bold
    # In ''Cutting the Cost of Car Insurance'' (Personal Finance, Feb. 7)
    # In his Editorial Notebook critical of lawyers (''Jury by Trial,'' Feb. 7)
    # Lee Koenigsberg's letter on Feb. 28 responding to Carole Gould's article,
    # The Federal Government keeps its books differently from ordinary householders. If I buy a $100,000
    # A former executive of the Wedtech Corporation testified yesterday that
    # To the Sports Editor:
    
# recheck keywords in Descriptor and General Online Descriptors to further remove irrelevant articles

# Save TXT

In [34]:
def save_topic_csv(df,key_dict):
    
    df = df[['Topic','Types Of Material','Body']]
    
    for k in list(key_dict):
        save = df[df['Topic']==k]
        save[['Types Of Material','Body']].to_csv('/data/output_txt/nyt-'+str(k)+'.txt', sep='\t', header=False, index=False)
        print ('saved topic_txt:', k)

In [35]:
save_topic_csv(return_df,key_dict)

saved topic_txt: law
saved topic_txt: pol
saved topic_txt: med
saved topic_txt: fin
saved topic_txt: mil
saved topic_txt: edu


# Load and create BERT embeddings

https://huggingface.co/bert-base-cased

In [1]:
import os
import torch

In [2]:
from transformers import BertTokenizer, TFBertModel

In [3]:
# intiate tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertModel.from_pretrained("bert-base-cased")

2023-01-25 14:03:50.167009: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-25 14:03:53.795109: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7377 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:83:00.0, compute capability: 6.1
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are 

In [5]:
# test cell
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(**encoded_input) # shape=(1, 13, 768)

In [6]:
output

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 13, 768), dtype=float32, numpy=
array([[[ 0.60231996,  0.10919418,  0.14172179, ..., -0.41773665,
          0.6058591 ,  0.1764017 ],
        [ 0.5118536 , -0.47698206,  0.5507509 , ..., -0.28141266,
          0.37927842,  0.11556842],
        [ 0.09948093,  0.08669561,  0.08693355, ...,  0.47888178,
         -0.323643  ,  0.31219953],
        ...,
        [ 0.8080866 , -0.7380119 ,  0.20007204, ...,  0.7404601 ,
         -0.799814  ,  0.64488906],
        [ 0.33053666, -0.19578056,  0.3148015 , ..., -0.05245744,
          0.5358108 ,  0.1987034 ],
        [ 0.56553817, -0.21757378, -0.47202027, ..., -0.35540947,
          0.6141086 , -0.2475606 ]]], dtype=float32)>, pooler_output=<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-8.28349411e-01,  6.09058440e-01,  9.99977589e-01,
        -9.98308301e-01,  9.87391055e-01,  9.29295182e-01,
         9.96644378e-01, -9.89093125e-01, -9.91235673e-

In [35]:
### note on output
# https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

# len(output[0][0]) # 13
# len(output[1][0]) # 768
# type(output) # transformers.modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions
# type(output[0][0]) # tensorflow.python.framework.ops.EagerTensor
# type(output[1][0]) # tensorflow.python.framework.ops.EagerTensor

tensorflow.python.framework.ops.EagerTensor

In [2]:
os.listdir('output_txt')

['nyt-law.txt',
 'nyt-edu.txt',
 'nyt-med.txt',
 'nyt-fin.txt',
 'nyt-mil.txt',
 'nyt-pol.txt']

In [59]:
reading = pd.read_csv('/data/output_txt/nyt-law.txt', sep='\t',header=None)

In [67]:
def read_transform_bert(file):
    for row in reading.iterrows():
        type_m = row[1][0]
        text = row[1][1]
        
        # label
        if type_m == 'editorial':
            label = torch.tensor([1]).unsqueeze(0)
        elif type_m == 'news':
            label = torch.tensor([0]).unsqueeze(0)
        
        # text
        encoded_input = tokenizer(text, return_tensors='tf')
##        output = model(**encoded_input)
        
        # text with label
        output = model(**encided_input, labels=labels)
        
        
        


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# for Yerke

## add the special tokens [CLS/SEP] in each article
## please find out the way to get BERT embs
## how to keep the label of each article with article