# Selecting data for model training and testing
- Since GLG is interested in short text topic modeling (abstracts from client request), only part of news text is needed
- Test data is taken from well-defined sections to see if all news from those sect6ions go at least to first level cluster

# Python libraries

In [1]:
# data processing libraries
import pandas as pd

# display wider columns in pandas data frames where necessary
pd.set_option('max_colwidth',150)

import spacy
nlp = spacy.load("en_core_web_sm")

# supporting libraries
import re
import pickle



In [2]:
# file location of the data
input_folder = './data/'
output_folder = './transition_files/'

file_name = 'all-the-news-2-1.csv'

# Clean and selecting first few paragraphs (10 sentences) for selected publications

In [3]:
# load data
df_data = pd.read_csv(input_folder + file_name, #file location
                      encoding = "ISO-8859-1", #deal with texts in different formats
                     )

# display first row of the data frame
print(df_data.shape)
df_data.head(1).T

  interactivity=interactivity, compiler=compiler, result=result)


(2688879, 12)


Unnamed: 0,0
Unnamed: 0,0
Unnamed: 0.1,0
date,2016-12-09 18:31:00
year,2016
month,12
day,9
author,Lee Drutman
title,We should take concerns about the health of liberal democracy seriously
article,"This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to de..."
url,https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs


In [4]:
# select ONLY data with specified section and publication and non-duplicated texts of article
df_data['publication'] = df_data['publication'].fillna("")
df_data = df_data[df_data['publication'].apply(len)>0]

df_data['section'] = df_data['section'].fillna("")
df_data = df_data[df_data['section'].apply(len)>0]

df_data['article'] = df_data['article'].fillna("")
df_data = df_data[df_data['article'].apply(len)>0]
df_data = df_data.drop_duplicates('article')

df_data.shape

(1660535, 12)

In [5]:
#Publications in the data
print('Number of unique values:')
df = df_data.groupby('publication')[['section', 'article']].nunique()
df

Number of unique values:


Unnamed: 0_level_0,section,article
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
CNBC,634,191185
CNN,63,124659
Economist,46,23050
Fox News,670,20130
Gizmodo,78,18214
New Yorker,1,4644
People,35,133766
Reuters,224,734147
The New York Times,3774,240107
The Verge,148,50201


In [6]:
#check each publication
set(df_data[df_data['publication'] == "Wired"]['section'])

{'article',
 'artificial-intelligence',
 'backchannel',
 'business',
 'culture',
 'deals',
 'design',
 'environment',
 'gadget-lab-podcast',
 'gadgetlab',
 'gear',
 'ideas',
 'magazine',
 'music',
 'national-affairs',
 'opinion',
 'outdoor',
 'phones',
 'photo',
 'physics-math',
 'privacy',
 'reviews',
 'science',
 'security',
 'social-media',
 'transportation',
 'trends',
 'uncategorized'}

In [7]:
# Select only publications with more than 10 sections and less than 100
df=df[(df['section'] > 10) & (df['section'] < 100)]
print(df.sum())
df

section       250
article    319746
dtype: int64


Unnamed: 0_level_0,section,article
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
CNN,63,124659
Economist,46,23050
Gizmodo,78,18214
People,35,133766
Wired,28,20057


In [8]:
selected_publications = list(df.index)
selected_publications

['CNN', 'Economist', 'Gizmodo', 'People', 'Wired']

In [9]:
df_data = df_data[df_data['publication'].isin(selected_publications)]
df_data.shape

(319746, 12)

In [10]:
#clean text
df_data['article'] = df_data['article'].str.replace(r"[^A-Za-z0-9//-/.,!?:; ]",'', regex=True)

#select texts that have at least 500 but no more than 10000 symbols
df_data['text_length'] = df_data['article'].fillna("").apply(len)
df_data = df_data[df_data['text_length'] >= 500]
df_data = df_data[df_data['text_length'] < 10000]

# cut text to have no more than 1500 symbols
df_data['article'] = df_data['article'].str[:1500]
df_data.shape

(306365, 13)

In [11]:
#checking number of articles per section
s = df_data['section'].value_counts()
s.describe(percentiles=[0.01,0.25,0.5,0.75,0.99])

count      229.000000
mean      1337.838428
std       4400.854165
min          1.000000
1%           1.000000
25%          4.000000
50%         50.000000
75%        819.000000
99%      18521.880000
max      50665.000000
Name: section, dtype: float64

In [12]:
df_data['section_articles_count'] = df_data.groupby('section')['url'].transform("count")
df_data.shape

(306365, 14)

In [13]:
#selecting articles from sections with at least 1000 articles but not more than 15000
df_data = df_data[(df_data['section_articles_count'] > 1000) &
                    (df_data['section_articles_count'] < 15000) 
                   ]
df_data.shape

(188479, 14)

In [14]:
#section names
df_data['section'].value_counts()

movies                    12741
style                     11487
crime                     10988
health                    10913
music                      9942
opinions                   9441
entertainment              9431
celebrity                  9148
parents                    6948
business                   6930
asia                       6374
human-interest             5746
royals                     5504
pets                       5104
food                       4400
sports                     3945
home                       3419
culture                    3247
science                    2911
world                      2867
bodies                     2758
middleeast                 2696
tech                       2631
News                       2409
babies                     2203
africa                     2139
gear                       2108
Sploid                     2041
security                   1840
investing                  1822
country                    1818
transpor

In [15]:
#delete too wide or too nerrow topics
delete_sections = ["bodies", "News", "Sploid", "media", "investing", "human-interest",
                   "country", "britain", "europe", "united-states", "americas",
                   "middle-east-and-africa", "africa", "asia", "world", "home", "middleeast",
                   "awards", "Apple", "graphic-detail", "books-and-arts", "opinions",
                   "leaders", "app-news-section", "Privacy and Security", "entertainment"
                  ]
df_data = df_data[df_data['section'].isin(delete_sections) == False]
df_data.shape

(119138, 14)

In [16]:
#selected section names
df_data['section'].value_counts()

movies                   12741
style                    11487
crime                    10988
health                   10913
music                     9942
celebrity                 9148
parents                   6948
business                  6930
royals                    5504
pets                      5104
food                      4400
sports                    3945
culture                   3247
science                   2911
tech                      2631
babies                    2203
gear                      2108
security                  1840
transportation            1666
finance-and-economics     1648
Space                     1641
Health                    1193
Name: section, dtype: int64

In [17]:
#Publications in the selected data
print('Number of unique values:')
df = df_data.groupby('publication')[['section', 'article']].nunique()
df

Number of unique values:


Unnamed: 0_level_0,section,article
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
CNN,6,13762
Economist,2,3149
Gizmodo,2,2834
People,13,85151
Wired,7,14230


In [18]:
df_data = df_data.reset_index(drop=False)

In [19]:
# split the data on sub-samples of 50,000 records each 
# and save for next steps
k = 0
batch_size = 50000

for k in range(3):
    df_part = df_data.loc[k * batch_size: (k+1) * batch_size,:]
    print(k, len(df_part), df_part.index[0])
    with open(output_folder + 'data_part_'+str(k)+'.pickle', 'wb') as f:
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(df_part, f, pickle.HIGHEST_PROTOCOL)

0 50001 0
1 50001 50000
2 19138 100000


# Selecting first few paragraphs (10 sentences)

In [20]:
for k in range(3):
    file_name = 'data_part_'+str(k)+'.pickle'

    # load data
    with open(output_folder + file_name, 'rb') as f:
        # The protocol version used is detected automatically, so we do not
        # have to specify it.
        df_data = pickle.load(f)

    #get spaCy doc
    print(k)
    %time df_data['spacy_doc'] = df_data['article'].apply(lambda x: nlp(x))
    print("="*50)

    #delete text of article
    del df_data['article']
    
    #save batch as pickle
    with open(output_folder + 'spacy_doc_' +str(k)+ '.pickle', 'wb') as f:
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(df_data, f, pickle.HIGHEST_PROTOCOL)

0
CPU times: user 19min 52s, sys: 2.83 s, total: 19min 55s
Wall time: 19min 55s
1
CPU times: user 19min 41s, sys: 1.75 s, total: 19min 43s
Wall time: 19min 43s
2
CPU times: user 6min 47s, sys: 456 ms, total: 6min 47s
Wall time: 6min 47s


# Saving Test and Train data

In [3]:
#clean noun phrases from stop-words
def clean_NPs(np):
    tmp_no_stop_words = [w for w in np if w.is_stop==False]

    #make only last word as lemma
    if len(tmp_no_stop_words)>0:
        tmp_lemmas = [w.text for w in tmp_no_stop_words[:-1]] + [tmp_no_stop_words[-1].lemma_]
    else:
        tmp_lemmas = []

    tmp_atleast_one_alpha = [w for w in tmp_lemmas if len(re.sub(r"\d|\W", "", w)) > 0]
    tmp_result = [w for w in tmp_atleast_one_alpha if len(w)>0]
    
    return " ".join(tmp_result)

In [4]:
list_dfs = []

for k in range(3):
    file_name = 'spacy_doc_' +str(k)+ '.pickle'
    print(file_name)

    # load data
    with open(output_folder + file_name, 'rb') as f:
        # The protocol version used is detected automatically, so we do not
        # have to specify it.
        df_data = pickle.load(f)
    
    #select first 10 sentenses
    df_data['first_10_sents'] = df_data['spacy_doc'].apply(lambda doc: list(doc.sents)[:10])
    
    #extract noun phrases from first 10 sentenses for cluster naming
    df_data['list_of_nouns'] = df_data['first_10_sents'].apply(lambda sents: [word 
                                                                     for s in sents 
                                                                     for word in s if (word.is_stop==False) & \
                                                                                 (len(word.text)>2) & \
                                                                                 (word.is_alpha) & \
                                                                                 (word.pos_ == 'NOUN')])
    #extract lemmas from first 10 sentenses for cluster naming
    df_data['list_of_lemmas'] = df_data['first_10_sents'].apply(lambda sents: [word 
                                                         for s in sents 
                                                         for word in s if (word.is_stop==False) & \
                                                                     (len(word.text)>2) & \
                                                                     (word.is_alpha) & \
                                                                     (word.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV'])])
    
    
    #extract lemmas from first 10 sentenses for cluster naming
    df_data['list_of_verb_lemmas'] = df_data['first_10_sents'].apply(lambda sents: [word 
                                                         for s in sents 
                                                         for word in s if (word.is_stop==False) & \
                                                                     (len(word.text)>2) & \
                                                                     (word.is_alpha) & \
                                                                     (word.pos_ in ['VERB'])])
    
    #extract noun phrases from first 10 sentenses
    df_data['noun_phrases'] = df_data['first_10_sents'].apply(lambda sents: [np 
                                                                             for s in sents 
                                                                             for np in s.noun_chunks])
    
    #delete stop-words ("the", "a", "your" etc.) and clean NPs
    df_data['noun_phrases'] = df_data['noun_phrases'].apply(lambda NPs: [clean_NPs(np) for np in NPs])
    df_data['noun_phrases'] = df_data['noun_phrases'].apply(lambda NPs: [np for np in NPs if len(np)>1])
    
    df_data['list_of_first_10_sents'] = df_data['first_10_sents'].apply(lambda l: [s.text for s in l])
    df_data['first_10_sents'] = df_data['first_10_sents'].apply(lambda l: " ".join([s.text for s in l]))

    #delete 'spacy_doc' of article 
    #(it is used for LDA model but we need only text of first 10 sentenses for other models)
    del df_data['spacy_doc']
    
    list_dfs.append(df_data)

spacy_doc_0.pickle
spacy_doc_1.pickle
spacy_doc_2.pickle


In [5]:
df_data = pd.concat(list_dfs)
print(df_data.shape)
df_data.columns

(119140, 20)


Index(['index', 'Unnamed: 0', 'Unnamed: 0.1', 'date', 'year', 'month', 'day',
       'author', 'title', 'url', 'section', 'publication', 'text_length',
       'section_articles_count', 'first_10_sents', 'list_of_nouns',
       'list_of_lemmas', 'list_of_verb_lemmas', 'noun_phrases',
       'list_of_first_10_sents'],
      dtype='object')

In [6]:
df_data['list_of_first_10_sents'].head()

0    [Calling all foodies  get your royal rsums ready!, Queen Elizabethis looking for a kitchen porter, and the fulltime job comes with some dream perk...
1    [Celebloved Stanley the giraffe is seemingly safe as two wildfires continue to devastate southern California, but animal rights critics are still ...
2    [John Oliver has a special treat in store for the one of the last Blockbuster stores., Though the formerly popular video rental chain has been for...
3    [Blake Lively has one of the mostshared celebrity manes on Pinterest., Well, at least according to the inspiration boards Im looking at., And shes...
4    [Meghan King Edmonds knows how to put together a gorgeous space., Weeks after welcoming twin sons Hart King and Hayes Kingon June 5, the 33yearold...
Name: list_of_first_10_sents, dtype: object

In [7]:
#Save Test data (People articles)
df_test = df_data[df_data['publication'] == "People"]
print(df_test.shape)
df_test[["date", 'author', 
         'title', 'url', 
         'section', 'publication',
         'first_10_sents','list_of_first_10_sents', 
         'list_of_verb_lemmas', 'noun_phrases', 'list_of_nouns','list_of_lemmas'
        ]].to_csv(output_folder + "test.tsv", index=False, sep="\t")

(85158, 20)


In [8]:
#Save Train data (all but People articles)
df_train = df_data[df_data['publication'] != "People"]
print(df_train.shape)
df_train[["date", 'author', 
         'title', 'url', 
         'section', 'publication',
         'first_10_sents','list_of_first_10_sents', 
         'list_of_verb_lemmas', 'noun_phrases', 'list_of_nouns','list_of_lemmas'
         ]].to_csv(output_folder + "train.tsv", index=False, sep="\t")

(33982, 20)


In [9]:
#publications in Train dataset
print("Train data articles by publications:")
df_train.groupby('publication')['first_10_sents'].count()

Train data articles by publications:


publication
CNN          13763
Economist     3153
Gizmodo       2834
Wired        14232
Name: first_10_sents, dtype: int64

In [10]:
#selected section names
df_train['section'].value_counts()

health                   8237
business                 6930
culture                  3247
science                  2911
tech                     2527
gear                     2108
security                 1841
transportation           1666
finance-and-economics    1648
Space                    1641
Health                   1193
movies                     31
music                       1
style                       1
Name: section, dtype: int64

In [11]:
print("Done")

Done
