In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the news data from
# /content/drive/MyDrive/news-data.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Sanity check: print the runtime's current working directory.
!pwd

/content


In [5]:
# load data: concat all texts within one document
import os
import json
from tqdm import tqdm

def get_document_by_country(fp: str) -> str:
    """Concatenate the ``'text'`` field of every news item in a JSON file.

    Args:
        fp: Path to a JSON file whose top-level value is iterated item by
            item; each item must provide a string under the key ``'text'``.

    Returns:
        The texts in file order, each followed by a single space (so a
        non-empty result ends with a trailing space). Returns ``''`` when the
        file contains no items.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an item has no ``'text'`` entry.
    """
    # Pin the encoding: JSON is UTF-8, and open()'s default depends on the
    # platform locale, which can silently garble non-ASCII text.
    with open(fp, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # A single join avoids re-copying the growing string on every `+=`.
    return ''.join(news['text'] + ' ' for news in data)


# Files in the data directory that are not per-country news dumps.
# stop_words.txt is read from this same directory further down the notebook,
# so it must be skipped here or json.load would fail on it.
non_data_files = {'README.md', 'stop_words.txt'}

countries = []
docs = []
dir_path = '/content/drive/MyDrive/news-data'
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of `countries` / `docs` reproducible across runs.
for fn in tqdm(sorted(os.listdir(dir_path))):
    if fn in non_data_files:
        continue
    # The country name is the part of the file name before the first dot.
    countries.append(fn.split('.')[0])
    docs.append(get_document_by_country(os.path.join(dir_path, fn)))


100%|██████████| 11/11 [00:32<00:00,  2.94s/it]


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the per-country documents, using scikit-learn's built-in
# English stop-word list.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Fit the vocabulary and transform `docs` in one step; later cells iterate the
# result row by row and densify each row with .toarray().
tfidf_vecs = tfidf_vectorizer.fit_transform(docs)


In [108]:
# get words with largest tfidf-score in each country
import numpy as np

# Vocabulary terms as an array, so they can be fancy-indexed with the
# positions returned by argsort.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    corpus = np.asarray(tfidf_vectorizer.get_feature_names_out())
else:
    corpus = np.array(tfidf_vectorizer.get_feature_names())

def get_top_words(vec, corpus, n):
    """Return the ``n`` entries of ``corpus`` with the highest scores in ``vec``.

    Args:
        vec: Array of scores supporting ``.argsort()``; in this notebook a
            flattened 1-D row of the TF-IDF matrix.
        corpus: Array of terms, indexable by an array of positions.
        n: Number of terms to return.

    Returns:
        The selected entries of ``corpus``, highest score first.
    """
    ascending = vec.argsort()
    descending = ascending[::-1]
    best_positions = descending[:n]
    return corpus[best_positions]




In [109]:
# For every term, count how many documents rank it among their 200
# highest-scoring TF-IDF terms.

word_freq = {}
for sparse_row in tfidf_vecs:
    # Densify the row into a flat 1-D array before ranking its scores.
    vec = sparse_row.toarray().flatten()
    for term in get_top_words(vec, corpus, 200):
        word_freq[term] = word_freq.get(term, 0) + 1

# Keep the terms that appear in exactly one country's top-scoring list.
unique_words = [word for word, count in word_freq.items() if count == 1]


In [None]:
# Hand-picked set of ordinary English words.
# NOTE(review): the previous comment here ("save stop words to") was cut off,
# and no code that uses `keep_words` is visible in this notebook. Presumably
# these words are meant to be excluded from the stop-word list saved to
# stop_words.txt — TODO confirm.
keep_words = {
    'friend',
    'wrong',
    'offers',
    'thanks',
    'privacy',
    'updates',
    'protest',
    'protesters',
    'protests',
    'anti',
    'advertisement',
    'families',
    'nuclear',
    'tourism',
    'cultural',
    'book',
    'pressure',
    'violence',
    'lives',
    'share',
    'agreement',
    'festival',
    'policy',
    'efforts',
    'force',
    'corruption',
    'youths',
    'defense',
    'thinks',
    'threat',
    'sabotage',
    'wants',
    'opposition',
    'stressed',
    
}

In [3]:
# Load the saved stop-word list (one word per line).
save_path = '/content/drive/MyDrive/news-data/stop_words.txt'
# Pin the encoding: the list contains non-ASCII entries, and open()'s default
# encoding depends on the platform locale.
with open(save_path, 'r', encoding='utf-8') as f:
    stripped_lines = (line.strip() for line in f)
    # Drop blank lines so the empty string never becomes a stop word.
    stop_words = [word for word in stripped_lines if word]

stop_words


['sri',
 'lanka',
 'colombo',
 'send',
 'jaffna',
 'matara',
 'kandy',
 'galle',
 'lankaâ',
 'lankan',
 'rajapaksa',
 'ceb',
 'unp',
 'pix',
 'sirisena',
 'hitad',
 'cabinet',
 'dengue',
 'prof',
 'thera',
 'donâ',
 'perera',
 'puttalam',
 'upfa',
 'countryâ',
 'tamil',
 'nuwara',
 'present',
 'pic',
 'verifyerrors',
 'getty',
 'pictures',
 'sign',
 'september',
 '2022',
 'address',
 'enter',
 'valid',
 'afp',
 'images',
 'august',
 'london',
 'message',
 'emailed',
 'offers',
 'reuters',
 'signing',
 'notice',
 'updates',
 'england',
 'brexit',
 'queen',
 'newsletter',
 'inside',
 'inbox',
 'outside',
 'epa',
 'ii',
 'emails',
 'elizabeth',
 'ratings',
 'johnson',
 'white',
 'indypl100',
 'black',
 'headlines',
 'indyarts',
 'tv',
 'wales',
 'delaney',
 'christofilou',
 'canada',
 'toronto',
 'com',
 '___',
 'thestar',
 'https',
 'canadian',
 'ontario',
 'york',
 'run',
 'apnews',
 'jays',
 'torontostarreprints',
 '14',
 'www',
 '13',
 'series',
 'iran',
 'tehran',
 'iranian',
 'islam

In [4]:
# Number of stop words loaded from stop_words.txt.
len(stop_words)

278

In [5]:
import string

# Show the punctuation characters provided by the standard library's `string` module.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'