In [1]:
# Libraries 

import os
import subprocess
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from alphabet_detector import AlphabetDetector
from pymongo import MongoClient
from setfit import SetFitModel, SetFitTrainer
from spacy.language import Language
from spacy_language_detection import LanguageDetector
from tqdm.notebook import tqdm
from transformers import pipeline

# Restart the local MongoDB service before anything connects to it.
# The sudo password is taken from the SUDO_PASSWORD environment variable so it
# never has to be typed into the notebook; it still defaults to '' as before.
password = os.environ.get('SUDO_PASSWORD', '')
mongod_restart_command = "sudo -S systemctl restart mongod"

# `sudo -S` reads the password from stdin. Passing it through `input` instead
# of building an `echo <password> | ...` shell string keeps the password out of
# the command line and avoids any shell interpretation of its characters.
try:
    restart_result = subprocess.run(mongod_restart_command.split(),
                                    input = password + '\n',
                                    text = True)
except OSError as error:
    # Best effort, like the original os.system call: report and carry on.
    print('Could not run "%s": %s' % (mongod_restart_command, error))
else:
    if restart_result.returncode != 0:
        print('"%s" exited with status %d' % (mongod_restart_command, restart_result.returncode))

# Adds DataFrame/Series.progress_apply; `ad` is the alphabet detector used by
# the text cleaning step.
tqdm.pandas()
ad = AlphabetDetector()

In [None]:
# Models 

# # Language detection 
# The "language_detector" factory is registered right here so that this cell
# does not depend on a helper defined in a later cell (on a fresh kernel the
# previous `func = get_lang_detector` lookup raised NameError). The guard makes
# the cell safe to re-run and leaves an already registered factory untouched.
if not Language.has_factory("language_detector"):
    @Language.factory("language_detector")
    def _create_language_detector(nlp, name):
        # Same construction as get_lang_detector: a detector seeded with 42.
        return LanguageDetector(seed = 42)

nlp_model = spacy.load("en_core_web_sm")
nlp_model.add_pipe('language_detector', last = True)

# News-category classifier used to separate political from non-political texts.
huff_classifier = pipeline("text-classification", model = "Yueh-Huan/news-category-classification-distilbert")

# Binary SetFit model loaded from the local 'politics_binary/' directory.
politics_binary = SetFitModel.from_pretrained('politics_binary/')

In [2]:
# Functions 

def read_mongoDB(localhost, database, collection): 
    """Load every document of a MongoDB collection into a DataFrame.

    Parameters
    ----------
    localhost : str or int
        Port of the MongoDB server (e.g. '27017'). Despite its name this is
        only the port; the host part of the URI is always "localhost".
    database : str
        Name of the database to open.
    collection : str
        Name of the collection to read.

    Returns
    -------
    pandas.DataFrame
        One row per document, without the "_id" column. An empty collection
        yields an empty DataFrame.
    """
    
    # The context manager closes the client once the documents are fetched,
    # so repeated calls do not leave connections open.
    with MongoClient("mongodb://localhost:" + str(localhost) + "/") as client:
        documents = list(client[database][collection].find())
    
    data = pd.DataFrame(documents)
    
    # errors = "ignore": a frame built from zero documents has no "_id" column.
    result = data.drop(columns = "_id", errors = "ignore")
    
    return result

def language_detection(text, nlp_model): 
    """Return the document-level language annotation of `text`.

    `nlp_model` is called on `text` and the `language` extension attribute of
    the resulting document (`doc._.language`) is returned unchanged.
    """

    return nlp_model(text)._.language

def get_lang_detector(nlp, name): 
    """Factory callback that builds the language detector component.

    Both arguments are accepted to match the factory signature but are not
    used; the detector is always created with seed 42.
    """
    
    detector = LanguageDetector(seed = 42)
    return detector

In [None]:
# File paths 

# List the project folder and keep the entries whose name contains 'fthr',
# each prefixed with the folder name.
# NOTE(review): later cells pick files from this list by position, while
# os.listdir returns entries in arbitrary order - confirm the indices used.
directory = 'NLP701 Project'
all_files = os.listdir(path = directory)
data_files = [f'{directory}/{name}' for name in all_files if 'fthr' in name]

In [None]:
# Get the scraped data 

# Saudi articles 
saudi_scraped = pd.read_feather(data_files[1])

# Vietnam articles: keep the rows tagged 'en', then drop the tag column 
vietnam_scraped = (pd.read_feather(data_files[2])
                   .loc[lambda frame: frame['lang'] == 'en']
                   .drop(columns = ['lang']))

# Afghanistan articles come from MongoDB 
afghanistan_scraped = read_mongoDB('27017', 'NLP701 Project', 'SCRAPED_ARTICLES')

# Stack the three sources and drop every row that has a missing value 
scraped_data = pd.concat([saudi_scraped, vietnam_scraped, afghanistan_scraped]).dropna()

In [None]:
# GDELT event filtering 

# Columns kept from every GDELT export.
GDELT_COLUMNS = ['Actor1CountryCode', 'Actor2CountryCode', 'IsRootEvent', 'EventCode',
                 'EventBaseCode', 'EventRootCode', 'QuadClass', 'GoldsteinScale', 'NumMentions',
                 'NumSources', 'NumArticles', 'AvgTone', 'DATEADDED', 'SOURCEURL']


def _filter_gdelt_events(events):
    """Apply the shared GDELT filter to one country's event table.

    Keeps only GDELT_COLUMNS, drops rows with missing values, keeps root
    events (IsRootEvent == 1) and finally keeps the rows whose GoldsteinScale
    and AvgTone are both positive or both negative.

    NOTE(review): the comparisons assume numeric columns - confirm for the
    MongoDB-backed Afghanistan data.
    """
    events = events[GDELT_COLUMNS].dropna()
    events = events[events['IsRootEvent'] == 1]
    
    both_positive = (events['GoldsteinScale'] > 0) & (events['AvgTone'] > 0)
    both_negative = (events['GoldsteinScale'] < 0) & (events['AvgTone'] < 0)
    
    return events[both_positive | both_negative]


# Saudi GDELT 
saudi_gdelt = _filter_gdelt_events(pd.read_feather(data_files[-1]))
np.save('saudi_filtered_links.npy', saudi_gdelt['SOURCEURL'].unique())

# Vietnam GDELT 
vietnam_gdelt = _filter_gdelt_events(pd.read_feather(data_files[3]))
np.save('vietnam_filtered_links.npy', vietnam_gdelt['SOURCEURL'].unique())

# Afghanistan GDELT (stored in MongoDB) 
afghanistan_gdelt = _filter_gdelt_events(read_mongoDB('27017', 'GDELT', 'Afghanistan'))
np.save('afghanistan_filtered_links.npy', afghanistan_gdelt['SOURCEURL'].unique())

# Regression

In [None]:
# Get the scraped data 

# File paths: entries of the project folder whose name contains 'fthr' 
directory = 'NLP701 Project'
all_files = os.listdir(path = directory)
data_files = [f'{directory}/{name}' for name in all_files if 'fthr' in name]

# Saudi articles 
saudi_scraped = pd.read_feather(data_files[1])

# Vietnam articles: keep the rows tagged 'en', then drop the tag column 
vietnam_scraped = (pd.read_feather(data_files[2])
                   .loc[lambda frame: frame['lang'] == 'en']
                   .drop(columns = ['lang']))

# Afghanistan articles come from MongoDB 
afghanistan_scraped = read_mongoDB('27017', 'GDELT', 'AfghanistanArticles')

# Stack the sources, drop rows with missing values, remove PostDate and
# index the result by article URL 
scraped_data = (pd.concat([saudi_scraped, vietnam_scraped, afghanistan_scraped])
                .dropna()
                .drop(columns = ['PostDate'])
                .set_index('SOURCEURL'))

In [None]:
# Read scraped data 

# Afghanistan: GoldsteinScale is cast to float before averaging 
afghanistan_gdelt_scraped = (pd.read_feather('afghanistan_gdelt_scraped.fthr')
                             .astype({'GoldsteinScale': float}))
afghanistan_mean_goldstein = afghanistan_gdelt_scraped.groupby('SOURCEURL')[['GoldsteinScale']].mean()

# Saudi 
saudi_gdelt_scraped = pd.read_feather('saudi_gdelt_scraped.fthr')
saudi_mean_goldstein = saudi_gdelt_scraped.groupby('SOURCEURL')[['GoldsteinScale']].mean()

# Vietnam 
vietnam_gdelt_scraped = pd.read_feather('vietnam_gdelt_scraped.fthr')
vietnam_mean_goldstein = vietnam_gdelt_scraped.groupby('SOURCEURL')[['GoldsteinScale']].mean()

# One frame of mean GoldsteinScale per URL, the three countries stacked row-wise 
mean_goldstein = pd.concat([afghanistan_mean_goldstein, saudi_mean_goldstein, vietnam_mean_goldstein])

In [None]:
# Clean scraped data 

# Join the per-URL mean Goldstein score with the scraped articles on their indexes 
scraped_goldstein = pd.merge(mean_goldstein, scraped_data, left_index = True, right_index = True)

# Fold the title into the body as "<Title>. <Text>" and drop the title column 
scraped_goldstein['Text'] = scraped_goldstein[['Title', 'Text']].agg('. '.join, axis = 1)
scraped_goldstein = scraped_goldstein.drop(columns = ['Title'])

# Keep the rows for which the alphabet detector accepts the text as LATIN only 
latin_only = scraped_goldstein['Text'].progress_apply(lambda text: ad.only_alphabet_chars(text, "LATIN"))
scraped_goldstein = scraped_goldstein[latin_only]

# Drop every article whose text contains at least one of the characters listed
# below (accented / extended Latin letters, ligatures and small capitals): a
# row is kept only when none of the entries occurs anywhere in its text.
# NOTE: the entry 'Ð ' in the first line carries a trailing space, so it only
# matches 'Ð' followed by a space; a bare 'Ð' appears again further down.
scraped_goldstein = scraped_goldstein[scraped_goldstein['Text'].progress_apply(lambda x: all([elm not in x for elm in ['é', 'è', 'ê', 'ë', 'ç', 'ñ', 'ø', 'ð', 'Ð ', 'å', 'æ', 'œ', 'ē', 'č', 'ŭ',  'š', 'ò',
 'Ó',
 'á',
 'ó',
 'ə',
 'İ',
 'ş',
 'Ş',
 'ı',
 'ğ',
 'ù',
 'ä',
 'ü',
 'ö',
 'ß',
 'ł',
 'ń',
 'ż',
 'ś',
 'ę',
 'ą',
 'ć',
 'ï',
 'í',
 'ú',
 'ư',
 'ơ',
 'ô',
 'ệ',
 'ầ',
 'ọ',
 'ậ',
 'ộ',
 'ờ',
 'â',
 'Đ',
 'ễ',
 'ạ',
 'ì',
 'ấ',
 'Â',
 'ẩ',
 'ả',
 'Ô',
 'ỗ',
 'à',
 'ồ',
 'ề',
 'ĩ',
 'ố',
 'ị',
 'ă',
 'ế',
 'Ö',
 'Ü',
 'Ä',
 'ž',
 'ã',
 'ș',
 'ţ',
 'î',
 'Î',
 'ț',
 'Ș',
 'Ţ',
 'đ',
 'Č',
 'Ž',
 'Ú',
 'Á',
 'É',
 'Í',
 'Ț',
 'ớ',
 'Ç',
 'Ğ',
 'ﬁ',
 'Ă',
 'Ã',
 'ō',
 'ừ',
 'ň',
 'ý',
 'Ø',
 'ﬂ',
 'ḵ',
 'õ',
 'Õ',
 'ȃ',
 'ī',
 'ā',
 'ū',
 'È',
 'ų',
 'ė',
 'į',
 'ở',
 'ǎ',
 'Ḳ',
 'ļ',
 'Ķ',
 'Ę',
 'Ā',
 'ď',
 'Ě',
 'Ł',
 'Ą',
 'Š',
 'Ż',
 'Ē',
 'Ế',
 'ĺ',
 'ﬃ',
 'ﬀ',
 'ũ',
 'À',
 'Ộ',
 'ẫ',
 'ắ',
 'ặ',
 'ợ',
 'ỹ',
 'ứ',
 'ự',
 'ẽ',
 'ằ',
 'ữ',
 'ụ',
 'ổ',
 'ẻ',
 'Ï',
 'ů',
 'ě',
 'ř',
 'Ə',
 'Ś',
 'ź',
 'Ń',
 'ḥ',
 'Ḥ',
 'Å',
 'ỳ',
 'ể',
 'ủ',
 'ỏ',
 'Ꜥ',
 'Ū',
 'ử',
 'Ź',
 'ẵ',
 'ẹ',
 'Ờ',
 'Œ',
 'Ạ',
 'ỡ',
 'ỉ',
 'Û',
 'û',
 'ő',
 'Ê',
 'ť',
 'Ố',
 'ḫ',
 'Ð',
 'þ',
 'ṅ',
 'ṭ',
 'Ć',
 'Ư',
 'Ñ',
 'ṣ',
 'Æ',
 'ȇ',
 'Ɵ',
 'Ấ',
 'ỷ',
 'Ò',
 'ƒ',
 'ľ',
 'ŕ',
 'ħ',
 'Ġ',
 'Ħ',
 'Ẩ',
 'Į',
 'Ầ',
 'Ơ',
 'Ệ',
 'Ậ',
 'ỵ',
 'Ẵ',
 'Ả',
 'Ắ',
 'Ị',
 'Ũ',
 'Ì',
 'Ồ',
 'Ỹ',
 'ᴀ',
 'ɴ',
 'ᴍ',
 'ɪ',
 'ᴅ',
 'Ý',
 'Ṣ',
 'ḍ',
 'ĝ',
 'Ù',
 'Ō',
 'ņ',
 'ķ',
 'ꞌ',
 'ǀ',
 'Ő',
 'ṇ',
 'ÿ',
 'Ë',
 'Ť',
 'ŋ',
 'ʏ',
 'ᴏ',
 'ᴜ',
 'ʀ',
 'ᴡ',
 'ʟ',
 'ᴠ',
 'ᴇ',
 'ʙ',
 'ᴄ',
 'ɡ']]))]

# Language detection is the slow step of this cell, so its result is cached on
# disk: when the cache exists it is loaded, otherwise the languages are
# detected and the cache is written. (Previously the detection always ran and
# its result was then discarded by an unconditional read of a cache file that
# was never written here.)
EN_CACHE_PATH = 'en_scraped_goldstein.fthr'

if os.path.exists(EN_CACHE_PATH):
    scraped_goldstein = pd.read_feather(EN_CACHE_PATH)
else:
    scraped_goldstein = scraped_goldstein.assign(
        lang = scraped_goldstein['Text'].progress_apply(lambda x: language_detection(x, nlp_model)['language']))
    
    # The merged frame is indexed by article URL. Feather needs a default
    # index, and the later drop_duplicates(subset = ['SOURCEURL']) needs the
    # URL as a regular column.
    scraped_goldstein = scraped_goldstein.rename_axis('SOURCEURL').reset_index()
    scraped_goldstein = scraped_goldstein[scraped_goldstein['lang'] == 'en'].reset_index(drop = True)
    scraped_goldstein.to_feather(EN_CACHE_PATH)

# Keep English articles only (a no-op for a cache that is already filtered).
scraped_goldstein = scraped_goldstein[scraped_goldstein['lang'] == 'en']

# Remove digest-like pages: texts that mention 'news' together with either
# 'headline' or 'top' (case-insensitive substring match). The text is
# lower-cased once instead of once per condition; the selected rows are the
# same as with (news & headline) | (top & news).
lowered_text = scraped_goldstein['Text'].apply(lambda x: x.lower())
is_digest = lowered_text.str.contains('news') & (lowered_text.str.contains('headline') |
                                                 lowered_text.str.contains('top'))

# One row per article URL. The explicit copy makes the column assignment
# below act on an independent frame rather than on a slice.
scraped_goldstein_good = scraped_goldstein[~is_digest].drop_duplicates(subset = ['SOURCEURL']).copy()

# Keep only the articles the binary SetFit model labels as 1.
predictions = politics_binary.predict(list(scraped_goldstein_good['Text']))
scraped_goldstein_good['predictions'] = predictions
scraped_goldstein_good = scraped_goldstein_good[scraped_goldstein_good['predictions'] == 1]

# Feather requires a default index, hence the reset before saving.
scraped_goldstein_good.reset_index(drop = True).to_feather('scraped_goldstein_pol_classified.fthr')

In [14]:
# Reload the politically classified articles saved by the cleaning step.
scraped_goldstein_good = pd.read_feather('scraped_goldstein_pol_classified.fthr')

# URL arrays written by the GDELT filtering cell, one file per country.
# NOTE(review): allow_pickle = True unpickles the file content - only load
# files produced by this notebook.
saudi_links, vietnam_links, afghanistan_links = (
    list(np.load(path, allow_pickle = True))
    for path in ('saudi_filtered_links.npy',
                 'vietnam_filtered_links.npy',
                 'afghanistan_filtered_links.npy')
)

# Single list with the links of all three countries, in the order above.
all_links = [*saudi_links, *vietnam_links, *afghanistan_links]

In [16]:
# Keep only the articles whose URL is in the combined list of filtered links.
scraped_goldstein_good = scraped_goldstein_good.loc[scraped_goldstein_good['SOURCEURL'].isin(all_links)]

In [None]:
# Get political subset 

# Number of leading characters of each article passed to the classifier.
MAX_TEXT_CHARS = 2500

texts = [elm[:MAX_TEXT_CHARS] for elm in scraped_goldstein_good['Text'].unique()]

# Classify every text with the news-category model 

pol_not_dataset = []
for text in tqdm(texts): 
    
    # The pipeline is called directly so that truncation can be requested:
    # MAX_TEXT_CHARS characters may still tokenize to more tokens than the
    # model accepts (presumably 512 for this DistilBERT checkpoint), and
    # truncating prevents such inputs from failing. For a single string the
    # call returns a one-element list holding a {'label', 'score'} dict.
    result = huff_classifier(text, truncation = True)[0]
    result['text'] = text
    
    pol_not_dataset.append(result)

pol_not_df = pd.DataFrame(pol_not_dataset)

pol_not_df.to_feather('pol_not_df_2.fthr')

# Non-political examples: label outside the listed categories and score above
# 0.9, ordered from highest to lowest score.
outside_listed_labels = ~pol_not_df['label'].isin(['WORLDPOST', 'BUSINESS', 'WORLD NEWS', 'POLITICS', 'THE WORLDPOST', 'IMPACT'])
not_pol = pol_not_df[outside_listed_labels & (pol_not_df['score'] > 0.9)].sort_values('score', ascending = False)

# Political examples: one of the three labels below with score above 0.75.
pol = pol_not_df[pol_not_df['label'].isin(['WORLDPOST', 'WORLD NEWS', 'POLITICS']) &
                 (pol_not_df['score'] > 0.75)]

# Per label keep at most the 3155 highest-scoring rows, then stack the three
# selections and order them by score again.
pol_list = [pol[pol['label'] == topic].sort_values('score', ascending = False).head(3155)
            for topic in ['WORLDPOST', 'WORLD NEWS', 'POLITICS']]
pol = pd.concat(pol_list).sort_values('score', ascending = False)

# Feather requires a default index, hence the reset before each save.
not_pol.reset_index(drop = True).to_feather('not_pol.fthr')
pol.reset_index(drop = True).to_feather('pol.fthr')