In [None]:
# Libraries 

import os
import psutil
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

from newsplease import NewsPlease

from multiprocess import Pool

from pymongo import MongoClient

from subprocess import check_output

# Starting MongoDB
# The sudo password is read from the SUDO_PASSWORD environment variable so it is
# never saved inside the notebook; when the variable is unset it falls back to
# the empty string.
password = os.environ.get('SUDO_PASSWORD', '') # YOUR SYSTEM PASSWORD
mongod_restart_command = "sudo -S systemctl restart mongod"
# `sudo -S` reads the password from stdin, which the `echo` pipe provides.
os.system('echo %s | %s' % (password, mongod_restart_command))

In [None]:
# Functions 

def article_scraper(url): 
    """Download one article and extract its title, text and publication date.

    Parameters
    ----------
    url : str
        Address of the article to download.

    Returns
    -------
    dict or None
        A dict with the keys 'SOURCEURL', 'Title', 'Text' and 'PostDate'.
        'Text' is None when none of the text fields was extracted, and
        'PostDate' is ``str(date_publish.date())`` or None when no publish
        date was found. None is returned instead of a dict when downloading
        or parsing fails.
    """

    try: 
        source = requests.get(url, timeout = 10)

        article_dict = NewsPlease.from_html(source.content).get_dict()

        title = article_dict['title']

        # Each field is used once (the description used to be appended a second
        # time, which duplicated it in the stored text); missing fields are skipped.
        text_fields = ('description', 'maintext', 'text')
        texts = [article_dict[field] for field in text_fields if article_dict[field] is not None]

        scraped_date = article_dict['date_publish']
        post_date = str(scraped_date.date()) if scraped_date is not None else None

        if texts:
            # Flatten to one line: fields and embedded line breaks are both joined with ". "
            full_text = ". ".join(". ".join(texts).splitlines())
        else:
            full_text = None

        results = {'SOURCEURL':url, 'Title':title, 'Text':full_text, 'PostDate':post_date}
        
    except Exception: 
        # Best-effort scraping: any download/parse error yields None, while
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        results = None 

    return results

def read_mongoDB(localhost, database, collection): 
    """Load every document of a MongoDB collection into a DataFrame.

    Parameters
    ----------
    localhost : str
        Port of the MongoDB server on localhost, given as a string because it
        is concatenated into the connection URI (e.g. "27017").
    database : str
        Name of the database.
    collection : str
        Name of the collection.

    Returns
    -------
    pandas.DataFrame
        One row per document, without the "_id" column. The frame is empty
        when the collection holds no documents.
    """
    
    # The context manager closes the connection once the documents are fetched,
    # so repeated calls do not accumulate open connections.
    with MongoClient("mongodb://localhost:" + localhost + "/") as client:
        documents = list(client[database][collection].find())
    
    data = pd.DataFrame(documents)
    
    # errors = "ignore": a frame built from an empty collection has no "_id" column
    result = data.drop(columns = "_id", errors = "ignore")
    
    return result
    
def insert_mongoDB(df, localhost, database, collection): 
    """Insert every row of a DataFrame into a MongoDB collection.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to store; each row becomes one document keyed by column name.
    localhost : str
        Port of the MongoDB server on localhost, given as a string because it
        is concatenated into the connection URI (e.g. "27017").
    database : str
        Name of the database.
    collection : str
        Name of the collection.

    Returns
    -------
    None
    """
    
    records = df.to_dict('records')
    
    # insert_many rejects an empty document list, so there is nothing to do
    if not records:
        return None
    
    # The context manager closes the connection after the insert, so repeated
    # calls do not accumulate open connections.
    with MongoClient("mongodb://localhost:" + localhost + "/") as client:
        client[database][collection].insert_many(records)
    
def get_pid(name): 
    """Run ``pidof`` for ``name`` and return its raw standard output."""
    pidof_command = ["pidof", name]
    raw_output = check_output(pidof_command)
    return raw_output

def process_ram_retrieve(process_name): 
    """Return the resident memory (RSS) used by a process, in MiB.

    Parameters
    ----------
    process_name : str
        Program name handed to ``get_pid``.

    Returns
    -------
    float
        RSS in MiB, summed over every process id reported for the name.
    """
    
    # pidof prints all matching process ids on one line separated by spaces,
    # so split the output instead of converting it as a single integer.
    process_ids = [int(pid) for pid in get_pid(process_name).decode().split()]
    
    total_rss = sum(psutil.Process(pid).memory_info().rss for pid in process_ids)
    process_ram = total_rss / 1024 ** 2
        
    return process_ram

In [None]:
# Read data 

# Feather file looked up relative to the notebook's working directory
df = pd.read_feather(path = 'GDELT_DATA.fthr')

In [None]:
# Get links 

# Keep each distinct URL once. The isinstance check drops missing values: NaN is
# truthy and not a string, so the substring test would raise TypeError on it.
links = [
    link
    for link in df['SOURCEURL'].unique()
    if isinstance(link, str) and 'http' in link
]

In [None]:
# Remove scraped links 

scraped_df = read_mongoDB("27017", "NLP701_Project", "SCRAPED_ARTICLES")
os.system('echo %s | %s' % (password, mongod_restart_command))

# article_scraper stores the URL under 'SOURCEURL'; 'link' is honoured as well in
# case the collection holds documents written with that key.
scraped_links = set()
for url_column in ('SOURCEURL', 'link'):
    if url_column in scraped_df.columns:
        scraped_links.update(scraped_df[url_column].dropna().unique())

# Filter (rather than take a set difference) so the original link order is kept
remaining_links = [link for link in links if link not in scraped_links]

# The chunking cell reads `links`, so point it at the pending URLs only.
# Re-running this cell is harmless because the filter is idempotent.
links = remaining_links

In [None]:
# Get chunk into parallels 

# Consecutive slices of `links`; only the last one may hold fewer entries
chunk_size = 15
chunk_starts = range(0, len(links), chunk_size)
link_chunks = [links[start:start + chunk_size] for start in chunk_starts]

In [None]:
# Parallel scrape all links 

num_chunk = 1
p = Pool(15)
try:
    for chunk in tqdm(link_chunks): 

        try:

            # Scrape in parallel; get() blocks until the whole chunk is done
            result = p.map_async(article_scraper, chunk)
            data = result.get(timeout = None)

            # article_scraper returns None for URLs it could not process
            not_none = [elm for elm in data if elm]

            # `data` holds one entry per URL even when every scrape failed,
            # so the filtered list decides whether there is anything to store.
            if not_none: 

                # Convert to df
                data_df = pd.DataFrame(not_none)

                # Load to MongoDB
                insert_mongoDB(data_df, "27017", "NLP701_Project", "SCRAPED_ARTICLES")

                # Check RAM by MongoDB: restart mongod once it exceeds 1000 MiB
                process_ram = process_ram_retrieve('mongod')
                if process_ram > 1000: 

                    os.system('echo %s | %s' % (password, mongod_restart_command))
                    print(num_chunk)

        except KeyboardInterrupt: 

            # Stop scraping; the pool is shut down in the finally clause
            break

        except Exception as error:

            # Keep going with the next chunk, but report what went wrong
            print('Chunk %d failed: %r' % (num_chunk, error))

        num_chunk += 1

finally:
    # Always release the worker processes, however the loop ended
    p.terminate()
    p.join()