In [None]:
from io import BytesIO
from io import TextIOWrapper
from zipfile import ZipFile
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import glob
import sys
import csv
import os
import string
import time
import nltk
import requests
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# or: requests.get(url).content

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Date window for the GDELT pull, expressed as yyyymmdd integers.
# The file-list cells keep files whose date satisfies starttime <= date < endtime,
# so endtime itself is excluded.
starttime = 20200629  # change to 20191101
endtime = 20200630  # change to current date

# Positive / negative keyword dictionaries, read with np.loadtxt(dtype=str)
# and converted to plain Python objects with .tolist().
key_pos, key_neg = (
    np.loadtxt(dictionary_path, dtype=str).tolist()
    for dictionary_path in (
        '/content/drive/Shared drives/FNA XN Spring 2020 Project 4/Data/dictionaries/posi_dic.txt',
        '/content/drive/Shared drives/FNA XN Spring 2020 Project 4/Data/dictionaries/nega_dic.txt',
    )
)

In [None]:
# Build a lookup from the first field of GDELT's domains-by-country reference
# file to its second field. Downstream code looks article domains up in this
# dict and compares the value against the requested country codes.
countryreference = {}
with urlopen('http://data.gdeltproject.org/blog/2018-news-outlets-by-country-may2018-update/MASTER-GDELTDOMAINSBYCOUNTRY-MAY2018.TXT') as resp:
  # The context manager closes the HTTP response once the body has been read.
  gdeltref = resp.read().decode('utf-8')

for line in gdeltref.split('\n'):
  nline = line.split('\t')
  # Only rows with exactly three tab-separated fields are used.
  if len(nline)==3:
    countryreference[nline[0]] = nline[1]

In [None]:
# English dataset: collect the GKG archive URLs listed in the GDELT v2 master
# file list whose date falls inside the half-open window [starttime, endtime).
with urlopen('http://data.gdeltproject.org/gdeltv2/masterfilelist.txt') as resp:
  # The context manager closes the HTTP response once the body has been read.
  gdeltfiles = resp.read().decode('utf-8')

eng_filelist = []
for line in gdeltfiles.split('\n'):
  nline = line.split(' ')
  # Usable entries have three space-separated fields; the third is the URL.
  if len(nline)==3 and 'gkg' in nline[2]:
    # Characters 37..44 of the URL are parsed as the yyyymmdd date (once per line).
    filedate = int(nline[2][37:45])
    if starttime <= filedate < endtime:
      eng_filelist.append(nline[2])

In [None]:
# Translingual dataset: collect the GKG archive URLs listed in the GDELT v2
# translation master file list whose date falls inside the half-open window
# [starttime, endtime).
with urlopen('http://data.gdeltproject.org/gdeltv2/masterfilelist-translation.txt') as resp:
  # The context manager closes the HTTP response once the body has been read.
  gdeltfiles = resp.read().decode('utf-8')

transl_filelist = []
for line in gdeltfiles.split('\n'):
  nline = line.split(' ')
  # Usable entries have three space-separated fields; the third is the URL.
  if len(nline)==3 and 'gkg' in nline[2]:
    # Characters 37..44 of the URL are parsed as the yyyymmdd date (once per line).
    filedate = int(nline[2][37:45])
    if starttime <= filedate < endtime:
      transl_filelist.append(nline[2])

In [None]:
# Sanity check: how many GKG archives were selected in each file list.
print(
    'Files in each dataset, Translated:', len(transl_filelist),
    'English:', len(eng_filelist),
)

Files in each dataset, Translated: 96 English: 96


In [None]:
# Set of Covid-19 themes.
# get_urls treats a record as Covid-related when any of these strings occurs
# as a substring of the record's themes field.
covidthemes = [
    'CORONAVIRUS',
    'DISEASE',
    'INFECTIOUS',
    'VIRUS',
]

In [None]:
def get_urls(countries, translation=False):
  """Collect Covid-related article URLs per country and date from GKG archives.

  Downloads every archive in ``transl_filelist`` (when ``translation`` is
  true) or ``eng_filelist`` (otherwise), reads each tab-separated record and
  keeps those whose themes field contains any entry of ``covidthemes`` and
  whose domain maps, via ``countryreference``, to one of ``countries``.

  Args:
    countries: iterable of country codes as they appear in the values of
      ``countryreference``.
    translation: choose the translated file list instead of the English one.

  Returns:
    dict mapping country code -> {yyyymmdd int -> list of (domain, url)}.
    Every processed date is present for every country, possibly with an
    empty list.
  """
  flist = transl_filelist if translation else eng_filelist
  urldictionary = {cc: {} for cc in countries}
  counter = 0
  for fname in tqdm(flist):
    # Download the archive fully, then release the HTTP connection.
    with urlopen(fname) as resp:
      payload = BytesIO(resp.read())
    # Characters 37..44 of the archive URL are parsed as the yyyymmdd date.
    fdate = int(fname[37:45])
    for ccds in urldictionary.values():
      ccds.setdefault(fdate, [])
    with ZipFile(payload) as archive:
      for member in archive.namelist():
        with archive.open(member) as fh:
          # Stream the member line by line instead of loading it whole.
          for line in fh:
            try:
              nline = line.decode('utf-8').split('\t')
              dom = nline[3]
              url = nline[4]
              themes = nline[8]
            except (UnicodeDecodeError, IndexError):
              # Skip undecodable or truncated records only; anything else
              # (including KeyboardInterrupt) is allowed to propagate.
              continue
            # Substring match against the raw themes field.
            if not any(theme in themes for theme in covidthemes):
              continue
            if dom in countryreference and countryreference[dom] in urldictionary:
              urldictionary[countryreference[dom]][fdate].append((dom, url))
              counter += 1
  print('Found', counter, 'articles!')
  return urldictionary

In [None]:
def cleantext(text):
    """Normalise scraped article text to a restricted ASCII character set.

    Steps, in order:
      1. delete newline, tab and carriage-return characters (they are removed,
         not replaced by a space, so words separated only by a line break end
         up joined);
      2. map typographic quotes and the em dash to their ASCII counterparts;
      3. turn non-breaking spaces into regular spaces;
      4. replace every run of characters outside
         ``A-Z a-z 0-9 - . ! ? , ; $ # @ ( / ) ' "`` with a single space;
      5. collapse whitespace runs to a single space.

    Args:
        text: the raw text (str).

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    # Control characters are dropped outright.
    for control_char in ('\n', '\t', '\r'):
        text = text.replace(control_char, '')

    # Smart punctuation -> plain ASCII.
    text = text.replace('\u2019', "'")   # right single quote / apostrophe
    text = text.replace('\u201c', '"')   # left double quote
    text = text.replace('\u201d', '"')   # right double quote
    text = text.replace('\u2014', '-')   # em dash

    text = text.replace('\xa0', ' ')     # non-breaking space

    # Raw strings keep the regex escapes valid Python (no invalid-escape warnings).
    text = re.sub(r'''[^A-Za-z0-9-.!?,;$#@\(/)'""]+''', ' ', text)  # apostrophe included
    text = re.sub(r'\s+', ' ', text)
    return text

def scrapetext(url, timeout=30):
    """Download a page and return the cleaned text of its longer paragraphs.

    The page is parsed with BeautifulSoup (lxml parser); the text of every
    ``<p>`` under ``<body>`` that is longer than 30 characters is concatenated
    (each prefixed by a space) and passed through ``cleantext``.

    Args:
        url: address of the article to fetch.
        timeout: seconds to wait on the network before giving up, so that a
            single unresponsive host cannot stall a long scraping run.

    Returns:
        The cleaned text, or '' when the page cannot be fetched or parsed
        (best-effort scraping: failures are deliberately not raised).
    """
    try:
        # The context manager closes the connection once the page is parsed.
        with urlopen(url, timeout=timeout) as page:
            soup = BeautifulSoup(page, "lxml")
        # soup.body is None for pages without <body>; the resulting
        # AttributeError is handled below like any other scraping failure.
        texts = ''.join(
            ' ' + p.text
            for p in soup.body.find_all('p')
            if len(p.text) > 30
        )
        return cleantext(texts)
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        return ''


In [None]:
def body_keyword_count_lemma(body, key_positive, key_negative):
    """Count words of ``body`` that contain a positive / negative keyword lemma.

    Each keyword is lemmatized with WordNetLemmatizer; a whitespace-separated
    word counts as positive (negative) when at least one positive (negative)
    lemma occurs in it as a substring. A word is counted at most once per
    polarity but may count for both. Matching is case-sensitive.

    Args:
        body: text to analyse.
        key_positive: iterable of positive keywords.
        key_negative: iterable of negative keywords.

    Returns:
        Tuple ``(total_words, positive_count, negative_count)``.
    """
    lemmatizer = WordNetLemmatizer()
    # Lemmatize each keyword once up front instead of once per (word, keyword) pair.
    positive_lemmas = {lemmatizer.lemmatize(key_p) for key_p in key_positive}
    negative_lemmas = {lemmatizer.lemmatize(key_n) for key_n in key_negative}

    words = body.split()
    counter_p = sum(
        1 for word in words if any(lemma in word for lemma in positive_lemmas)
    )
    counter_n = sum(
        1 for word in words if any(lemma in word for lemma in negative_lemmas)
    )
    return (len(words), counter_p, counter_n)

In [None]:
def processurls(urldict, savename, save=True, threshold=500):
  """Scrape, score and (optionally) save the articles of each date.

  For every ``date -> links`` entry of ``urldict`` the links are scraped with
  ``scrapetext`` until ``threshold`` usable articles (cleaned text longer than
  50 characters) have been collected. Each article is scored with
  ``body_keyword_count_lemma`` against ``key_pos`` / ``key_neg``. A URL that
  produced a usable article is never processed again, including on later dates.

  Args:
    urldict: dict mapping a date key to a list of ``(domain, url)`` pairs.
    savename: path prefix; output goes to ``<savename>_<date>.csv``.
    save: write the per-date CSV files when true; when false nothing is written.
    threshold: maximum number of articles kept per date.
  """
  columns = ['date', 'dom', 'url', 'text', 'total', 'positive', 'negative']
  usedlinks = set()  # set: O(1) membership test on every link
  for key, links in urldict.items():
    records = []
    for link in links:
      # Stop as soon as exactly `threshold` articles have been kept.
      if len(records) >= threshold:
        break
      dom, url = link[0], link[1]
      if url in usedlinks:
        continue
      text = scrapetext(url)
      if len(text) <= 50: #50 character limit
        continue
      tot, pos, neg = body_keyword_count_lemma(text, key_pos, key_neg)
      records.append({'date': key, 'dom': dom, 'url': url, 'text': text,
                      'total': tot, 'positive': pos, 'negative': neg})
      usedlinks.add(url)
    if save:
      # Passing `columns` keeps the header even when no article was kept.
      df = pd.DataFrame(records, columns=columns)
      df.to_csv(savename+'_'+str(key)+'.csv', sep=',')
  print('DONE')

In [None]:
def collect_country(countries, translation, filepath, threshold):
  """Run the full pipeline for a set of countries.

  Gathers candidate article URLs with ``get_urls``, prints how many were found
  per country, then hands each country's ``date -> links`` mapping to
  ``processurls``, using ``filepath + <country code>`` as the output prefix.

  Args:
    countries: country codes to collect.
    translation: True for the translated file list, False for the English one.
    filepath: directory/prefix under which the per-country files are written.
    threshold: per-date article limit forwarded to ``processurls``.
  """
  print('Collecting GDELT for', countries)
  ccdict = get_urls(countries, translation)
  print('Gathered', len(ccdict), 'countries')
  for cc, urldict in ccdict.items():
    # urldict maps date -> list of links; iterate the values, not the bare
    # keys (unpacking an integer date key raises TypeError).
    count = sum(len(ddlinks) for ddlinks in urldict.values())
    print(cc, 'N Articles', count)
  for cc, urldict in ccdict.items():
    processurls(urldict, filepath+cc, True, threshold)

In [None]:
# Run configuration for this collection pass.
countryset = ['GM', 'SW']
threshold = 500  # daily limit for news pieces
savefilepath = '/content/drive/Shared drives/FNA XN Spring 2020 Project 4/Data/fixed_data/'

collect_country(
    countries=countryset,
    translation=True,  # True for translated and False for eng
    filepath=savefilepath,
    threshold=threshold,
)

  0%|          | 0/96 [00:00<?, ?it/s]

Collecting GDELT for ['GM', 'SW']


100%|██████████| 96/96 [01:33<00:00,  1.03it/s]


Found 3337 articles!
Gathered 2 days
DONE
DONE
