# 1 Introduction

## 1.0 Package imports

In [1]:
from newsplease import NewsPlease
import pandas as pd
import nltk
from tqdm import tnrange
import re
import multiprocessing
import pickle
import os

# 2 Data Download

## 2.0 Constants

In [3]:
# Days in each month of a non-leap year, each bumped by one so the value can be
# used directly as the exclusive upper bound of range(1, ...) in load_month.
lengths = [days + 1 for days in (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)]

# Keywords used to decide whether an article is worth scraping. They are tested
# as plain substrings of the URL-derived title (`word in title` in
# find_link_to_scrape), so short stems such as 'agro' or 'encroach' also match
# longer words containing them. Each keyword is listed exactly once.
relevant_words = [
    'land', 'forest', 'agriculture',
    'farm', 'farmer', 'plantation', 'agrarian',
    'smallholder', 'grazing', 'development', 'habitat',
    'resource', 'cattle', 'dispute', 'strife', 'peat',
    'rice', 'palm oil', 'sugarcane', 'cassava', 'coconut',
    'corn', 'mango', 'orange', 'maize', 'wheat', 'sorghum',
    'bananas', 'tomatoes', 'citrus',
    'livestock', 'kill', 'dead', 'airport',
    'aluminum', 'mining', 'agro', 'dam',
    'road', 'infrastructure', 'transmission',
    'conservation', 'settlement', 'displace',
    'exile', 'caste', 'conflict', 'relocation',
    'village', 'encroach', 'fertilizer', 'mine',
    'illegal mining', 'malnutrition', 'contamination',
    'mangrove', 'water', 'cow', 'appropriation',
    'appropriated', 'protest', 'environmental', 'pollution',
    'copper', 'iron', 'timber', 'acre', 'hectare',
]

## 2.1 Function definitions

In [9]:
def load_month(month):
    """Read every daily CSV of one 2018 month from the ``data/`` folder.

    Args:
        month: Month number, 1-12. Used both for the file name prefix and to
            look up the day count in the module-level ``lengths`` list.

    Returns:
        A list of DataFrames, one per day, in calendar order.
    """
    mm = str(month).zfill(2)
    frames = []
    # lengths already holds "days + 1", so this range covers day 1..last day.
    for day in range(1, lengths[month - 1]):
        filename = "2018{}{}.csv".format(mm, str(day).zfill(2))
        frames.append(pd.read_csv("data/" + filename))
    return frames

def save_obj(obj, name, directory=None):
    """Pickle ``obj`` to ``<directory>/<name>.pkl``.

    Args:
        obj: Any picklable object.
        name: File name without the ``.pkl`` extension.
        directory: Folder to write into. When omitted, falls back to
            ``text/<month>`` built from the notebook-level ``month`` variable,
            which is the original behaviour.
    """
    if directory is None:
        # Legacy default. `month` is a notebook global that several cells
        # rebind, so pass `directory` explicitly whenever possible.
        directory = os.path.join('text', str(month))
    # Create the folder on demand so a fresh checkout does not fail on open().
    os.makedirs(directory, exist_ok=True)
    # Write-only binary mode is sufficient; nothing is read back here.
    with open(os.path.join(directory, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def find_link_to_scrape(df, keywords=None):
    """Flag rows whose URL slug contains one of the relevant keywords.

    For each row, the longest hyphen-joined token in ``SOURCEURL`` is taken as
    a pseudo-title (hyphens replaced by spaces). If any keyword occurs as a
    substring of that title, the URL is copied into ``to_scrape``.

    Args:
        df: DataFrame with a ``SOURCEURL`` column. It is modified in place:
            the ``to_scrape`` and ``title`` columns are (re)written.
        keywords: Optional collection of substrings to search for. Defaults to
            the module-level ``relevant_words`` list.

    Returns:
        The same DataFrame object, with ``to_scrape`` holding the URL for
        matching rows ('' otherwise) and ``title`` holding the derived title
        ('' when the URL has no hyphenated token).
    """
    keywords = tuple(relevant_words if keywords is None else keywords)

    titles = []
    targets = []
    # Iterate over values rather than looking rows up by label, so the frame
    # does not need a 0..n-1 index.
    for link in df['SOURCEURL']:
        title = ''
        target = ''
        # Non-string cells (e.g. NaN for a missing URL) have no slug to parse.
        slugs = re.findall(r'\w+(?:-\w+)+', link) if isinstance(link, str) else []
        if slugs:
            title = max(slugs, key=len).replace('-', ' ')
            if any(word in title for word in keywords):
                target = link
        titles.append(title)
        targets.append(target)

    # Assign whole columns at once; element-wise chained assignment
    # (df['col'][i] = ...) is unreliable and warns in pandas.
    df['to_scrape'] = targets
    df['title'] = titles
    return df

def combine_days(dfs):
    """Tag each daily frame, stack them, and keep only rows flagged for scraping.

    Args:
        dfs: List of daily DataFrames; each is passed to find_link_to_scrape.

    Returns:
        A DataFrame of the rows whose ``to_scrape`` is non-empty. The index is
        reset, and the previous index is kept as an ``index`` column.
    """
    tagged = []
    # tnrange shows a notebook progress bar while the days are processed.
    for day in tnrange(len(dfs)):
        tagged.append(find_link_to_scrape(dfs[day]))
    whole_month = pd.concat(tagged)
    flagged = whole_month['to_scrape'] != ''
    return whole_month[flagged].reset_index()

## 2.2 Function execution

In [None]:
# Make sure the output folder exists before the first to_csv call.
os.makedirs("interim", exist_ok=True)

# Build one filtered CSV per month for months 1 through 10.
# NOTE(review): months 11 and 12 are not processed here although `lengths`
# has twelve entries - confirm whether that is intentional.
for i in range(1, 11):
    # Deliberately not named `month`: save_obj reads a global of that name to
    # build its output path, and binding it to a list of frames would break it.
    daily_frames = load_month(i)
    df = combine_days(daily_frames)
    df.to_csv("interim/{}.csv".format(str(i).zfill(2)))

In [31]:
month = '01'

def save_obj(obj, name, directory='../data/metadata/matching/'):
    """Pickle ``obj`` to ``<directory>/<name>.pkl``.

    NOTE: this redefines the earlier ``save_obj`` (which writes under
    ``text/<month>/``). Any cell run after this one that calls ``save_obj``
    gets this version.

    Args:
        obj: Any picklable object.
        name: File name without the ``.pkl`` extension.
        directory: Folder to write into; defaults to the matching-metadata
            folder used by this cell.
    """
    # Create the folder on demand so the cell also works on a fresh checkout.
    os.makedirs(directory, exist_ok=True)
    # Write-only binary mode is sufficient; nothing is read back here.
    with open(os.path.join(directory, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

df = pd.read_csv("../data/metadata/variables/{}.csv".format(month))
urls = df['to_scrape'].unique()

# Map each unique URL's position in `urls` to the row labels of `df` that
# carry that URL. The position is the same integer download_url receives, so
# a downloaded article can be traced back to its rows in the month's CSV.
# A single groupby pass replaces re-scanning the whole column once per URL.
rows_by_url = df.groupby('to_scrape', sort=False).groups
mapping_dictionary = {
    # Missing URLs (NaN) are not grouped by pandas and match no rows.
    i: rows_by_url[url].tolist() if url in rows_by_url else []
    for i, url in enumerate(urls)
}

save_obj(mapping_dictionary, month)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
month = 11

# Folder that receives this month's pickled articles; create it if it is missing.
month_dir = "text/{}".format(str(month))
if not os.path.exists(month_dir):
    os.makedirs(month_dir)

    
def download_url(i):
    """Download the article at ``urls[i]`` and pickle it under ``text/<month>/``.

    The file is named ``<i zero-padded to 5 digits>.pkl``. This matches what
    the resume logic in this cell parses back out of the ``text/<month>/``
    listing. The pickle is written here directly rather than through
    ``save_obj``, because the ``save_obj`` defined last in the notebook writes
    to a different folder.

    Args:
        i: Position of the URL in the notebook-level ``urls`` array.

    Returns:
        1 when the article was fetched and saved, 0 when any exception
        occurred (the index and the exception are printed).
    """
    try:
        article = NewsPlease.from_url(urls[i])
        out_path = os.path.join("text", str(month), str(i).zfill(5) + ".pkl")
        with open(out_path, "wb") as f:
            pickle.dump(article, f, pickle.HIGHEST_PROTOCOL)
        return 1
    except Exception as ex:
        # Best effort: one failed article must not abort the whole pool run.
        print(i, ex)
        return 0

# NOTE(review): `urls` is whatever the mapping cell last loaded (it reads the
# CSV for its own `month` value) - confirm it corresponds to `month` here.
potential = range(0, len(urls))

# Resume support: skip URL positions that already have a pickle on disk.
# Names that do not start with five digits (e.g. .DS_Store) are ignored.
existing = os.listdir("text/{}/".format(str(month)))
existing = [int(x[:5]) for x in existing if ".DS" not in x and x[:5].isdigit()]
already_saved = set(existing)  # set membership keeps the filter below linear
potential = [x for x in potential if x not in already_saved]
print(len(potential))

pool = multiprocessing.Pool(16)
# download_url returns 1 or 0 per URL, so the results form a flat list of ints.
results = pool.map(download_url, potential)
print("{} of {} downloads succeeded".format(sum(results), len(results)))

In [None]:
# Shut down the worker pool created by the download cell: close() stops it
# from accepting new tasks, and join() then waits for the workers to exit.
# close() must be called before join().
pool.close()
pool.join()