In [1]:
# 1. Introduction

## 1.0 Package imports

In [1]:
from keras.models import load_model
from keras_bert import get_custom_objects
import os.path
import ktrain.text.preprocessor as tpp
import pickle
from newsplease import NewsPlease
import sys

Using TensorFlow backend.


using Keras version: 2.2.4-tf


In [2]:
#!{sys.executable} -m pip install ktrain==0.7.2
#!{sys.executable} -m pip install keras==2.2.5
#!{sys.executable} -m pip install tensorflow==1.14.0

# 2 Definitions

In [3]:
bert_model = load_model('../models/text_bert_coref_data.hdf5', custom_objects=get_custom_objects())
preproc_type = tpp.TEXT_PREPROCESSORS.get('bert', None)
preproc = preproc_type(maxlen=350,
                        max_features=35000,
                        classes = ['pos','neg'],
                        lang='en', ngram_range=1)

In [4]:
def is_land_conflict(text):
    """Returns True if text is environmental conflict related, otherwise returns False

        Parameters:
        text (str): news article as a string object

        Returns:
        is_conflict (bool): True if text is an env/land conflict; false otherwise.

    """

    # preprocess the text
    (text_processed,  _) = preproc.preprocess_train([text], verbose=0)

    # return model's prediction 
    return bool(bert_model.predict(text_processed).argmax(axis=1) )

# 3 Execution

The goal is to make a dataframe that has columns for `id`, `country`, `year`, `month`, `conflict`, saved in `../data/brazil/output/roberta/year/month.csv`

In [9]:
year = 2018
month = 8
country = 'indonesia'

path = f"../data/{country}/text/{str(year)}/{str(month).zfill(2)}/"
path_output = f'../data/{country}/output/{str(year)}/{str(month).zfill(2)}.csv'
to_process = [x for x in os.listdir(path) if '.pkl' in x]

In [10]:
from tqdm import tqdm_notebook, tnrange
import pandas as pd


results = {}

for idx in tnrange(0, len(to_process)):
    file = to_process[idx]
    index = file[:5]
    with open(path + file, 'rb') as pickle_file:
        content = pickle.load(pickle_file)
        if not content.text == None:
            pred = is_land_conflict(content.text)
            results[index] = pred

  import sys


HBox(children=(FloatProgress(value=0.0, max=840.0), HTML(value='')))




PermissionError: [Errno 13] Permission denied: '../data/indonesia/text/2018/08/00054.pkl'

# Analysis guide
- Select 9 random months from 2017, 2018, and 2019 in both Brazil and Indonesia
- Read through and look for any false negatives
- Read through and look for true positives (confict event) and semi-true positives (not in country, year)
- Save good examples of true positives
- Brazil, 01-2017, no false negatives, 19/103 false positives
- Brazil, 04-2017
- Brazil, 08-2017

In [None]:
import pandas as pd
n = 0
results_df = pd.DataFrame({'Index': [], 
                           'Conflict': [],
                           'Title': [],
                           'Text': []})
for i in results:
    if results[i] == True:
        n += 1
        with open(path + i + '.pkl', 'rb') as pickle_file:
            content = pickle.load(pickle_file)
            results_df = results_df.append({'Index': i, 
                           'Conflict': True,
                           'Title': content.title,
                           'Text': content.text}, ignore_index = True)
            print(n, content.title)

In [None]:
# This data goes to country/output/year/01.csv
results_df_binary = pd.DataFrame({'Index': list(results.keys()), 
                           'Conflict': list(results.values())})
results_df_binary.to_csv(path_output, index = False)

In [12]:
results_df

Unnamed: 0,Index,Conflict,Title,Text
0,00642,1.0,Aceh Conservation Agency Finds Dead Elephant W...,A baby elephant is being put on a drip at the ...
1,00859,1.0,Genting Plantations sells indirect Indonesian ...,KUALA LUMPUR: Genting Plantations Bhd ’s (GenP...
2,00508,1.0,"Choppers sent to fight forest fires in Riau, K...","Central government to send choppers to Riau, K..."
3,00246,1.0,Cambodia’s rice exports fall sharply,Cambodia’s rice exports fall sharply\nCambodia...
4,00938,1.0,East Java's villagers hunt witches to put an e...,"news, world\nIn a small village, deep in the l..."
...,...,...,...,...
214,01035,1.0,The human element of mangrove management,As countries ponder how to encourage mangrove ...
215,00465,1.0,"Heartland anger, election heartbreak?",The Malaysian plantation district of Sungkai h...
216,00698,1.0,Indonesian pulp mill causing huge environmenta...,Jakarta (AFP) - Green groups said Thursday tha...
217,00854,1.0,"Indonesia to donate 10,000 MT of rice to Sri L...",The Ministry of Rural Economic Affairs has tak...
