In [1]:
import os
import numpy as np
import pandas as pd
import string, re
import json

import nltk
from nltk.corpus import stopwords

from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize


In [None]:
os.chdir("..")
INTER_PATH = os.path.join("data", "interim")
with open(os.path.join(INTER_PATH, "pdf_files.json")) as json_file:
    data = json.load(json_file)
    
#data.keys()

In [None]:
spa_stopwords = set(stopwords.words('spanish'))

In [None]:
def most_freq_words(doc_list, stopwords_set, num_freq_words):
    filtered_words = list()
    for doc in doc_list:
        tokens = [word.lower() for word in word_tokenize(doc)]
        filtered_words.extend([word for word in tokens if word.isalpha() and word not in stopwords_set and len(word) > 1])
    
    return FreqDist(filtered_words).most_common(num_freq_words)

In [None]:
def docs_from_json(data):
    doc_list = list()
    for doc_json in data.values():
        doc_list.append(doc_json["Text"])
    
    return doc_list

In [None]:
all_docs = docs_from_json(data)

In [None]:
spa_stopwords.add('ley')
spa_stopwords.add('artículo')
freq_words_map = most_freq_words(all_docs, spa_stopwords, 50)

In [None]:
freq_words_map

In [None]:
sents = [sent.lower() for sent in sent_tokenize(sample_txt)]
filtered_sents = [" ".join([word for word in sent.split() if word.isalpha() and word not in spa_stopwords]) for sent in sents]

In [None]:
FreqDist(filtered_sents)

### 1. Data Cleaning

- 'CreditoGanadero_Mexico': not readable

In [2]:
stopwords_spanish = set(stopwords.words('spanish')).union(set(['ley', 'artículo']))

In [3]:
puntuation = [p for p in set(string.punctuation) if p not in (".")]

In [None]:
def clean_stopwords(df):
    df = df.lower().replace("\n", " ").replace("\t", " ").strip(" ")
    df = "".join( c for c in df.split(" ") if c not in puntuation)
    df = " ".join([w for w in df.split() if w not in stopwords_spanish])
    #df = " ".join([w for w in df.split() if w not in additional_stopwords])
    return df

In [None]:
# From Json to DF
clean_docs = pd.DataFrame([])
for doc_name, doc_json in data.items():
    doc = pd.json_normalize(doc_json)
    doc.insert(0, "Document", doc_name) 
    #print(doc)
    clean_docs = clean_docs.append(doc, ignore_index = True)

# Set label
clean_docs.insert(11,'Label','Relevant')

In [4]:
os.chdir("..")
INTER_PATH = os.path.join("data", "interim")
data = pd.read_excel(os.path.join(INTER_PATH, "WRI_Ecolex_relevance_classification_sample.xlsx"))

data.tail(5)

Unnamed: 0,POLICY_ID,FULL_DOC_LINK,TITLE,DATE,SUBJECT,KEYWORDS,ABSTRACT,IS_RELEVANT,REVIEWED_FULL_DOC
995,LEX-FAOC113577,http://extwprlegs1.fao.org/docs/pdf/per113577.pdf,Resolución Nº 012/12/AG/SENASA/DSV - Requisito...,2012,Cultivated plants,"Plant protection, Textile plants/fibres, Pests...",La presente Resolución establece los requisito...,,
996,LEX-FAOC142723,http://extwprlegs1.fao.org/docs/pdf/per142723.pdf,Resolución Nº 045/14/MINAGRI/SENASA/DSV - Reti...,2014,Cultivated plants,"Hygiene/sanitary procedures, Planting material...",La presente Resolución retira determinadas pla...,,
997,LEX-FAOC125593,http://extwprlegs1.fao.org/docs/pdf/per125593.pdf,Resolución Nº 178/13/MINCETUR - Reglamento Int...,2013,Agricultural & rural development,"Indigenous peoples, Cultural heritage, Extensi...",La presente Resolución aprueba el Reglamento I...,,
998,LEX-FAOC142231,http://extwprlegs1.fao.org/docs/pdf/per142231.pdf,Resolución Nº 034/14/MINAGRI/SENASA/DSV - Modi...,2014,Cultivated plants,"Plant protection, Pests/diseases, Planting mat...",La presente Resolución modifica la que estable...,,
999,LEX-FAOC105348,http://extwprlegs1.fao.org/docs/pdf/per105348.pdf,Decreto Supremo Nº 048/11/EM - Modifica el Dec...,2011,"Energy, Mineral resources","Energy conservation/energy production, Oil, En...",El presente Decreto Supremo modifica el Glosar...,,


In [60]:
data.dropna(how='any', inplace=True)
print(data.shape)

(608, 10)


In [7]:
data['IS_RELEVANT'].sum()

137.0

In [8]:
data['REVIEWED_FULL_DOC'].sum()

25.0

In [9]:
clean_docs=data

### Lexical Analysis

In [10]:
corpus = clean_docs.ABSTRACT

In [11]:
all_words = [w.split() for w in corpus]

all_flat_words = [ewords for words in all_words for ewords in words]

In [12]:
#removing all the stop words from the corpus
all_flat_words_ns = [w for w in all_flat_words if w not in stopwords_spanish]

#removing all duplicates
set_nf = set(all_flat_words_ns)

In [13]:
print("Number of unique vocabulary words in the text_description column of the dataframe: %d"%len(set_nf))

Number of unique vocabulary words in the text_description column of the dataframe: 6239


### Data Pre-Processing

The following steps are performed:
   - Converting all of the data into lower case.
   - FInd the root of the words to further reduce the feature size

to do: When the document have 'Keywords', the keywords are added to the description.


In [14]:
from nltk import word_tokenize
from nltk.stem import SnowballStemmer

In [15]:
clean_docs.columns

Index(['POLICY_ID', 'FULL_DOC_LINK', 'TITLE', 'DATE', 'SUBJECT', 'KEYWORDS',
       'ABSTRACT', 'IS_RELEVANT', 'REVIEWED_FULL_DOC'],
      dtype='object')

In [16]:
porter = SnowballStemmer('spanish')
#porter=nltk.PorterStemmer()

for each_row in clean_docs.itertuples():
    # Add Keywords and Subject as part of the corpus
    m1=map(lambda x: x , (str(each_row[5])+' '+str(each_row[6])).lower().split())
    #m1 = map(lambda x: x , (each_row[11]).lower().split())
    #print(each_row[11])
    #Using Porter Stemmer in NLTK, find root
    m2 = map(lambda x: porter.stem(x), m1)
    #pre-processed string is stored in new column
    clean_docs.loc[each_row[0],'Desc'] = ' '.join(m2)

In [17]:
clean_docs.head(5)

Unnamed: 0,POLICY_ID,FULL_DOC_LINK,TITLE,DATE,SUBJECT,KEYWORDS,ABSTRACT,IS_RELEVANT,REVIEWED_FULL_DOC,Desc
0,LEX-FAOC091761,http://extwprlegs1.fao.org/docs/pdf/per91761.pdf,Resolución Nº 050/09/AG - Crea el Consejo Naci...,2009,Cultivated plants,"Institution, Textile plants/fibres, Contract/a...",La presente Resolución crea el Consejo Naciona...,0.0,0.0,"cultivat plants institution, textil plants/fib..."
1,LEX-FAOC122131,http://extwprlegs1.fao.org/docs/texts/mex12213...,Acuerdo que adiciona párrafos a la especificac...,2013,Wild species & ecosystems,"Aquatic animals, Management/conservation, Enda...",Estas disposiciones incorporan una nueva norma...,0.0,0.0,"wild speci & ecosystems aquatic animals, manag..."
2,LEX-FAOC035623,http://extwprlegs1.fao.org/docs/texts/per35623...,Resolución Nº 016/02/CONAM - Crea el Grupo Téc...,2002,"Environment gen., Waste & hazardous substances","Institution, Hazardous substances, Pesticides,...",La presente Resolución crea el Grupo Técnico d...,0.0,0.0,"environment gen., wast & hazardous substanc in..."
3,LEX-FAOC137450,http://extwprlegs1.fao.org/docs/pdf/chi137450.pdf,Decreto Supremo Nº 66 - Aprueba reglamento que...,2013,Land & soil,"Agricultural land, Land tenure, Traditional ri...",En virtud del presente Decreto se aprueba el r...,0.0,0.0,"land & soil agricultural land, land tenure, tr..."
4,LEX-FAOC134825,http://extwprlegs1.fao.org/docs/pdf/per134825.pdf,Resolución Nº 027/14/MINAGRI/SENASA/DSV - Requ...,2014,Cultivated plants,"Plant protection, Pests/diseases, Planting mat...",La presente Resolución establece los requisito...,0.0,0.0,"cultivat plants plant protection, pests/diseas..."


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# New corpus
corpus = clean_docs['Desc']
#Initializing TFIDF vectorizer to conver the raw corpus to a matrix of TFIDF features 
# and also enabling the removal of stopwords
vectorizer = TfidfVectorizer(stop_words=stopwords_spanish)

In [20]:
#Creating TFIDF features sparse matrix by fitting it on the specified corpus 
tfidf_matrix = vectorizer.fit_transform(corpus).todense()

#Grabbing the name of the features.
tfidf_names = vectorizer.get_feature_names()

In [21]:
print("Number of TF-IDF Features: ", tfidf_matrix.shape[1])

# There are  34,617 columns that will be used for training the classifier
# These are much smaller than the total number of unique vocabulary words
# (79,262) that previously calculated

# With only the abstract
# 498 columns vs 6239 unique vocabulary words 

Number of TF-IDF Features:  498


In [22]:
training_time_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}
prediction_time_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}

accuracy_container={'b_naive_bayes':0,'mn_naive_bayes':0,'random_forest':0,'linear_svm':0}

### Classifiers

First, split our existing dataset into training and test data.


In [54]:
from time import time

import sklearn.metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import classification_report


In [74]:
# Split dataset into training and test data (70-30 ratio)

#considering the TFIDF features as the input
variables = tfidf_matrix
#labels for the classifier
labels = clean_docs.IS_RELEVANT
#splitting the data
var_train, var_test, labels_train, labels_test = train_test_split(variables, labels, test_size=.3)

In [75]:
#analyzing the shape of the training and test data-set:
print('Shape of Training Data: '+str(var_train.shape))
print('Shape of Test Data: '+str(var_test.shape))

Shape of Training Data: (425, 498)
Shape of Test Data: (183, 498)


#### Logistic Regression


In [76]:
#Define classifier
log_reg = LogisticRegression()
# Train classifier
log_reg = log_reg.fit(var_train, labels_train)
# Predictions
log_predictions = log_reg.predict(var_test)

In [81]:
#Metrics
print("***** Logistic Regression *****")
print("Accuracy Score: %f"%sklearn.metrics.accuracy_score(labels_test, log_predictions))
print("Recall Score: %f"%sklearn.metrics.recall_score(labels_test, log_predictions))
print("Confusion Matrix output: ")
con_mat = pd.DataFrame(sklearn.metrics.confusion_matrix(labels_test, log_predictions))
con_mat

***** Logistic Regression *****
Accuracy Score: 0.852459
Recall Score: 0.384615
Confusion Matrix output: 


Unnamed: 0,0,1
0,141,3
1,24,15


In [84]:
classifier = LogisticRegression()

# Hyperparameters
hyper_param_grid = {'penalty': ['l1', 'l2', 'elasticnet', 'none'], 
                   }

# Gid search
grid_search = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'recall',
                           cv = 10, 
                           n_jobs = -1,
                           verbose = 3)
grid_search.fit(var_train, labels_train)

# Best score
print('Best score:', grid_search.best_score_ * 100)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best score: 59.22222222222222


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    1.6s finished


In [85]:
# Predictions
pred_grid_search = grid_search.predict(var_test)

print ("Accuracy Score: ", sklearn.metrics.accuracy_score(labels_test, pred_grid_search))
print ("Recall Score: ", sklearn.metrics.recall_score(labels_test, pred_grid_search))
print("Confusion Matrix: ")
print(sklearn.metrics.confusion_matrix(labels_test, pred_grid_search))

Accuracy Score:  0.7978142076502732
Recall Score:  0.5128205128205128
Confusion Matrix: 
[[126  18]
 [ 19  20]]


#### Naive Bayes

Naive Bayes is one of the most widely used classification algorithm in text mining applications. The main assumption is that all the features are independent of each other. The condition of independence may not be valid in many circumstances but as a base line model, its a good starting point.

Naive Bayes uses the probabilities of each attribute belonging to each class to make a prediction. There are two forms of Naive Bayes:

   1. Bernoulli, designed for boolean/binary features i.e. just considers the presence or absense of a feature.
   2. Multinomial, which also considers the occurrence counts of the feature.

We will apply both and then will assess their respective accuracy scores.

In [94]:
classifier = BernoulliNB()

# Hyperparameters
hyper_param_grid = {'alpha': [0.1, 0.25, 0.5, 0.75, 1],
                    'fit_prior':['False', 'True']
                   }

# Gid search
grid_search = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'recall',
                           cv = 10, 
                           n_jobs = -1,
                           verbose = 3)
grid_search.fit(var_train, labels_train)

# Best score
print('Best score:', grid_search.best_score_ * 100)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.7s


Best score: 81.66666666666667


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.6s finished


In [95]:
# Predictions
pred_grid_search = grid_search.predict(var_test)

print ("Accuracy Score: ", sklearn.metrics.accuracy_score(labels_test, pred_grid_search))
print ("Recall Score: ", sklearn.metrics.recall_score(labels_test, pred_grid_search))
print("Confusion Matrix: ")
print(sklearn.metrics.confusion_matrix(labels_test, pred_grid_search))
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test, pred_grid_search))

Accuracy Score:  0.7814207650273224
Recall Score:  0.7948717948717948
Confusion Matrix: 
[[112  32]
 [  8  31]]
Classification Metrics: 
              precision    recall  f1-score   support

         0.0       0.93      0.78      0.85       144
         1.0       0.49      0.79      0.61        39

    accuracy                           0.78       183
   macro avg       0.71      0.79      0.73       183
weighted avg       0.84      0.78      0.80       183



#### Multinomial Naive Bayes

Bernoulli Naive Bayes only considers whether a feature is present or not. However, if we also take into account the occurrence weight or count of the feature as well (in our case, the TF-IDF weight of each feature), we can hypothesize that the performance of such classifier will be equally good, if not better. That is assumption for the Multi-nomial Naive Bayes.

In [96]:
classifier = MultinomialNB()

# Hyperparameters
hyper_param_grid = {'alpha': [0.1, 0.25, 0.5, 0.75, 1],
                    'fit_prior':['False', 'True']
                   }

# Gid search
grid_search = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'recall',
                           cv = 10, 
                           n_jobs = -1,
                           verbose = 3)
grid_search.fit(var_train, labels_train)

# Best score
print('Best score:', grid_search.best_score_ * 100)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.6s


Best score: 66.44444444444444


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.5s finished


In [97]:
# Predictions
pred_grid_search = grid_search.predict(var_test)

print ("Accuracy Score: ", sklearn.metrics.accuracy_score(labels_test, pred_grid_search))
print ("Recall Score: ", sklearn.metrics.recall_score(labels_test, pred_grid_search))
print("Confusion Matrix: ")
print(sklearn.metrics.confusion_matrix(labels_test, pred_grid_search))
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test, pred_grid_search))

Accuracy Score:  0.8469945355191257
Recall Score:  0.717948717948718
Confusion Matrix: 
[[127  17]
 [ 11  28]]
Classification Metrics: 
              precision    recall  f1-score   support

         0.0       0.92      0.88      0.90       144
         1.0       0.62      0.72      0.67        39

    accuracy                           0.85       183
   macro avg       0.77      0.80      0.78       183
weighted avg       0.86      0.85      0.85       183



### Random Forest Classifier

In Random Forest, a subset of the training data is fit on a number of decision trees. Random Forests have the characteristic to minimize variance if its there in the data-set. 

In [98]:
classifier = RandomForestClassifier()

# Hyperparameters
hyper_param_grid = {'n_estimators': [100, 200], 
                     'max_depth': [5,10,20,50,100], 
                     'max_features': ['sqrt','log2'],
                     'min_samples_split': [2,5,10,20,50]}

# Gid search
grid_search = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'recall',
                           cv = 10, 
                           n_jobs = -1,
                           verbose = 3)
grid_search.fit(var_train, labels_train)

# Best params
print(grid_search.best_params_)

# Best score
print('Valor de la mejor métrica para el modelo de Random Forest:', grid_search.best_score_ * 100)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  5.4min finished


Valor de la mejor métrica para el modelo de Random Forest: 50.11111111111111


In [99]:
pred_grid_search = grid_search.predict(var_test)

print ("Accuracy Score of Random Forests Classifier: ", sklearn.metrics.accuracy_score(labels_test, pred_grid_search))
print ("Recall Score of Random Forests Classifier: ", sklearn.metrics.recall_score(labels_test, pred_grid_search))
print("Confusion Matrix: ")
print(sklearn.metrics.confusion_matrix(labels_test, pred_grid_search))

Accuracy Score of Random Forests Classifier:  0.8360655737704918
Recall Score of Random Forests Classifier:  0.4358974358974359
Confusion Matrix: 
[[136   8]
 [ 22  17]]
