## Header 
Author : Amina Matt and Yichen Wang  
Date created : 14.10.2021  
Date last modified : 21.11.2021  
Python version : 3.8  
Description : Text processing of the CARICOM Compilation Archive (CCA) https://louverture.ch/cca/ 



# To Do List
- [X] check number items
- [X] to JSON 
- [ ] JSON person cleaning
- [ ] JSON location cleaning
- [ ] save NER 

# Initialization

In [None]:
# -*- coding: utf-8 -*-

import nltk #Natural Language Toolkit is a natural language programming library
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
import pandas as pd
from nltk import pos_tag
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from nltk.chunk import conlltags2tree
from nltk.tree import Tree
import random

# Text separation into items 
In the primary text source, each item is separated by a return and the '=>' starting string. Each item references a different actor of colonial entreprise. Separating each of them into items helps us to differentiate the extraction depending on the scheme they follow.

In [None]:
#Input: path for the .txt file 
#Output: list of string, where each element is an item, i.e. a separate entry in the document of origin
#Requirements: -
#Description: separate the items based on the '=>' string that characterize a new entry
def divide_items(textFilePath):
    f = open(textFilePath,"r")
    item = []
    for line in f: 
        if (line != '\n'):
            if (line[0] == '=') and (line[1] == '>'):
                item_text = ''
                while (line != '\n'):
                    item_text = item_text + line
                    line = f.readline()
                item.append(item_text)
    f.close()
    return item 

In [None]:
text_items = divide_items(caricom)
items_total = len(text_items)
print(f'There are {len(text_items)} items in total.')

In [None]:
print(f'This is one text item:\n{text_items[random.randrange(len(text_items))]}.\n')

## Named Entities Recognition with NER Stanford 
The first objective is to extract information of interest from the text. In this case we are interested in person's names, locations and activities. The first step towards this goal is to use Named Entities Recognition to recognize which words contain the information we are looking for.

In [None]:
#Stanford NER 
NER_FOLDER = './NER-Standford/stanford-ner-2020-11-17'
CLASSIFIER_PATH = NER_FOLDER+'/classifiers/'
JAR_PATH = NER_FOLDER+'/stanford-ner.jar'

#classifiers
classifier_3 = 'english.all.3class.distsim.crf.ser.gz'#3 class model for recognizing locations, persons, and organizations
classifier_4 = 'english.conll.4class.distsim.crf.ser.gz'#4 class model for recognizing locations, persons, organizations, and miscellaneous entities
classifier_7 = 'english.muc.7class.distsim.crf.ser.gz' #7 class model for recognizing locations, persons, organizations, times, money, percents, and dates

st = StanfordNERTagger(CLASSIFIER_PATH+classifier_7, JAR_PATH, encoding='utf-8')

#Text retrieving
DATA_FOLDER = './data/'
caricom_sample = DATA_FOLDER +'Caricom_Archive_Sample_Schema1.txt'
caricom = DATA_FOLDER +'Caricom_Archive.txt'

#Extracting named-entities
text = open(caricom_sample, 'r').read()
tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)

print(classified_text)

At this point the whole text is tagged. However the entities aren't grouped together. For example, a person full name is separate into two tuples.

## BIO tagging for readable Named Entities (i.e. regrouped NE)

[BIO](https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging)) tags are a way to regroup tokens, to make the output more readable. 
A person name with first and last name should be regroup by assigning  
 -B to the beginning of named entities  
 -I assigned to inside  
 -O assigned to other  
This is done by checking the tokens just before and after the one of interest.

In [None]:
# Function imported from 
# https://pythonprogramming.net/using-bio-tags-create-named-entity-lists/?completed=/testing-stanford-ner-taggers-for-speed/

# Tag tokens with standard NLP BIO tags
def bio_tagger(ne_tagged):
		bio_tagged = [] #empty list
		prev_tag = "O" #starting with a O tag
		for token, tag in ne_tagged:
			if tag == "O": #O
				bio_tagged.append((token, tag))
				prev_tag = tag
				continue
			if tag != "O" and prev_tag == "O": # Begin NE
				bio_tagged.append((token, "B-"+tag))
				prev_tag = tag
			elif prev_tag != "O" and prev_tag == tag: # Inside NE
				bio_tagged.append((token, "I-"+tag))
				prev_tag = tag
			elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
				bio_tagged.append((token, "B-"+tag))
				prev_tag = tag
		return bio_tagged

In [None]:
bio_text = bio_tagger(classified_text)
bio_text

Using the BIO tags we can recreate a tokens list with regrouped/readable named entities. 

In [None]:
# Function imported from 
# https://pythonprogramming.net/using-bio-tags-create-named-entity-lists/?completed=/testing-stanford-ner-taggers-for-speed/

# Create tree       
def stanford_tree(bio_tagged):
	tokens_raw, ne_tags = zip(*bio_tagged)
	tokens = [word for word in tokens_raw if word]
	pos_tags = [pos for token, pos in pos_tag(tokens)]

	conlltags = [(token, pos, ne) for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
	ne_tree = conlltags2tree(conlltags) #from BIO to tree format
	return ne_tree

In [None]:
tree_text = stanford_tree(bio_text)
tree_text

In [None]:
# Function imported from 
# https://pythonprogramming.net/using-bio-tags-create-named-entity-lists/?completed=/testing-stanford-ner-taggers-for-speed/

# Parse named entities from tree
def structure_ne(ne_tree):
	ne = []
	for subtree in ne_tree:
		if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
			ne_label = subtree.label()
			ne_string = " ".join([token for token, pos in subtree.leaves()])
			ne.append((ne_string, ne_label))
		else:
			ne_label = 'O'
			ne_string = subtree[0]
			ne.append((ne_string, ne_label))           
	return ne

In [None]:
clean_ne = structure_ne(tree_text)
clean_ne

In [None]:
def ner_text(text):
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    bio_text = bio_tagger(classified_text)
    tree_text = stanford_tree(bio_text)
    ner_item = structure_ne(tree_text)
    return ner_item

# From NE tree to JSON

The structure NE list for each text is transformed into an entry in a dataframe. The goal is to have for each sample of text an entry with the *relevant* informations.  
The difficult part is to sort the relevant informations. Which of the persons is the one of interest? Which location is the location where the organization or the person was involved? Which dates are the dates of interest? 
Here we deal only with the transformation.

## Use schema 1 **(*name* (date) from *origin*)** to retrieve JSON names, origins and dates attributes in the text item.

In [44]:
#Input:
#Output: 
#Requirements: 
#Description: 
def is_date(dateString):
    return any(s.isdigit() for s in dateString)
#Works for (1731-1820)

In [47]:
#Input: item is a single entry from text source 1 with NER tags (characterized by the '=>' starting string)
#Output: True is the text is structured as schema 1, False otherwise
#Requirements: is_date() function
#Description: Test if the first elements of a text match the schema 1. Namely, does the first words match the  **Name** (*date*) from *city* pattern.
def schema1_test(item):
    tags = [x[1] for x in item]
    text_middle= [x[0] for x in item]
    #start and end of piece of interest
    schema1 = False
    try:
        person_Index = tags.index('PERSON')
    except ValueError:
        person_Index = 1 #default
        print("List does not contain value")
    try: 
        location_Index = tags.index('LOCATION')
    except ValueError:
        print("List does not contain value")
        location_Index = 0 #default
    if person_Index < location_Index:
        ner_middle = item[person_Index+1:location_Index-1]
    #digit test
    digit_test = any(x.isdigit() for x in text_middle)
    #parenthesis test
    if digit_test :
        schema1 = ('(' and ')') in text_middle#parenthesis test

    return schema1

In [48]:
#Function test
schema1_test(ner_text(text_items[80]))

True

## Compute how many items follow the **(*name* (date) from *origin*)** schema (schema 1)

In [84]:
ner_items = []
for item in text_items:
    ner_item = ner_text(item)
    ner_items.append(ner_item)
len(ner_items)

464

In [None]:
s1_items= []
i = 0
s1 = 0
for item in ner_items:
    #print(ner_item[0:8])
    #print(str(i)+'\n')
    i = i+1
    if schema1_test(item):
        s1 = s1 + 1
        print(f'Total schema1 found: {s1}')
        s1_items.append(ner_item)

In [86]:
s1_tot = len(s1_items)
print(f'With the new function we found {s1_tot} items following schema 1.\n\nOne example is : {s1_items[6]}\n')

With the new function we found 322 items following schema 1.

One example is : [('=', 'O'), ('>', 'O'), ('Johann Viktor Travers von Ortenstein ( 1721–1776 )', 'PERSON'), (',', 'O'), ('of', 'O'), ('a', 'O'), ('noble', 'O'), ('family', 'O'), ('from', 'O'), ('TumeglDomleschg', 'O'), (',', 'O'), ('entered', 'O'), ('his', 'O'), ('father', 'O'), ('’', 'O'), ('s', 'O'), ('regiment', 'O'), ('in', 'O'), ('Valenciennes', 'LOCATION'), ('.', 'O'), ('After', 'O'), ('a', 'O'), ('military', 'O'), ('career', 'O'), ('in', 'O'), ('the', 'O'), ('Swiss', 'O'), ('Guards', 'O'), (',', 'O'), ('he', 'O'), ('became', 'O'), ('brigadier-general', 'O'), ('(', 'O'), ('1747', 'O'), (')', 'O'), (',', 'O'), ('marshal', 'O'), ('(', 'O'), ('1759', 'DATE'), (')', 'O'), ('and', 'O'), ('lieutenant-general', 'O'), ('(', 'O'), ('1762', 'O'), (')', 'O'), (',', 'O'), ('and', 'O'), ('was', 'O'), ('ennobled', 'O'), ('by', 'O'), ('Louis XVI', 'PERSON'), ('(', 'O'), ('«', 'O'), ('comte', 'O'), ('»', 'O'), (',', 'O'), ('1775', 'DA

In [87]:
perc_s1 = s1_tot/items_total*100
print(f'The amount of items following schema 1 are {perc_s1:2.0f}%.')

The amount of items following schema 1 are 69%.


### Additional items starting with date and not passing shcema 1 test

In [88]:
text_items[6]

'=> In 1677, Swiss medical doctor Felix Christian Spoerri (1615-1680) from Zurich wrote a detailed description of Barbados («Americanische Reiss-Beschreibung nach den Caribes Insslen, und Neu-Engelland»), which he had visited in 1661 and 1662, including the slavery economy, which produced sugar, tobacco, cotton, and indigo.\n'

In [89]:
date_items= []
c = 0
for item in ner_items:
    tag = item[3][1]
    if tag == 'DATE':
        c = c+1
        #print(f'items start with date : {c}')
        date_items.append(item)


In [90]:
print(f'Number of items starting with date, i.e. In 1781....: \n{len(date_items)}')

Number of items starting with date, i.e. In 1781....: 
101


In [None]:
additional = []
for i in date_items:
    if not schema1_test(i):
        c = c+1
        additional.append(i)
        print(f'start with date and not s1: {c}')

In [92]:
additional[0]

[('=', 'O'),
 ('>', 'O'),
 ('In', 'O'),
 ('August 1772', 'DATE'),
 (',', 'O'),
 ('Emanuel Correvon', 'ORGANIZATION'),
 ('(', 'O'),
 ('t', 'O'),
 (')', 'O'),
 ('from', 'O'),
 ('a', 'O'),
 ('Swiss', 'O'),
 ('family', 'O'),
 ('(', 'O'),
 ('either', 'O'),
 ('from', 'O'),
 ('Geneva', 'LOCATION'),
 ('or', 'O'),
 ('the', 'O'),
 ('Canton', 'LOCATION'),
 ('of', 'O'),
 ('BerneVaud', 'O'),
 (')', 'O'),
 ('left', 'O'),
 ('for', 'O'),
 ('Berbice', 'ORGANIZATION'),
 ('.', 'O'),
 ('He', 'O'),
 ('was', 'O'),
 ('in', 'O'),
 ('debt', 'O'),
 ('of', 'O'),
 ('f', 'O'),
 ('4,000', 'O'),
 ('guilders', 'O'),
 (',', 'O'),
 ('so', 'O'),
 ('he', 'O'),
 ('might', 'O'),
 ('have', 'O'),
 ('gone', 'O'),
 ('to', 'O'),
 ('Berbice', 'LOCATION'),
 ('as', 'O'),
 ('a', 'O'),
 ('soldier', 'O'),
 ('or', 'O'),
 ('a', 'O'),
 ('plantation', 'O'),
 ('obverseer', 'O'),
 ('.', 'O')]

In [93]:
print(f'Item starts with date that not pass schema 1 test: \n{len(additional)}')

Item starts with date that not pass schema 1 test: 
36


In [97]:
perc_withAdd = (s1_tot + len(additional))/items_total*100
print(f'The amount of items with additional test are {perc_withAdd:2.0f}%.')

The amount of items following schema 1 are 77%.


### Visual inspections of schema 1 items

In [None]:
#s1_items

## From NER to JSON

In [None]:
#Input: item is a single entry from text source 1 with NER tags (characterized by the '=>' starting string)
#Output: A JSON string with Person,Date,Location keys if is the text is structured as schema 1, None otherwise
#Requirements: is_date() function
#Description: Test if the first elements of a text match the schema 1. 
#Namely, does the first words match the  **Name** (*date*) from *city* pattern.
#If it matches schema1 it returns a dictionary 
def schema1_JSON(item):
    s1item_JSON = None
    tags = [x[1] for x in item]
    text = [x[0] for x in item]
    
    #start and end of piece of interest, i.e. 'PERSON'.....'LOCATION'
    schema1 = False
    try:
        person_Index = tags.index('PERSON')
    except ValueError:
        person_Index = -1 #default
        print("Item does not contain a PERSON value")
   
    try: 
        location_Index = tags.index('LOCATION')
    except ValueError:
        print("Item does not contain a LOCATION value")
        location_Index = -1 #default
   
    #If there are PERSON and LOCATION values, with PERSON first we continue the schema1 test
    if person_Index < location_Index and person_Index > 0 and location_Index > 0 :
        #define part in between PER and LOC tags
        ner_middle = item[person_Index+1:location_Index-1]
        text_middle = [x[0] for x in ner_middle]
        
        #parenthesis test
        try:
            par1_Index = text_middle.index('(')
        except ValueError:
            par1_Index = -1 #default
        print("par 1 index" + str(par1_Index))
              
        try:
            par2_Index = text_middle.index(')')
        except ValueError:
            par2_Index = -1 #default
        print("par 2 index" + str(par2_Index))
    
        if par1_Index < par2_Index and par2_Index >= 0 and par1_Index >= 0 :
            date_par = text_middle[par1_Index+1:par2_Index]
            print('This is the text in between parenthesis ' +str(date_par))
            #digit test
            digit_test = any(x.isdigit() for x in str(date_par))
            print('The digit test results : '+str(digit_test))
            #Save informations from schema 1
            if digit_test :
                
                #retrieve date
                date = ''
                date_split = str(date_par).split('–')
                for x in str(date_split):
                    if x.isdigit():
                        date = date +' '+ x
                print('The retrieved date is ' + date)
                #retrieve location and person values
                location = item[location_Index][0]
                person = item[person_Index][0]
        
            #Create a JSON dictionary
                s1item_JSON = {
                    'person' : person,
                    'date': date,
                    'location': location,
                #'field':NA
                }
    return s1item_JSON

In [None]:
#Function test
n = 30 
print(text_items[n])
schema1_JSON(ner_text(text_items[n]))

In [None]:
any(x.isdigit() for x in '1685–1740')
'1685–1740'.split('–')

## Create JSON from schema 1 items

In [None]:
s1_JSON= []
i = 0
s1 = 0
for item in text_items:
    ner_item = ner_text(item)
    print(ner_item[0:8])
    #print(str(i)+'\n')
    i = i+1
    json = schema1_JSON(ner_item)
    print(json)
    s1_JSON.append(json)

In [None]:
s1_JSON_clean = [] 
for val in s1_JSON:
    if val != None :
        s1_JSON_clean.append(val)

In [None]:
s1_JSON_clean

Exceptions, errors etc..

In [None]:
#some have '>' as person 
len(s1_JSON_clean)
#s1_JSON_clean[1]

#some have 'Canton' or 'City' instead of location

## Remove duplicate
If some entries have the samed person we need to merge or remove one of the entry.

In [None]:
s1_items[10]


## Get location when mentioned further to deal with : from the city of...

## Use section name to retrieve JSON colonial location attribute

To do 

## Use predefined categories to retrieve the JSON type attribute 

To do 

# Scratch

### Old version of schema 1 test
This version is outdated. To restrictive it gets only 18 items.

In [None]:
#Input: item is a single entry from text source 1 with NER tags (characterized by the '=>' starting string)
#Output: True is the text is structured as schema 1, False otherwise
#Requirements: is_date() function
#Description: Test if the first elements of a text match the schema 1. Namely, does the first words match the  **Name** (*date*) from *city* pattern.
def schema1_test(item): 
    testValue = (item[2][1] == ('PERSON' or 'ORGANIZATION)')) and (item[3][0] == '(') and (is_date(item[4][0]) == True) and (item[5][0] == ')') and (item[6][0] == 'from') and (item[7][1] == 'LOCATION')
    return testValue

schema1_test(clean_ne)

What about multiple persons in a paragraph?
    -> one ID per person with same organization groups etc...

In [None]:
dataSet = pd.DataFrame({
                     'id':[],
                     'person':[],
                     'location':[],
                     'period':[],})
dataSet.

In [None]:
person_list = []

for ent in tokens.ents:
    if ent.label_ == 'PERSON':
        person_list.append(ent.text)
        
person_counts = Counter(person_list).most_common(20)
df_person = pd.DataFrame(person_counts, columns =['text', 'count'])

In [None]:
len(classified_text)

In [None]:
json