## Header 
Author : Amina Matt and Yichen Wang  
Date created : 20.12.2021  
Date last modified : 20.12.2021  
Python version : 3.8  
Description : Text processing of the CARICOM Compilation Archive (CCA) https://louverture.ch/cca/ 

### Libraries

In [3]:
# -*- coding: utf-8 -*-
import pandas as pd
import json
import math #for isnan
from pandas.io.json import json_normalize

### Initialization 

In [4]:
# Input locations: folder holding the raw text files, then the two archive files
DATA_FOLDER = './data/'
caricom_sample = f'{DATA_FOLDER}Caricom_Archive_Sample_Schema1.txt'
caricom = f'{DATA_FOLDER}Caricom_Archive.txt'

### Load 

In [5]:
# Read the pickled dataframe and preview its first rows.
# NOTE: unpickling executes arbitrary code; only load pickle files you generated yourself.
geoid_pickle_path = "./generated_data/caricom_with_geoid.pkl"
df = pd.read_pickle(geoid_pickle_path)
df.head()

Unnamed: 0,person,date,origin,colonial_Location,confidence_date,confidence_person,confidence_origin,activities,whole_entry,tmp_colonial,col_loc_geonameid,col_loc_geo_name,col_capital,col_latitude,col_longitude,origin_as_found,origin_loc_geonameid,origin_geo_name,origin_latitude,origin_longitude
0,Arthur Thellusson,,Geneva,Antigua and Barbuda,0.0,25.0,100.0,[slave owner],"=> Arthur Thellusson, son of Lord Rendlesham a...",Antigua and Barbuda,3576396,Antigua and Barbuda,St. John's,17.12096,-61.84329,Geneva,2660650.0,Genève,46.20222,6.14569
1,Peter Thelluson,1767.0,Geneva,Barbados,33.33,100.0,100.0,[slave owner],"=> In 1767, Peter Thelluson (1737-1797), a Swi...",Barbados,3374084,Barbados,Bridgetown,13.10732,-59.62021,Geneva,2660650.0,Genève,46.20222,6.14569
2,JeanAntoine Bertrand,,Geneva,Dominica,0.0,25.0,100.0,"[slave owner, plantation owner, trading]",=> Jean-Antoine Bertrand (1726-1780) from the ...,Dominica,3575830,Dominica,Roseau,15.30174,-61.38808,Geneva,2660650.0,Genève,46.20222,6.14569
3,Peter Thelluson,1768.0,Geneva,Grenada,10.0,33.33,100.0,"[slave owner, plantation owner, trading]","=> In 1768, Peter Thelluson (1737-1797), a Swi...",Grenada,3580239,Grenada,St. George's,12.05288,-61.75226,Geneva,2660650.0,Genève,46.20222,6.14569
4,Jan Vincent Mittelholzer,1760.0,Geneva,Guyana,11.11,12.5,50.0,"[slave owner, plantation owner, trading]","=> Around 1760, the sugar plantation De Vreede...",Guyana,3378535,Guyana,Georgetown,6.80448,-58.15527,Geneva,2660650.0,Genève,46.20222,6.14569


### Functions

In [6]:
def add_coordinates(col_lat, col_lon, or_lat, or_lon, feature=None):
    '''
    Describe: store a two-point line in the geometry of a geojson feature

    The four values are written unchanged, in argument order, as
    [[col_lat, col_lon], [or_lat, or_lon]]. Nothing is reordered here, so the
    caller decides the axis order (GeoJSON positions are [longitude, latitude]).

    feature: dict with a 'geometry' dict to fill in place. When None, the
             module-level geojson_structure is used.
    Return: the dict that was modified (same object, not a copy)
    '''
    # Fall back on the global so that existing four-argument calls keep working
    target = geojson_structure if feature is None else feature
    target['geometry']['coordinates'] = [[col_lat, col_lon], [or_lat, or_lon]]
    return target

### Dataframe processing
#### Create a list to which a new GeoJSON feature is appended for each dataframe entry

In [8]:
# Number of rows skipped because a coordinate is missing.
# NOTE: `iter` shadows the builtin; the name is kept because the summary cell prints it.
iter = 0
# features built from the rows that have all four coordinates
geojson_with_coo_list = []


def _json_safe(value):
    '''
    Describe: replace a missing float (NaN) by None so that it is written as JSON null
    '''
    # json.dump would otherwise emit the bare token NaN, which is not valid JSON
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


# loop on dataframe
for _, entry in df.iterrows():
    # get entry coordinates
    or_lat = entry['origin_latitude']
    or_lon = entry['origin_longitude']
    col_lat = entry['col_latitude']
    col_lon = entry['col_longitude']

    # no line can be drawn if any end point is missing
    if any(math.isnan(value) for value in (or_lat, or_lon, col_lat, col_lon)):
        iter += 1
        continue

    # Feature for this row. It must be bound to the module-level name
    # geojson_structure, because add_coordinates fills that global in place.
    geojson_structure = {
        "type": "Feature",
        "properties": {
            "person": _json_safe(entry['person']),
            "date": _json_safe(entry['date']),
            "origin": _json_safe(entry['origin']),
            "colonial_location": _json_safe(entry['colonial_Location']),
            "activities": _json_safe(entry['activities']),
            "full_entry": _json_safe(entry['whole_entry']),
            "confidence_date": _json_safe(entry['confidence_date']),
            "confidence_person": _json_safe(entry['confidence_person']),
            "confidence_origin": _json_safe(entry['confidence_origin']),
        },
        "geometry": {
            "type": "LineString",
            "coordinates": [],
        },
    }

    # Line from origin to colonial location, each point given as (longitude, latitude)
    geojson_with_coo = add_coordinates(or_lon, or_lat, col_lon, col_lat)

    # add to list
    geojson_with_coo_list.append(geojson_with_coo)

In [9]:
# Summary of the loop above: features kept, rows skipped (counter `iter`), total rows
print(f'We have {len(geojson_with_coo_list)} entries for the geojsons, {iter} were dropped from the initial dataset of length {len(df)} because they had no geographical coordinates.')

We have 106 entries for the geojsons, 221 were dropped from the initial dataset of length 327 because they had no geographical coordinates.


### JSON for the collection

In [10]:
# Wrap every feature of the list in a single FeatureCollection object
overall_json = {
    "type": "FeatureCollection",
    "features": geojson_with_coo_list,
}

### Dump GeoJSON

In [11]:
# Write the collection to disk. The context manager guarantees that the file
# is flushed and closed once the dump is done, even if json.dump raises.
with open("./generated_data/lines.json", "w", encoding="utf-8") as a_file:
    json.dump(overall_json, a_file)

## Assessment 

It is interesting to understand why some of the data have no geographical coordinates and thus cannot be visualized.
In this case, an entry cannot be visualized if it has no origin location.

The following list contains all the origins for which we were not able to retrieve geographical information.

In [12]:
#df.head()

In [13]:
# 'origin' values of the rows whose 'origin_as_found' is missing
origin_not_found = df['origin_as_found'].isna()
no_geo_inf = df.loc[origin_not_found, 'origin']
no_geo_inf

16                None
17                None
18                None
19                None
20                None
            ...       
322            Germany
323            Neuthal
324            Rümlang
325       Lichtensteig
326    TumeglDomleschg
Name: origin, Length: 221, dtype: object

In [14]:
# Distinct origin values among the rows selected in no_geo_inf
no_geo_inf.unique()

array([None, 'Saint-Aubin', 'Bournens', 'Bourmens', 'Echallens',
       'Obersimmental', 'Bâle', '', 'Noraz', 'Le Locle', 'Rehetobel',
       'Brazil', 'Morges', 'Ropraz', 'Gourgy', 'Africa', 'Lelienburg',
       'Bürglen', 'Burgdorf', 'Thurgau', 'Treytorrens', 'Speicher',
       'Walenstadt', 'La Tour-de-Peilz', 'Lutry', 'Murten', 'Switzerland',
       'La Rochelle', 'Versoix', 'Sonvillier', 'Schftland',
       'Saint-Domingue', 'Hunziker', 'Solothurn', 'Aargau', 'Dornach',
       'Graubünden', 'Jamaica', 'Rougement', 'Mtier', 'Bischofszell',
       'Unterseen BE', 'Couvet', 'Nantes', 'Zofingen', 'Klosters',
       'Saint-Saphorin', 'Saint-Lgier-La Chisaz', 'Saint-Sulpice',
       'La Cluse', 'Schwyz', 'Vendlincourt', 'Lenzburg', 'Avenches',
       'Martinique', 'Guttannen', 'North Carolina', 'South Carolina',
       'Bilten', 'Tenessee', 'Henau', 'BerneVaud', 'Frschels', 'Aa',
       'Benken', 'Moudon', 'Java', 'Celigny', 'Soglio', 'Germany',
       'Neuthal', 'Rümlang', 'Lichtensteig', 'TumeglDomleschg'],
      dtype=object)

In [15]:
# Number of rows in no_geo_inf and number of distinct values among them
print(f'There are {len(no_geo_inf)} origin with no geographic information, which represents {len(no_geo_inf.unique())} different locations.')

There are 221 origin with no geographic information, which represents 74 different locations.


In [16]:
# Count the rows whose 'origin' is the empty string
originEmpty = int((df['origin'] == '').sum())

In [17]:
# Compare the rows without 'origin_as_found' with the rows whose 'origin' is an empty string
print(f'On this {len(no_geo_inf)} entries without geographical coordinates, {originEmpty} were not retrieved to start with')

On this 221 entries without geographical coordinates, 9 were not retrieved to start with


## Further work 

### Thresholds

In [18]:
# Cast the three confidence columns to float, value by value
for column in ('confidence_date', 'confidence_origin', 'confidence_person'):
    df[column] = df[column].apply(float)

In [19]:
# Mean confidence level of each of the three confidence columns
mean_conf_date = df['confidence_date'].mean()
mean_conf_origin = df['confidence_origin'].mean()
mean_conf_person = df['confidence_person'].mean()

print(f'The average values for confidence level are the following : \n For date :   {mean_conf_date}\n For person : {mean_conf_person} \n For origin : {mean_conf_origin}')

The average values for confidence level are the following : 
 For date :   15.276452599388382
 For person : 51.8868501529052 
 For origin : 52.0488379204893
