In [4]:
%load_ext autoreload
%autoreload 2

from build_events import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# --- spaCy pipeline ---------------------------------------------------------
# Configured to return `doc` output with the trigger and geological matchers
# enabled and triggers lemmatized; the general lemmatizer, stopword removal and
# punctuation removal are all switched off.
nlp = load_spacy_model(
    output_type='doc',
    trigger_matcher=True,
    lemmatizer=False,
    geological_matcher=True,
    stopword_removal=False,
    punctuation_removal=False,
    lemmatize_triggers=True,
    verbose=False,
)

# --- report files and geoview metadata --------------------------------------
capstone_files, files = get_report_data(count_sentences=True, return_files=True)

# Only the metadata columns used in this notebook are read; report_year is
# parsed as a date.
metadata = pd.read_csv(
    'data/geoview/capstone_metadata.zip',
    compression='zip',
    parse_dates=['report_year'],
    usecols=['anumber','title','report_type','project','keywords','commodity','report_year'],
)

geoview = gpd.read_file('zip://data/geoview/capstone_shapefiles.shp.zip')

# --- labelled datasets ------------------------------------------------------
# One labelled CSV per labeller; the `idx` column is renamed to `sentence_idx`.
users = ('daniel','charlie')
dataset = {}
for user in users:
    dataset[user] = pd.read_csv(f'data/labels/{user}_dataset.csv', index_col=0).rename(
        columns={'idx': 'sentence_idx'})

# Confidence levels to build events for, and whether to group all labelled events.
confs = ('low','medium','high',)
group_all_labelled = False

# Report the number of rows flagged as reviewed for each labeller.
for user in users:
    user_labels = dataset[user]
    print(f'{len(user_labels.loc[user_labels.reviewed])} events labelled by {user}.')

Loading files as dict: 100%|██████████| 32646/32646 [00:05<00:00, 5715.21it/s] 


251 events labelled by daniel.
1008 events labelled by charlie.


In [6]:
# Build the event table for every confidence level.
# Note: this will not load group-labelled events.
events = {
    conf: build_event_data(
        dataset,
        confidence=conf,
        files=files,
        nlp=nlp,
        capstone_files=capstone_files,
        geoview=geoview,
        return_entities=True,
        group_all_labelled=group_all_labelled,
    )
    for conf in confs
}

# For each confidence level:
#   1. start from the geoview GeoDataFrame (so the merged result keeps the
#      GeoDataFrame type needed for the plotly map) and join, in order, the
#      geoview metadata, the capstone json-to-anumber mapping, and the summed
#      `label` per filename from the events table;
#   2. expand the ';'-separated commodity string into a list;
#   3. one-hot encode those lists, indexed like the merged frame.
df = {}
commodities = {}
for conf in confs:
    label_totals = events[conf].groupby('filename')['label'].sum().reset_index()

    merged = geoview.merge(metadata, on='anumber')
    merged = merged.merge(capstone_files, on='anumber')
    merged = merged.merge(label_totals, on='filename')

    # expand the separator-delimited commodity string to a list
    merged['commodity_list'] = merged['commodity'].apply(
        lambda x: to_list(x, sep=';', default='NO TARGET COMMODITY'))
    df[conf] = merged

    mlb = MultiLabelBinarizer()
    one_hot = mlb.fit_transform(merged['commodity_list'])
    commodities[conf] = pd.DataFrame(one_hot, columns=mlb.classes_, index=merged.index)


Extracting low confidence events: 1212it [00:26, 45.06it/s] 
Extracting medium confidence events: 1212it [00:25, 47.20it/s] 
Extracting high confidence events: 1212it [00:26, 45.20it/s] 


In [7]:
# Duplicate check: for each confidence level, show the rows whose event_id
# appears more than once but whose (event_id, label) pair is not repeated,
# i.e. a repeated event_id carrying a label that no other row with that id has.
# An empty table means no such rows were found.
report_columns = ['event_id', 'filename', 'sentence_idx', 'label']
for conf in confs:
    conf_events = events[conf]
    repeated_id = conf_events[['event_id']].duplicated(keep=False)
    repeated_id_and_label = conf_events[['event_id', 'label']].duplicated(keep=False)
    conflicting = conf_events.loc[repeated_id & ~repeated_id_and_label]
    display(conflicting.sort_values('event_id')[report_columns])

Unnamed: 0,event_id,filename,sentence_idx,label


Unnamed: 0,event_id,filename,sentence_idx,label


Unnamed: 0,event_id,filename,sentence_idx,label


In [8]:
# Write the events table and the commodity one-hot encoding for every
# confidence level to CSV. When all labelled events are grouped, the files get
# the "-extra" suffix so they do not overwrite the default exports.
for conf in confs:
    if group_all_labelled:
        events_path = f'data/events/events_{conf}-conf-extra.csv'
        commodities_path = f'data/events/commodities_{conf}-conf-extra.csv'
    else:
        events_path = f'data/events/events_{conf}-conf.csv'
        commodities_path = f'data/events/commodities_{conf}-conf.csv'

    events[conf].to_csv(events_path)
    commodities[conf].to_csv(commodities_path)