### This notebook provides functions that take annotated CoNLL-2002 input in order to:
1. count the number of NE annotations
2. count the occurrences of each existing NE label and of multi-token annotations
3. check which documents contain B-null tags, i.e. tokens that were selected as an NE in INCEpTION but whose NE type was not specified
4. concatenate the input files and create a new file

In [None]:
import pandas as pd
import numpy as np
import glob
import re
import os
import csv

### 1.1 COUNTING NEs

In [2]:
def count_NE(filepath):
    """Count the rows tagged with a "B-" label in CoNLL-style files.

    Every file matching the glob pattern ``filepath`` is read as a
    space-delimited, header-less table with the two columns token and NE tag.

    Args:
        filepath: glob pattern selecting the annotated input files.

    Returns:
        A dict containing
        * ``"annotations"``: number of rows whose tag contains "B-",
        * one entry per distinct such tag, mapping the tag to its row count,
        * ``"len"``: number of distinct tags that were counted.
        When the pattern matches no file, both counters are 0.
    """
    colnames = ["token", "NE"]
    frames = []
    for filename in glob.glob(filepath):
        df = pd.read_csv(
            filename,
            names=colnames,
            delimiter=" ",
            engine="python",
            encoding="utf-8",
            quoting=csv.QUOTE_NONE,
        )
        # Missing values are read as NaN; replace them with a blank string so
        # the string match below can be evaluated on every row.
        frames.append(df.replace(np.nan, " "))

    # pd.concat raises ValueError on an empty list, so "no input files" is
    # reported as zero counts instead of crashing.
    if not frames:
        return {"annotations": 0, "len": 0}

    tags = pd.concat(frames, ignore_index=True)["NE"].astype(str)
    begin_tags = tags[tags.str.contains("B-", regex=False)]

    counts = {"annotations": len(begin_tags)}
    # sort_index keeps the labels in alphabetical order inside the dict.
    for label, n in begin_tags.value_counts().sort_index().items():
        counts[label] = int(n)

    # Everything except the "annotations" entry is a distinct label.
    counts["len"] = len(counts) - 1
    return counts

### 1.2 COUNTING multi-token NEs

In [None]:
def count_I(filepath):
    """Count the rows tagged with an "I-" label in CoNLL-style files.

    Every file matching the glob pattern ``filepath`` is read as a
    space-delimited, header-less table with the two columns token and NE tag.
    Note that the count is per row (token) carrying an "I-" tag, not per
    multi-token entity.

    Args:
        filepath: glob pattern selecting the annotated input files.

    Returns:
        A dict containing
        * ``"annotations"``: number of rows whose tag contains "I-",
        * one entry per distinct such tag, mapping the tag to its row count,
        * ``"len"``: number of distinct tags that were counted.
        When the pattern matches no file, both counters are 0.
    """
    colnames = ["token", "NE"]
    frames = []
    for filename in glob.glob(filepath):
        df = pd.read_csv(
            filename,
            names=colnames,
            delimiter=" ",
            engine="python",
            encoding="utf-8",
            quoting=csv.QUOTE_NONE,
        )
        # Missing values are read as NaN; replace them with a blank string so
        # the string match below can be evaluated on every row.
        frames.append(df.replace(np.nan, " "))

    # pd.concat raises ValueError on an empty list, so "no input files" is
    # reported as zero counts instead of crashing.
    if not frames:
        return {"annotations": 0, "len": 0}

    tags = pd.concat(frames, ignore_index=True)["NE"].astype(str)
    inside_tags = tags[tags.str.contains("I-", regex=False)]

    counts = {"annotations": len(inside_tags)}
    # sort_index keeps the labels in alphabetical order inside the dict.
    for label, n in inside_tags.value_counts().sort_index().items():
        counts[label] = int(n)

    # Everything except the "annotations" entry is a distinct label.
    counts["len"] = len(counts) - 1
    return counts

### 1.3 TRACE B-null

In [None]:
def trace_null(filepath, *, doc_start=20, doc_end=-11):
    """List the "B-null" rows of every file matching a glob pattern.

    Each matching file is read as a space-delimited, header-less table with
    the two columns token and NE tag. Files are visited in sorted path order
    so the result is reproducible.

    Args:
        filepath: glob pattern selecting the annotated input files.
        doc_start: start of the slice taken from each file path to build the
            document name.
        doc_end: end of that slice (``None`` keeps everything up to the end
            of the path). The defaults reproduce the slice ``[20:-11]`` that
            used to be hard-coded, which only fits one particular directory
            prefix and file-name suffix; pass other bounds for other layouts.

    Returns:
        A list with one dict per file that contains at least one row tagged
        exactly "B-null". Each dict has
        * ``"doc"``: ``filename[doc_start:doc_end]``,
        * ``"tokens"``: DataFrame of the "B-null" rows, with both column
          labels renamed to the empty string.
        The list is empty when no file matches or no file has such rows.
    """
    colnames = ["token", "NE"]
    nulls = []
    for filename in sorted(glob.glob(filepath)):
        df = pd.read_csv(
            filename,
            names=colnames,
            delimiter=" ",
            encoding="utf-8",
            quoting=csv.QUOTE_NONE,
        )
        # Missing values are read as NaN; show them as a blank string.
        df = df.replace(np.nan, " ")
        null_rows = df.loc[df["NE"] == "B-null"]
        if null_rows.empty:
            continue
        nulls.append(
            {
                "doc": filename[doc_start:doc_end],
                # Both headers become "" (as in the original code).
                "tokens": null_rows.rename(columns={"token": "", "NE": ""}),
            }
        )
    return nulls

### 1.4 CONCATENATE & CREATE NEW FILE

In [None]:
def create_csv(filepath, csvfilename):
    """Concatenate all files matching a glob pattern into one new file.

    Each matching file is read as a space-delimited, header-less table with
    the two columns token and NE tag. The rows of all files are stacked in
    sorted path order and written to ``csvfilename`` space-separated, with a
    ``token NE`` header row and without an index column.

    Args:
        filepath: glob pattern selecting the annotated input files.
        csvfilename: path of the file to write.

    Raises:
        ValueError: if the pattern matches no file.
    """
    colnames = ["token", "NE"]
    frames = []
    # glob gives no ordering guarantee; sort so the output is reproducible.
    for filename in sorted(glob.glob(filepath)):
        df = pd.read_csv(
            filename,
            names=colnames,
            delimiter=" ",
            engine="python",
            encoding="utf-8",
            quoting=csv.QUOTE_NONE,
        )
        # Missing values are read as NaN; write them out as a blank string.
        frames.append(df.replace(np.nan, " "))

    if not frames:
        raise ValueError(f"no input files match pattern {filepath!r}")

    # Concatenate the frames as they are. Moving "token" into the index and
    # then concatenating with ignore_index=True would throw the tokens away
    # and leave only row numbers in the output.
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(csvfilename, sep=" ", index=False)