## Generate KGTK Person Abbreviations file
This notebook relates to [KGTK Issue 260](https://github.com/usc-isi-i2/kgtk/issues/260)

Example command to run the notebook using papermill:

```
papermill abbreviate_human_labels.ipynb abbr_output.ipynb -p data_folder /Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files \
                                                          -p wikidata_item_filename claims.wikibase-item.tsv.gz \
                                                          -p wikidata_label_filename labels.en.tsv.gz \
                                                          -p wikidata_alias_filename aliases.en.tsv.gz
```

In [1]:
# Parameters
# Default values for the notebook parameters; the papermill command shown above
# overrides them with -p.
# Folder that holds the input edge files; every file this notebook writes is created here too.
data_folder = '/Users/rijulvohra/Documents/work/Novartis-ISI/global_data_folder/kgtk_edge_files'
# Item-claims edge file inside data_folder (input to the P31/Q5 filter).
wikidata_item_filename = 'claims.wikibase-item.tsv.gz'
# Label edge file inside data_folder.
wikidata_label_filename = 'labels.en.tsv.gz'
# Alias edge file inside data_folder.
wikidata_alias_filename = 'aliases.en.tsv.gz'

In [None]:
import os
import pandas as pd
import gzip
import shutil
from collections import defaultdict
import json
import ast
import time

In [None]:
# Export the data folder as the FILE environment variable.
os.environ['FILE'] = data_folder

# Input files named by the notebook parameters.
wikibase_item_file, label_file, alias_file = [
    os.path.join(data_folder, input_name)
    for input_name in (wikidata_item_filename, wikidata_label_filename, wikidata_alias_filename)
]

# Intermediate files produced by the filtering and concatenation steps.
human_kgtk_edge, human_label_output, human_alias_output, human_label_alias_output = [
    os.path.join(data_folder, intermediate_name)
    for intermediate_name in (
        'human_kgtk_edge.tsv.gz',
        'human_label_edge.tsv.gz',
        'human_alias_edge.tsv.gz',
        'human_label_alias.tsv',
    )
]

# Abbreviation edges before (meta) and after ids are assigned.
abbr_kgtk_file = os.path.join(data_folder, 'derived.Q5.abbreviations.meta.tsv.gz')
final_abbr_kgtk_file = os.path.join(data_folder, 'derived.Q5.abbreviations.tsv.gz')

### Filter Human Labels

In [None]:
# Keep only the edges whose label is P31 and whose node2 is Q5 and write them to human_kgtk_edge.
# NOTE(review): `time | kgtk ...` pipes the output of `time` into kgtk instead of timing the
# command; presumably `time kgtk filter ...` was intended - confirm before changing.
!time | kgtk filter -p " ;P31;Q5" -i $wikibase_item_file -o $human_kgtk_edge
# Preview the first 10 rows of the filtered file.
pd.read_csv(human_kgtk_edge, sep = '\t', nrows = 10)

In [None]:
# Get the labels for the human Q nodes: keep the rows of the label file whose node1
# also occurs as node1 in the filtered human edge file.
!kgtk ifexists -i $label_file --input-keys node1 --filter-on $human_kgtk_edge \
        --filter-keys node1 -o $human_label_output
# Preview the first 10 rows of the result.
pd.read_csv(human_label_output, sep = '\t', nrows = 10)

In [None]:
# Get the aliases for the human Q nodes: keep the rows of the alias file whose node1
# also occurs as node1 in the filtered human edge file.
!kgtk ifexists -i $alias_file --input-keys node1 --filter-on $human_kgtk_edge \
        --filter-keys node1 -o $human_alias_output
# Preview the first 10 rows of the result.
pd.read_csv(human_alias_output, sep = '\t', nrows = 10)

### Concat the human label and alias file

In [None]:
!kgtk cat -i $human_label_output $human_alias_output | kgtk sort -c node1 -o $human_label_alias_output
with open(human_label_alias_output, 'rb') as f_in:
    with gzip.open(human_label_alias_output + '.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        os.remove(f_in.name)
human_label_alias_output_gzip = human_label_alias_output + '.gz'
pd.read_csv(human_label_alias_output_gzip, sep = '\t', nrows = 10)

### Functions to generate Abbreviations

Algorithm for generating the person abbreviations:
Abbreviations for label property:
 * If the label has 2 words, abbreviate the first word (e.g. Michael Jordan becomes M. Jordan).
 * If the label has more than two words (e.g. Michael Jeffrey Jordan), then: <br/>
    a) Generate an abbreviation for all words leading up to the last word (e.g. M. J. Jordan). <br/>
    b) Generate abbreviations for the middle words (e.g. Michael J. Jordan). <br/>
    c) If a generated abbreviation is already present among the aliases, skip it. <br/>
	
Abbreviations for alias property:

An alias may contain words that are not present in the label. Leave new words at the start and end as they are, and generate abbreviations for new words in the middle.

In [None]:
def generate_abbreviations(name_split,word_index):
    '''
    Build abbreviated forms of a name that has already been split into words.
    Input: name_split: List of the words in a name (must not be empty)
           word_index: None to shorten every word before the last one to an initial,
                       or the position of the single word to shorten
    Output: word_index is None: tuple (initials followed by the last word,
                                       "<last word>, <initials>" or None for a one-word name)
            otherwise: one string in which only the word at word_index is an initial;
                       the last word is never shortened
    '''
    last_word = name_split[-1]
    leading_words = name_split[:-1]

    if word_index is not None:
        # Shorten just the requested word and keep the others as written.
        parts = [
            word[0].upper() + '.' if position == word_index else word
            for position, word in enumerate(leading_words)
        ]
        parts.append(last_word)
        return ' '.join(parts)

    initials = [word[0].upper() + '.' for word in leading_words]
    initials_first = ' '.join(initials + [last_word])
    if len(name_split) < 2:
        # A single word has no initials, so there is no "Last, F." variant.
        return initials_first, None
    last_word_first = last_word + ',' + ' ' + ' '.join(initials)
    return initials_first, last_word_first.strip()

In [None]:
def _parse_kgtk_text(value):
    '''
    Helper function to decode the text of a quoted value such as 'Michael Jordan'@en.
    Input: value: Raw node2 cell, with or without a language suffix
    Output: The unquoted, unescaped text
    '''
    value = value.strip()
    closing_quote = value.rfind("'@")
    if value.startswith("'") and closing_quote > 0:
        # Cut at the LAST quote-plus-@ so that text which itself contains "@en" stays whole.
        value = value[:closing_quote + 1]
    return ast.literal_eval(value)


def _abbreviations_for_name(words, abbreviate_leading_words):
    '''
    Helper function to collect the abbreviations of one name.
    Input: words: Non-empty list of the words in the name
           abbreviate_leading_words: True to also add the forms in which every word before
                                     the last one is an initial
    Output: Set of abbreviated names
    '''
    found = set()
    if abbreviate_leading_words:
        initials_first, last_word_first = generate_abbreviations(words, None)
        found.add(initials_first)
        if last_word_first is not None:
            found.add(last_word_first)
    # One variant per middle word; the range is empty for names of one or two words.
    for middle_index in range(1, len(words) - 1):
        found.add(generate_abbreviations(words, middle_index))
    return found


def _abbreviation_rows(node, labels, aliases):
    '''
    Helper function to build the output lines for one node.
    Input: node: node1 value of the node
           labels: List of the node's label texts (only the first one is used)
           aliases: Set of the node's alias texts
    Output: List of newline-terminated output lines, sorted by abbreviation.
            Empty when the node has no label or its label has no words.
    '''
    if not labels:
        return []
    label_words = labels[0].split()
    if not label_words:
        return []

    abbreviations = _abbreviations_for_name(label_words, True)
    for alias in aliases:
        alias_words = alias.split()
        if not alias_words:
            continue
        # The leading words of an alias are shortened only when the alias starts with the
        # same word as the label; otherwise only its middle words are shortened.
        abbreviations |= _abbreviations_for_name(alias_words, alias_words[0] == label_words[0])

    rows = []
    # Abbreviations that already exist as an alias are not written again. Sorting makes
    # the output order reproducible.
    for text in sorted(abbreviations - aliases):
        # Escape embedded single quotes so that the quoted value stays well formed.
        quoted = "'" + text.replace("'", "\\'") + "'@en"
        rows.append('\t' + node + '\t' + 'abbreviated_name' + '\t' + quoted + '\n')
    return rows


def abbreviate_human_labels(human_label_file,output_file):
    '''
    Traverses the concatenated human labels and aliases, creates the abbreviations for the labels and aliases
    and writes them as abbreviated_name edges.
    Input: human_label_file: gzipped tab-separated file with a header line that names the
                             node1, label and node2 columns. All rows of one node must be
                             adjacent (for example sorted by node1). Rows whose node1 does
                             not start with Q are ignored.
           output_file: gzipped tab-separated file to create. An existing file is overwritten.
                        Columns are id, node1, label, node2; id is left empty.
    Output: None
    Prints a progress message every 100000 input rows.
    Raises ValueError when the header lacks one of the required columns.
    '''
    with gzip.open(human_label_file, 'rt', encoding='utf8') as reader:
        columns = reader.readline().rstrip('\r\n').split('\t')
        prop_index = columns.index('label')
        node1_index = columns.index('node1')
        node2_index = columns.index('node2')

        # The output is opened only after the header has been validated, and in write mode
        # so that leftovers of an earlier run are never appended to.
        with gzip.open(output_file, 'wt', encoding='utf8', newline='\n') as writer:
            # The header matches the layout of the rows written below.
            writer.write('id\tnode1\tlabel\tnode2\n')

            started = time.time()
            current = None
            labels = []
            aliases = set()
            for row_number, line in enumerate(reader):
                if row_number % 100000 == 0:
                    print("Time taken for {} is {}".format(row_number, time.time() - started))
                    print("Previous Qnode is:", current)
                # Drop the line ending first so that the last column compares cleanly.
                vals = line.rstrip('\r\n').split('\t')
                prop_label = vals[prop_index]
                node1 = vals[node1_index].strip()
                node2 = vals[node2_index]
                if not node1.startswith('Q'):
                    continue

                if node1 != current:
                    # A new node starts: emit the finished one, then start collecting afresh
                    # so that only one node is held in memory at a time.
                    if current is not None:
                        writer.writelines(_abbreviation_rows(current, labels, aliases))
                    current, labels, aliases = node1, [], set()

                if prop_label == 'alias':
                    aliases.add(_parse_kgtk_text(node2))
                elif prop_label == 'label':
                    labels.append(_parse_kgtk_text(node2))

            # The last node is not followed by a node change, so emit it explicitly.
            if current is not None:
                writer.writelines(_abbreviation_rows(current, labels, aliases))

In [None]:
# Generate the abbreviation edges from the sorted, gzipped label and alias file.
abbreviate_human_labels(
    human_label_file=human_label_alias_output_gzip,
    output_file=abbr_kgtk_file,
)

In [None]:
# Assign ids to the abbreviation edges (wikidata id style, overwriting existing id values)
# and write the result to the final output file.
!kgtk add-id -i $abbr_kgtk_file --id-style wikidata --overwrite-id True -o $final_abbr_kgtk_file

In [None]:
# Preview the first 10 rows of the final abbreviation file.
final_abbr_preview = pd.read_csv(final_abbr_kgtk_file, sep='\t', nrows=10)
final_abbr_preview

### CleanUp Temporary files

In [None]:
# Remove the intermediate files; final_abbr_kgtk_file is left in place.
for temporary_path in (
    human_kgtk_edge,
    human_label_output,
    human_alias_output,
    human_label_alias_output_gzip,
    abbr_kgtk_file,
):
    os.remove(temporary_path)