In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
from nltk import *

# Part 1. TAGGING 

### Phase 1. Preparation

##### Packages:

In [2]:
from collections import Counter
import nltk.tokenize.punkt
import pickle
import csv
import json
import codecs
import string

##### External Packages:

In [3]:
# CoreNLP wrapper; used further down for single-word POS and NER lookups
# (sf_nlp_core.pos_tag / sf_nlp_core.ner).
# NOTE(review): the path is relative to the working directory — presumably the
# unpacked CoreNLP distribution; confirm it exists before running this cell.
from stanfordcorenlp import StanfordCoreNLP
# https://stanfordnlp.github.io/CoreNLP/
sf_nlp_core = StanfordCoreNLP(r'stanford-corenlp-full-2018-02-27')

# NLTK's Stanford POS tagger wrapper; used further down to tag tokenized sentences
# (sf_pos_tagger.tag).
# NOTE(review): StanfordNERTagger is imported but not used in the visible cells.
from nltk.tag import StanfordPOSTagger, StanfordNERTagger
# https://nlp.stanford.edu/software/tagger.shtml
pos_jar = r'stanford-postagger-full-2018-02-27/stanford-postagger.jar'
pos_model = r'stanford-postagger-full-2018-02-27/models/english-left3words-distsim.tagger'
sf_pos_tagger = StanfordPOSTagger(pos_model, pos_jar, encoding='utf8')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


In [4]:
def save(path, data):
    """Persist ``data`` to ``path``, choosing the writer from the file extension.

    Only pickle files (``.p`` / ``.pickle``) are supported.

    Args:
        path: Destination file path (string).
        data: Any picklable object.

    Raises:
        NotImplementedError: If ``path`` does not end in a supported extension.
    """
    # Match on the suffix, not on a substring: ``'.p' in path`` also accepted
    # names such as 'plot.png' or 'script.py' and would have pickled over them.
    if path.endswith(('.p', '.pickle')):
        save_pickle(path, data)
    else:
        raise NotImplementedError()
        
def load(path):
    """Load and return the object stored at ``path``, choosing the reader from the extension.

    Only pickle files (``.p`` / ``.pickle``) are supported.

    Args:
        path: Source file path (string).

    Returns:
        The object returned by ``load_pickle(path)``.

    Raises:
        NotImplementedError: If ``path`` does not end in a supported extension.
    """
    # Match on the suffix, not on a substring: ``'.p' in path`` also accepted
    # names such as 'plot.png' or 'script.py' and tried to unpickle them.
    if path.endswith(('.p', '.pickle')):
        return load_pickle(path)
    else:
        raise NotImplementedError()
    
def save_pickle(path, data):
    """Serialize ``data`` with pickle and write it to ``path`` (opened in binary mode)."""
    with open(path, 'wb') as sink:
        pickle.dump(data, file=sink)

def load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(path, 'rb') as source:
        return pickle.load(source)

### Phase 2. Load Allowed Words and change existing tag to Complex POS format

In [5]:
# Read the allowed vocabulary: one entry per line, with the newline characters removed.
with open('allowed_words.txt', 'r') as f:
    words = [lines.replace('\n', '') for lines in f]

# Single-column frame ('words') that the later merges are keyed on.
awords = pd.DataFrame({'words': words})

In [6]:
awords.sample(frac=1).head()

Unnamed: 0,words
1008,heroic
210,Hurry
780,down
221,Ives
1720,town


##### The separator is assumed to be 4 consecutive spaces; change the file if this is not the case

In [7]:
# Existing vocabulary: three columns (p, tag, words) separated by four spaces;
# text after a '#' is treated as a comment.
t = pd.read_csv(
    'Vocab.gr',
    sep='    ',
    comment='#',
    header=None,
    engine='python',
    names=['p', 'tag', 'words'],
)
# Left join keeps every allowed word, tagged or not; exact duplicate rows are dropped.
j = awords.merge(t, on='words', how='left').drop_duplicates()

In [8]:
j.sample(frac=1).head()

Unnamed: 0,words,p,tag
143,Daffy,,
831,economic,,
791,doctors,,
1725,the,1.0,Det
1418,proceed,,


#### Map our old tag to new POS tag

In [9]:
# Old grammar tag -> new POS tag. Tags that are not listed here are left untouched.
# NOTE(review): 'c' for 'VerbT' does not look like a POS tag — confirm it is intended.
map_to_pos = {
    'VerbT': 'c',
    'Number': 'CD',
    'Proper': 'NNP',  # or NNPS
    'Det': 'DT',
    'Prep': 'IN',
    'Noun': 'NN',  # or NNS
}
can_map = j['tag'].isin(list(map_to_pos))
# Every selected value is a key of map_to_pos, so a plain dictionary lookup is enough.
j.loc[can_map, 'tag'] = j.loc[can_map, 'tag'].map(map_to_pos)

In [10]:
# First character of every word; it drives the coarse classification below.
j['first'] = j['words'].str[0]

# Shortened forms: start with an apostrophe but are not the bare "'" itself.
is_shorten = j['words'].str.startswith("'") & (j['words'] != "'")
# Words whose first character is a digit.
is_number = j['first'].str.isdigit()
# Words whose first character is punctuation, excluding apostrophe-led words longer than one character.
is_punc = j['first'].isin(set(string.punctuation)) & ~((j['first'] == "'") & (j['words'].str.len() > 1))

# Default the shortened forms to "Verb, 3rd person singular present" (VBZ).
# It may be wrong, but we can update it later using the parsed data.
# These assignments overwrite whatever tag the row already had.
j.loc[is_shorten, 'tag'] = 'VBZ'
j.loc[is_number,'tag'] = 'CD'
# Punctuation could be any of { . , : '' `` }:
#   .   end of line
#   ,   pause
#   :   conjunction or end of line
#   ''  just '
#   ``  just `
# For now every punctuation row gets the placeholder tag '.'.
# *** We will fix this later ***
j.loc[is_punc, 'tag'] = '.' 

In [11]:
j[(is_shorten)| (is_number) | (is_punc)].sample(frac=1).head()

Unnamed: 0,words,p,tag,first
23,-,,.,-
1,"""",,.,""""
21,),,.,)
5,'ll,1.0,VBZ,'
32,24,,CD,2


### Phase 3. Tagging with POS and Named Entity for single words
---------------------
in case we don't have enough samples

In [12]:
# Find the tag and entity information from StanfordNLP for every ordinary word
# (not a shortened form, number or punctuation). Each word is sent on its own and
# only the label of the first returned token is kept.
is_plain_word = (~is_shorten) & (~is_number) & (~is_punc)
j.loc[is_plain_word, 'sftag'] = j.loc[is_plain_word, 'words'].apply(
    lambda w: sf_nlp_core.pos_tag(w)[0][1])
j.loc[is_plain_word, 'entity'] = j.loc[is_plain_word, 'words'].apply(
    lambda w: sf_nlp_core.ner(w)[0][1])

In [13]:
j.sample(frac=0.8).head()

Unnamed: 0,words,p,tag,first,sftag,entity
1850,wants,,,w,VBZ,O
724,country,,,c,NN,O
141,Crepper,,,C,NNP,PERSON
745,cut,,,c,NN,O
1215,make,,,m,VB,O


### Phase 4. Word Segmentation & Tagging from samples

In [14]:
def load_sent(paths=()):
    """Read one or more UTF-8 text files and split their combined text into sentences.

    Args:
        paths: A single path string, or an iterable of path strings. File
            contents are concatenated in the given order with no separator.

    Returns:
        The result of ``PunktSentenceTokenizer().tokenize`` applied to the
        stripped, concatenated text. With no paths an empty string is tokenized
        (the original raised ``UnboundLocalError`` in that case).
    """
    if isinstance(paths, str):
        paths = [paths]
    chunks = []
    for path in paths:
        # Close every file deterministically instead of leaking the handle.
        with codecs.open(path, "r", "utf8") as source:
            chunks.append(source.read())
    text = "".join(chunks)
    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
    return tokenizer.tokenize(text.strip())

examples = load_sent(['devset.txt', 'quotes_new_preprocessed.txt','example_sentences.txt'])

In [15]:
# Remove duplication?
# examples = set(examples)

In [16]:
examples[:10]

['Whoa there !',
 'Halt !',
 'Who goes there ?',
 'It is I , Arthur , son of Uther Pendragon , from the castle of Camelot .',
 'King of the Britons , defeator of the Saxons , sovereign of all England !',
 'Pull the other one !',
 'I am .',
 'And this my trusty servant Patsy .',
 'We have ridden the length and breadth of the land in search of knights who will join me in my court of Camelot .',
 'I must speak with your lord and master .']

In [17]:
# Tagging every sentence with the Stanford tagger is expensive, so the result is cached on disk.
SF_TAGS_CACHE = 'sf_tags_mp_allow.p'

sf_tags = []          # flat list of (token, tag) pairs over all sentences
examples_parsed = []  # one list of (token, tag) pairs per sentence
try:
    sf_tags, examples_parsed = load(SF_TAGS_CACHE)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing, unreadable or truncated/corrupt: rebuild it.
    # A bare ``except:`` here would also swallow KeyboardInterrupt and genuine
    # programming errors, silently triggering a full re-tagging run.
    for example in examples:
        tokens = word_tokenize(example)
        tagged = sf_pos_tagger.tag(tokens)
        sf_tags += tagged
        examples_parsed.append(tagged)
    # Cache...
    save(SF_TAGS_CACHE, (sf_tags, examples_parsed))

In [18]:
# One row per tagged token; 'dummy' exists only so that there is a column to count.
df_sf_tags = pd.DataFrame({
    'words': [pair[0] for pair in sf_tags],
    'sftag': [pair[1] for pair in sf_tags],
    'dummy': 1,
})
# we count every word + pos
df_sf_tags['counts'] = df_sf_tags.groupby(['words', 'sftag'])['dummy'].transform('count')
# Collapse to one row per (word, tag) pair, then discard the helper column.
df_sf_tags = df_sf_tags.drop_duplicates().reset_index(drop=True).drop('dummy', axis=1)

In [19]:
# Attach the corpus-derived tags; a word seen with several tags yields several rows.
parsed_tags = df_sf_tags.rename(columns={'sftag': 'parsed_sftag'})
j = j.merge(parsed_tags, on=['words'], how='left')

In [20]:
j.sample(frac=0.8).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts
1710,stone,,,s,NN,O,,
600,binding,,,b,NN,O,VBG,1.0
729,compared,,,c,VBN,O,VBN,1.0
671,called,,,c,VBN,O,,
810,discovered,,,d,VBN,O,VBN,1.0


#### If sftag & parsed_sftag match (and the old tag is missing or 'Misc'), we think it's a good tag

In [21]:
# Rows whose old tag is missing or the catch-all 'Misc' ...
old_tag_unusable = j['tag'].isnull() | (j['tag'] == 'Misc')
# ... and where the single-word lookup agrees with the tag seen in the corpus.
is_same_tag = (j['sftag'] == j['parsed_sftag']) & old_tag_unusable
j.loc[is_same_tag, 'new_tag'] = j['parsed_sftag']

In [22]:
j[is_same_tag].sample(frac=0.8).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts,new_tag
1588,seek,,,s,VB,O,VB,3.0,VB
750,course,,,c,NN,O,NN,4.0,NN
984,girl,,,g,NN,O,NN,2.0,NN
711,clear,,,c,JJ,O,JJ,2.0,JJ
1406,our,,,o,PRP$,O,PRP$,7.0,PRP$


#### If it is a frequent one, we think it's a pretty good tag

In [23]:
# Minimum number of corpus occurrences for a (word, tag) pair to be trusted as-is.
COUNT_THRESHOLD = 5
is_frequent = j['counts'].ge(COUNT_THRESHOLD)
j.loc[is_frequent, 'new_tag'] = j['parsed_sftag']

In [24]:
# All three sources agree: old tag == single-word lookup == tag seen in the corpus.
tag_agrees_with_lookup = j['tag'].eq(j['sftag'])
tag_agrees_with_corpus = j['tag'].eq(j['parsed_sftag'])
is_matched = tag_agrees_with_lookup & tag_agrees_with_corpus
j.loc[is_matched, 'new_tag'] = j['parsed_sftag']

In [25]:
j[is_matched].sample(frac=0.8).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts,new_tag
1156,king,1.0,NN,k,NN,TITLE,NN,6.0,NN
1412,over,1.0,IN,o,IN,O,IN,1.0,IN
453,Zoot,1.0,NNP,Z,NNP,O,NNP,8.0,NNP
1491,quest,1.0,NN,q,NN,O,NN,8.0,NN
456,a,1.0,DT,a,DT,O,DT,92.0,DT


#### Add both sftag & parsed_sftag to new_tag

In [26]:
# Rows still without a new_tag where both taggers produced a tag but disagree.
not_same = (j['new_tag'].isnull()) & (~j['sftag'].isnull()) & (~j['parsed_sftag'].isnull()) & (j['sftag'] != j['parsed_sftag'])
# Keep both candidates: the existing row takes the corpus tag, and a copy of the
# row carrying the single-word lookup tag is appended at the end.
temp = j[not_same].copy()
temp['new_tag'] = temp['sftag']
j.loc[not_same, 'new_tag'] = j.loc[not_same, 'parsed_sftag']
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
j = pd.concat([j, temp], ignore_index=True)

#### Add the remaining tag to new_tag

In [27]:
# Anything still untagged falls back to the single-word lookup tag.
needs_fallback = j['new_tag'].isnull()
j.loc[needs_fallback, 'new_tag'] = j.loc[needs_fallback, 'sftag']

In [28]:
j.sample(frac=0.8).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts,new_tag
539,asks,,,a,VBZ,O,VBZ,1.0,VBZ
1395,open,,,o,JJ,O,JJ,2.0,JJ
68,Are,,,A,VBP,O,VBP,3.0,VBP
2164,heh,,,h,NN,O,RB,1.0,NN
360,Shh,,,S,RB,O,NNP,1.0,NNP


### Phase 5. Special Case Handling

#### Special Case - Punctuation

In [29]:
# Recomputed here because rows have been added to `j` since the mask was first built.
starts_with_punct = j['first'].isin(set(string.punctuation))
apostrophe_word = (j['first'] == "'") & (j['words'].str.len() > 1)
is_punc = starts_with_punct & ~apostrophe_word

def set_punc(s):
    """Map a punctuation token to its grammar tag.

    Args:
        s: The punctuation token, e.g. ``','`` or ``'...'``.

    Returns:
        One of 'BGNBK', 'ENDBK', 'QUOTE', 'BREAK', 'END' or 'PAUSE'.

    Raises:
        ValueError: If the token matches none of the known cases. ValueError
            is a subclass of Exception, so ``except Exception`` still catches it.
    """
    import re

    # Single-character tokens with a fixed tag.
    exact = {
        '(': 'BGNBK',
        ')': 'ENDBK',
        '"': 'QUOTE',
        "'": 'QUOTE',
        ':': 'BREAK',
        ';': 'BREAK',
        '.': 'END',
        '!': 'END',
        '?': 'END',
        ',': 'PAUSE',
    }
    if s in exact:
        return exact[s]
    # One or more backticks.
    if re.match(r'`+$', s):
        return 'QUOTE'
    # Two or more dots, or a run of dashes. Raw string: '\.' is an invalid
    # escape sequence in a normal string literal and triggers a warning.
    if re.match(r'(\.\.+)$|(-+)$', s):
        return 'BREAK'
    raise ValueError("{} not in cases".format(s))
# Replace the tag of every punctuation row with its specific punctuation tag.
j.loc[is_punc, 'new_tag'] = j.loc[is_punc, 'words'].apply(set_punc)

In [30]:
j[is_punc].sample(frac=1).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts,new_tag
16,",",1.0,.,",",,,",",544.0,PAUSE
455,`,,.,`,,,``,6.0,QUOTE
19,.,1.0,.,.,,,.,369.0,END
21,..,,.,.,,,,,BREAK
3,',,.,',,,POS,1.0,QUOTE


#### Special Case - Numbers

In [31]:
# Number-like rows (by old tag or by entity label) that still have no new_tag.
looks_numeric = j['tag'].isin(['CD', 'Number']) | (j['entity'] == 'NUMBER')
is_number = j['new_tag'].isnull() & looks_numeric
j.loc[is_number, 'new_tag'] = 'CD'

In [32]:
j[((j['tag'] == 'CD')| (j['tag']=='Number') | (j['entity']=='NUMBER'))].sample(frac=1).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts,new_tag
412,Two,,,T,CD,NUMBER,CD,1.0,CD
1389,one,1.0,Misc,o,CD,NUMBER,NN,1.0,NN
1867,two,,,t,CD,NUMBER,CD,5.0,CD
27,5000,1.0,CD,5,,,,,CD
861,eight,1.0,Misc,e,CD,NUMBER,CD,3.0,CD


#### Special Case - Shorten words
-------------
Example:   
- 'ow -> how   
- 'em -> them   

In [33]:
# Apostrophe-led words (longer than the bare "'") that still have no new_tag.
apostrophe_word = (j['first'] == "'") & (j['words'].str.len() > 1)
is_shorten = j['new_tag'].isnull() & apostrophe_word
def set_shorten(s):
    """Return the tag for a shortened (apostrophe-led) word.

    "'ow" (how) is tagged 'RB'; "'em" (them) and every other form are tagged 'PRP'.
    """
    return 'RB' if s == "'ow" else 'PRP'
# Fill in the tag for the shortened words selected above.
j.loc[is_shorten, 'new_tag'] = j.loc[is_shorten, 'words'].apply(set_shorten)

In [34]:
j[(j['first'] == "'") & (j['words'].str.len() > 1)].sample(frac=1).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts,new_tag
12,'s,1.0,VBZ,',,,PRP,2.0,PRP
13,'ve,,VBZ,',,,VBP,10.0,VBP
7,'m,1.0,VBZ,',,,VBP,19.0,VBP
8,'ow,1.0,VBZ,',,,,,RB
9,'re,1.0,VBZ,',,,VBP,22.0,VBP


#### Special Case - Interjections (UT)
---------
Example:
- Aaaaagh, Aaauggh, Aah
- ug, uh, um
- Ooh, Oof

In [35]:
# Minor cases - special UTs.
# Each entry is a regex applied with Series.str.match. An optional '~' separates
# a "must match" part from a "must not match" part.
# NOTE(review): inside [...] the '|' is a literal character, not alternation, so
# e.g. '[A|a]' also accepts '|'. The patterns are deliberately left unchanged here.
# NOTE(review): the last pattern can match the empty string — confirm that is intended.
UT_list = [
    '[A|a][a|u|g]*h+$', # Aaaaagh, Aaauggh, Aah ... 
    'u[u|g|h|m|n]+$', # ug, uh, um ..
    'Noo', 
    'Oo[h|f|o]*$', # Ooh, Oof ..
    '[O|o]+[u|i|w|l|p]*$',
    'e+m*$',
    '([H|h][a|e|h|o|y|l]+)$~[H|h]e[ll]*$', # Hello, Hallo, Holy, Hee ...
    '[S|s]h+$', # Shh
    'whoa', 
    '[Y|y]*[E|e]*$' # Yee, ye ...
]

# OR together the hits of every pattern, then tag all matching rows as 'UT'.
cond = False
for exp in UT_list:
    include, tilde, exclude = exp.partition('~')
    extra = ~(j['words'].str.match(exclude)) if tilde else True
    cond = ((j['words'].str.match(include)) & (extra)) | (cond)
j.loc[cond, 'new_tag'] = 'UT'

In [36]:
j[cond].sample(frac=1).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts,new_tag
2163,hee,,,h,NN,O,NNP,1.0,UT
2066,Oooh,,,O,NNP,O,RB,3.0,UT
1414,ow,,,o,NN,O,NN,2.0,UT
2063,O,,,O,NN,O,NNP,2.0,UT
202,Ha,,,H,FW,O,SYM,2.0,UT


### Phase 6. Wrap up
------
1. Remove duplicates
2. Sorting
3. Counting

In [37]:
# 1. Keep one row per (word, tag) pair.
j = j.drop_duplicates(['words', 'new_tag'])

# 2. Sort by tag, then word, then count (all descending).
j = j.sort_values(['new_tag', 'words', 'counts'], ascending=False)

# 3. Pairs seen in the corpus get their count + 1; unseen pairs get 1.
j['counts'] = j['counts'].fillna(0) + 1

# Number of words sharing each tag. One column is selected explicitly:
# transform('count') on the whole frame returns a multi-column DataFrame, and
# assigning that to a single column is rejected by current pandas.
j['group_counts'] = j.groupby('new_tag')['words'].transform('count')
# A tag that has only one word gets the count 1.
j.loc[j['group_counts'] == 1, 'counts'] = 1

j['counts'] = j['counts'].astype(int)

In [38]:
j.sample(frac=1).head()

Unnamed: 0,words,p,tag,first,sftag,entity,parsed_sftag,counts,new_tag,group_counts
542,at,1.0,IN,a,IN,O,IN,12,IN,74.0
1985,with,1.0,IN,w,IN,O,IN,25,IN,74.0
1662,so,1.0,Misc,s,RB,O,IN,3,IN,74.0
2027,zone,,,z,NN,O,,1,NN,574.0
1718,streak,,,s,NN,O,NN,2,NN,574.0


### Phase 7.  Export to file

In [39]:
# Write "<count> <tag> <word>" rows to a temporary file first.
j[['counts', 'new_tag', 'words']].to_csv('_Tagged_Vocab.gr', header=False, sep=" ", index=False, escapechar=' ', doublequote=False)
# Handle escape char: collapse every doubled space into a single one while copying
# to the final file (presumably the doubled spaces come from the ' ' escapechar).
with open('_Tagged_Vocab.gr', 'r') as f, open('Tagged_Vocab.gr', 'w+') as mf:
    for text in f:
        mf.write(text.replace("  ", " "))
# The temporary file is no longer needed.
os.remove('_Tagged_Vocab.gr')