In [1]:
import pandas as pd
import spacy

In [2]:
# Build the table of English words: keep the rows whose lang_GT_API is 'en'
# and whose remove_class is not 'sign', 'letter' or 'numeric'
# (rows with an empty remove_class are kept).

rmls = pd.read_csv('reverse_mapping_new.csv', sep=',')
enwords = rmls[
    rmls['lang_GT_API'].eq('en')
    & ~rmls['remove_class'].isin(['sign', 'letter', 'numeric'])
]
# 'stem' is removed for now: getIndex searches every column, so a word that is
# identical to its stem would otherwise be reported at two positions.
enwords = enwords.drop(columns=['stem']).reset_index(drop=True)
enwords

Unnamed: 0,word_m,lang_GT_API,stopword,remove?,remove_class
0,hatred,en,False,False,
1,yellow,en,False,False,
2,four,en,False,False,
3,sleeve,en,False,False,
4,sleep,en,False,False,
...,...,...,...,...,...
3185,yell,en,False,False,
3186,at,en,,True,nltk_sw
3187,confess,en,False,False,
3188,sincere,en,False,False,


In [3]:
# Workaround: when spaCy was fed strings taken straight from the DataFrame it
# only processed part of the values, and the cause was not found. The words
# are therefore written to a plain text file (no header, no index) that the
# tagging step reads back.

word_m = enwords.loc[:, ['word_m']]
word_m.to_csv(path_or_buf='to_be_tagged.txt', sep=',', header=None, index=False)

In [4]:
# Select and categorise words-to-tag.
#
# The whole word file is passed to spaCy as one text and every resulting token
# is sorted into a list according to its coarse part-of-speech tag. Tokens with
# any other tag (punctuation, pronouns, ...) are not collected.
# NOTE(review): spaCy tokenises the text itself, so the number of tokens is not
# guaranteed to equal the number of lines in the file -- confirm when comparing
# these counts with the number of rows in enwords.

nouns = []
verbs = []
adjectives = []
adverbs = []

# spaCy coarse POS tag -> list that collects its tokens.
# Proper nouns and numerals are counted as nouns.
pos_buckets = {
    'NOUN': nouns,
    'PROPN': nouns,
    'NUM': nouns,
    'VERB': verbs,
    'ADJ': adjectives,
    'ADV': adverbs,
}

nlp = spacy.load("en_core_web_sm")
# DataFrame.to_csv writes UTF-8 by default, so the file is decoded as UTF-8
# explicitly instead of relying on the platform's locale encoding; the
# with-block also makes sure the file handle is closed again.
with open('to_be_tagged.txt', encoding='utf-8') as word_file:
    text = word_file.read()
words_to_tag = nlp(text)
for word in words_to_tag:
    bucket = pos_buckets.get(word.pos_)
    # Compare against None: an empty list is falsy but still a valid bucket.
    if bucket is not None:
        bucket.append(word)

print(f'Nouns count: {len(nouns)}')
print(f'Verbs count: {len(verbs)}')
print(f'Adjectives count: {len(adjectives)}')
print(f'Adverbs count: {len(adverbs)}')
print(f'Total count: {len(nouns) + len(verbs) + len(adjectives) + len(adverbs)}')

Nouns count: 2044
Verbs count: 554
Adjectives count: 335
Adverbs count: 99
Total count: 3032


In [5]:
# Check the index of words-to-tag.

def getIndex(df,value):
    """Return the row labels of all cells in ``df`` that equal ``value``.

    Columns are scanned from left to right and, within a column, the labels
    are reported in row order. A row is listed once for every column in which
    it matches, so the same label can occur several times. An empty list is
    returned when nothing matches.
    """
    hits = df.isin([value])
    # Boolean Series over the columns: does the column contain a match at all?
    column_has_hit = hits.any()
    labels = []
    for col in column_has_hit[column_has_hit].index:
        col_hits = hits[col]
        labels.extend(col_hits[col_hits].index)
    return labels

In [6]:
# Tag the nouns.

# spaCy tokens -> plain strings so they can be compared with the DataFrame.
nouns = [str(token) for token in nouns]
# Look each word up in the 'word_m' column only. Searching the whole frame
# would also match identical strings stored in other columns (for example the
# language code), which would tag rows that hold a different word.
n_index = {word: getIndex(enwords[['word_m']], word) for word in nouns}

# Write the tag for every matched row; any value already stored in 'pos' for
# those rows is overwritten. The labels returned by getIndex are used as
# positions, which is valid because enwords carries a freshly reset index.
for rows in n_index.values():
    enwords.loc[enwords.index[[int(row) for row in rows]], 'pos'] = 'n.'

In [7]:
# Tag the verbs.

# spaCy tokens -> plain strings so they can be compared with the DataFrame.
verbs = [str(token) for token in verbs]
# Look each word up in the 'word_m' column only. Searching the whole frame
# would also match identical strings stored in other columns (for example the
# language code), which would tag rows that hold a different word.
v_index = {word: getIndex(enwords[['word_m']], word) for word in verbs}

# Write the tag for every matched row; any value already stored in 'pos' for
# those rows is overwritten. The labels returned by getIndex are used as
# positions, which is valid because enwords carries a freshly reset index.
for rows in v_index.values():
    enwords.loc[enwords.index[[int(row) for row in rows]], 'pos'] = 'v.'

In [8]:
# Tag the adjectives.

# spaCy tokens -> plain strings so they can be compared with the DataFrame.
adjectives = [str(token) for token in adjectives]
# Look each word up in the 'word_m' column only. Searching the whole frame
# would also match identical strings stored in other columns (for example the
# language code), which would tag rows that hold a different word.
adj_index = {word: getIndex(enwords[['word_m']], word) for word in adjectives}

# Write the tag for every matched row; any value already stored in 'pos' for
# those rows is overwritten. The labels returned by getIndex are used as
# positions, which is valid because enwords carries a freshly reset index.
for rows in adj_index.values():
    enwords.loc[enwords.index[[int(row) for row in rows]], 'pos'] = 'adj.'

In [9]:
# Tag the adverbs.

# spaCy tokens -> plain strings so they can be compared with the DataFrame.
adverbs = [str(token) for token in adverbs]
# Look each word up in the 'word_m' column only. Searching the whole frame
# would also match identical strings stored in other columns (for example the
# language code), which would tag rows that hold a different word.
adv_index = {word: getIndex(enwords[['word_m']], word) for word in adverbs}

# Write the tag for every matched row; any value already stored in 'pos' for
# those rows is overwritten. The labels returned by getIndex are used as
# positions, which is valid because enwords carries a freshly reset index.
for rows in adv_index.values():
    enwords.loc[enwords.index[[int(row) for row in rows]], 'pos'] = 'adv.'

### Check if all words are tagged -- 31 words are missing.

In [10]:
# Rows tagged 'n.'. The tagger reported 2044 noun tokens, but only 2032 rows
# carry the tag: 12 fewer.

enwords.loc[enwords['pos'].eq('n.')].shape

(2032, 6)

In [11]:
# Rows tagged 'v.'. The tagger reported 554 verb tokens, but only 535 rows
# carry the tag: 19 fewer.

enwords.loc[enwords['pos'].eq('v.')].shape

(535, 6)

In [12]:
# Rows tagged 'adj.' (the tagger reported 335 adjective tokens).
enwords.loc[enwords['pos'].eq('adj.')].shape

(335, 6)

In [13]:
# Rows tagged 'adv.' (the tagger reported 99 adverb tokens).
enwords.loc[enwords['pos'].eq('adv.')].shape

(99, 6)

In [14]:
# Check missing nouns: 8 tokens are shown below; the gap is 12 (2044 tokens vs. 2032 tagged rows), so 4 remain unexplained.

def missingValues(a, b):
    """Compare two collections in both directions.

    Returns a two-element list: first the items of ``a`` that do not occur in
    ``b``, then the items of ``b`` that do not occur in ``a``. Order and
    duplicates of the source collection are preserved in each part.
    """
    only_in_a = []
    for item in a:
        if item not in b:
            only_in_a.append(item)

    only_in_b = []
    for item in b:
        if item not in a:
            only_in_b.append(item)

    return [only_in_a, only_in_b]

# Words of all rows tagged 'n.', compared in both directions with the noun
# tokens produced by the tagger.
nouns_tagged = enwords.loc[enwords['pos'].eq('n.'), 'word_m'].to_list()
missingValues(nouns, nouns_tagged)

[['spinning', 'mr', '.', 'dr', '.', 'alleluia', 'fore', 'st'], []]

In [15]:
# Check missing verbs: compare the verb tokens with the words of all rows
# tagged 'v.'. 18 tokens are listed, while the tagged rows fall 19 short of
# the 554 verb tokens (554 - 535), so the list does not explain the whole gap.

verbs_tagged = enwords.loc[enwords['pos'].eq('v.'), 'word_m'].to_list()
missingValues(verbs, verbs_tagged)

[['ca',
  'gon',
  'ca',
  '’s',
  'wo',
  '’s',
  'wo',
  '’ll',
  'ai',
  've',
  'gon',
  'cause',
  '’re',
  '’m',
  '’m',
  'wo',
  's',
  '’re'],
 []]

#### ... So it is the punctuation that causes the mismatches: some original words were split into several tokens, which makes the noun/verb lists longer than they are supposed to be.

#### 1) I will manually add 'alleluia', 'fore' and 'cause' into the output csv file; the others seem less relevant. 

#### 2) Why are there still 5 missing words? I disregard them for now, but the cause remains unclear.

In [16]:
# Re-attach the 'stem' column that was dropped before tagging.
# The CSV is read again and filtered with the same conditions as the original
# selection; after resetting the index, the 'stem' values are assigned to
# enwords row by row.

rmls = pd.read_csv('reverse_mapping_new.csv', sep=',')
enwords_fix = rmls[
    rmls['lang_GT_API'].eq('en')
    & ~rmls['remove_class'].isin(['sign', 'letter', 'numeric'])
]
enwords['stem'] = enwords_fix[['stem']].reset_index(drop=True)
enwords

Unnamed: 0,word_m,lang_GT_API,stopword,remove?,remove_class,pos,stem
0,hatred,en,False,False,,n.,hatr
1,yellow,en,False,False,,adj.,yellow
2,four,en,False,False,,n.,four
3,sleeve,en,False,False,,n.,sleev
4,sleep,en,False,False,,n.,sleep
...,...,...,...,...,...,...,...
3185,yell,en,False,False,,n.,yell
3186,at,en,,True,nltk_sw,,at
3187,confess,en,False,False,,n.,confess
3188,sincere,en,False,False,,adj.,sincer


In [17]:
# Move the last column to the front by reordering the column list.
# DataFrame.insert() was tried first but failed with
# "Series objects are mutable and cannot be hashed", although the pandas 1.0.1
# documentation says a Series is accepted as value. The cause was not
# investigated, hence this detour.
# NOTE(review): that message usually appears when a Series ends up where a
# column label is expected -- confirm the argument order insert(loc, column, value).

cols = list(enwords.columns[-1:]) + list(enwords.columns[:-1])
enwords = enwords.loc[:, cols]
enwords

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?,remove_class,pos
0,hatr,hatred,en,False,False,,n.
1,yellow,yellow,en,False,False,,adj.
2,four,four,en,False,False,,n.
3,sleev,sleeve,en,False,False,,n.
4,sleep,sleep,en,False,False,,n.
...,...,...,...,...,...,...,...
3185,yell,yell,en,False,False,,n.
3186,at,at,en,,True,nltk_sw,
3187,confess,confess,en,False,False,,n.
3188,sincer,sincere,en,False,False,,adj.


In [19]:
# Save the tagged word list; the row index is not written to the file.
enwords.to_csv(path_or_buf='wordlist_pos.csv', index=False, sep=',')