## Tagging New Stopwords

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
rmls = pd.read_csv("Raphael_reverse_mapping_lang_stop.csv", sep="\t")
rmls

Unnamed: 0,stem,word_m,lang_GT_API,stopword
0,día,día,es,False
1,pido,pido,es,False
2,hatr,hatred,en,False
3,pide,pide,es,False
4,yellow,yellow,en,False
...,...,...,...,...
4995,yell,yell,en,False
4996,at,at,,
4997,confess,confess,en,False
4998,sincer,sincere,en,False


### 1. An overview of the reverse-mapping list and the nltk package

In [3]:
# There are 3168 English words, which consists of 63.36% of the entire list.

rmls['lang_GT_API'].value_counts()

en       3168
es        520
fr        241
de        218
pt        153
it        143
sv         60
nl         46
fi         33
id         16
ro         16
ar         13
no         13
pl         11
da         11
vi          9
tr          8
hu          6
bs          6
tl          6
zh-CN       5
et          5
ja          4
lt          4
sl          4
ku          4
sq          4
la          3
lv          3
is          3
af          3
ca          3
el          3
cs          3
ru          2
sk          2
ms          2
mt          2
hi          2
bg          2
cy          2
az          2
eo          1
sw          1
ha          1
eu          1
ga          1
bn          1
uz          1
gl          1
so          1
ht          1
mi          1
Name: lang_GT_API, dtype: int64

In [5]:
# There are 179 pre-defined EN stopwords in the nltk package.

stopwords = nltk.corpus.stopwords
stopwords_en = set(stopwords.words('english'))
print(len(stopwords_en))
print(stopwords_en)

179
{'being', 'weren', 'above', 'more', 'out', "couldn't", "should've", 'isn', 'down', 'couldn', 'o', 'if', 'now', 'doing', 'we', "hasn't", "weren't", 'ours', 's', "mightn't", 'against', 'have', 'off', 'which', 'how', 'can', 'some', 'a', 'who', 'those', 'then', 've', 'any', "hadn't", 'my', 'own', 'before', 'wouldn', 'no', 'itself', 'just', 'will', 'your', 'myself', 'herself', 'ourselves', 'all', 't', 'an', 'are', 'i', 'few', 'or', 'its', 'the', 'most', 'needn', 'll', 'they', 'very', 'doesn', 'didn', 'should', 'm', 'themselves', 'under', 'd', 'does', "aren't", 'don', 'with', 'at', 'by', 'other', 'that', 'why', 'shan', 'and', "you'll", 'were', 'over', "isn't", 'mustn', 'yourself', 'further', "won't", 'their', 'about', 'same', 'here', 'theirs', 'his', 'too', "she's", 'below', 'her', 'whom', 'it', 'himself', 'be', 'to', 'wasn', "shouldn't", 'as', 'me', 'while', 'until', "wouldn't", 'had', 'shouldn', 'aren', 'has', 'these', 'was', 'our', 'hadn', 'him', 'hers', 'again', 'on', 'this', 'betwee

In [6]:
# There are 93 EN words tagged as stopwords in our list
# => 2.9% EN words are stopwords in our list.
# => 48% EN stopwords from the nltk package doesn't exist in our list.

rmls[(rmls.stopword == True) & (rmls.lang_GT_API == 'en')]

Unnamed: 0,stem,word_m,lang_GT_API,stopword
30,under,under,en,True
77,again,again,en,True
121,here,here,en,True
252,how,how,en,True
270,after,after,en,True
...,...,...,...,...
4691,other,other,en,True
4828,own,own,en,True
4908,below,below,en,True
4922,dure,during,en,True


In [7]:
# There are 225 words do not have a language tag, among which some are English.

rmls[rmls.lang_GT_API.isnull()]

Unnamed: 0,stem,word_m,lang_GT_API,stopword
89,n,n,,
204,me,me,,
207,ma,ma,,
209,mc,mc,,
211,mm,mm,,
...,...,...,...,...
4945,40,40,,
4980,–,–,,
4992,là,là,,
4993,lá,lá,,


In [8]:
# There are 331 stopwords with null-value, among which 225 (see above) don't have a language tag.
# So: there are 106 words do have a language tag but DO NOT have a stopword tag.

rmls['stopword'].isna().value_counts()

False    4669
True      331
Name: stopword, dtype: int64

### 2. Tagging more EN stopwords in addition to nltk

Goals (30/Mar.):
1. 3 types of words that should be tagged and filtered out: 1) numbers, 2) lone letters, 3) non-English words, 4) other miscellaneous ('-','4x','x4', etc.)

2. Add missing language tags. For instance, many English words with len( ) < 3 are not pre-tagged, although these are most likely stopwords. 

3. Discuss and add/remove new/pre-defined stopwords in the nltk package. (Can I edit it in the txt file?)

Work summary (07/Apr.):
1. All numbers, lone letters, non-English words, and unwanted signs are tagged out.
2. I did not add missing language tag for English, because they turn out to be words with len( )<3, which makes them stopwords anyways.
3. We can discuss and decide what stopwords to keep/delete. Especially with personal pronouns.

In [9]:
rmls = pd.read_csv("Raphael_reverse_mapping_lang_stop.csv", sep="\t")
rmls

Unnamed: 0,stem,word_m,lang_GT_API,stopword
0,día,día,es,False
1,pido,pido,es,False
2,hatr,hatred,en,False
3,pide,pide,es,False
4,yellow,yellow,en,False
...,...,...,...,...
4995,yell,yell,en,False
4996,at,at,,
4997,confess,confess,en,False
4998,sincer,sincere,en,False


In [10]:
# This new column "remove?" will tag all unwanted values as True, and wanted ones as False.
rmls['remove?'] = ''
rmls

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?
0,día,día,es,False,
1,pido,pido,es,False,
2,hatr,hatred,en,False,
3,pide,pide,es,False,
4,yellow,yellow,en,False,
...,...,...,...,...,...
4995,yell,yell,en,False,
4996,at,at,,,
4997,confess,confess,en,False,
4998,sincer,sincere,en,False,


In [11]:
# Tag out 1832 non-English words
# Note: many English words with len()<3 are also tagged out, among which we may manually restore "me","we","if", etc.

non_en = rmls[rmls.lang_GT_API != 'en'].reset_index(drop=True)
non_en['remove?'] = True
non_en

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?
0,día,día,es,False,True
1,pido,pido,es,False,True
2,pide,pide,es,False,True
3,otro,otro,es,True,True
4,auf,auf,de,True,True
...,...,...,...,...,...
1827,vuoi,vuoi,it,False,True
1828,là,là,,,True
1829,lá,lá,,,True
1830,allein,allein,de,False,True


In [12]:
# Tag 3168 English words

en = rmls[rmls.lang_GT_API == 'en'].reset_index(drop=True)
en['remove?'] = False
en

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?
0,hatr,hatred,en,False,False
1,yellow,yellow,en,False,False
2,four,four,en,False,False
3,sleev,sleeve,en,False,False
4,sleep,sleep,en,False,False
...,...,...,...,...,...
3163,raven,raven,en,False,False
3164,yell,yell,en,False,False
3165,confess,confess,en,False,False
3166,sincer,sincere,en,False,False


In [13]:
# Check numbers
# Note: "100" and "000" are previously tagged as False, but their tag should be True.

check_num = rmls[rmls['word_m'].str.isnumeric()].reset_index(drop=True)
check_num

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?
0,100,100,en,False,
1,13,13,,,
2,50,50,,,
3,20,20,,,
4,2,2,,,
5,12,12,,,
6,15,15,,,
7,16,16,,,
8,3,3,,,
9,4,4,,,


In [14]:
# Tag out and check the index of "100" and "000"

numbers = en[en['word_m'].str.isnumeric()].reset_index()
numbers['remove?'] = True
numbers

Unnamed: 0,index,stem,word_m,lang_GT_API,stopword,remove?
0,88,100,100,en,False,True
1,2154,0,0,en,False,True


In [15]:
# Remove "100" and "000" from en (for concatenation)

en = en.drop([en.index[88], en.index[2154]])
en

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?
0,hatr,hatred,en,False,False
1,yellow,yellow,en,False,False
2,four,four,en,False,False
3,sleev,sleeve,en,False,False
4,sleep,sleep,en,False,False
...,...,...,...,...,...
3163,raven,raven,en,False,False
3164,yell,yell,en,False,False
3165,confess,confess,en,False,False
3166,sincer,sincere,en,False,False


In [16]:
# Check lone letters
# Note: all values are already tagged as True; so no need to do anything

def loneLetters(word_m):
    if len(word_m) == 1:
        return word_m
    else:
        return None

rmls['word_m'] = rmls.apply(lambda row: loneLetters(row['word_m']), axis=1)
rmls[rmls.word_m.notnull()]

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?
89,n,n,,,
368,l,l,,,
462,é,é,,,
463,è,è,,,
467,à,à,,,
470,å,å,,,
512,k,k,,,
1267,p,p,,,
1440,o,o,,,
1651,g,g,,,


In [18]:
# Re-tag EN stopwords based on nltk package (originally tagged by Raphael)

stopword = en[en.stopword == True].reset_index(drop=True)
stopword['remove?'] = True
stopword

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?
0,under,under,en,True,True
1,again,again,en,True,True
2,here,here,en,True,True
3,how,how,en,True,True
4,after,after,en,True,True
...,...,...,...,...,...
88,other,other,en,True,True
89,own,own,en,True,True
90,below,below,en,True,True
91,dure,during,en,True,True


In [19]:
# Remove the stopwords from en (for concatenation)

en.drop(en[en.stopword == True].index, inplace=True)
en

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?
0,hatr,hatred,en,False,False
1,yellow,yellow,en,False,False
2,four,four,en,False,False
3,sleev,sleeve,en,False,False
4,sleep,sleep,en,False,False
...,...,...,...,...,...
3163,raven,raven,en,False,False
3164,yell,yell,en,False,False
3165,confess,confess,en,False,False
3166,sincer,sincere,en,False,False


In [20]:
rmls_new = pd.concat([en, non_en, numbers, stopword],ignore_index=True, sort=False)
rmls_new

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?,index
0,hatr,hatred,en,False,False,
1,yellow,yellow,en,False,False,
2,four,four,en,False,False,
3,sleev,sleeve,en,False,False,
4,sleep,sleep,en,False,False,
...,...,...,...,...,...,...
4995,other,other,en,True,True,
4996,own,own,en,True,True,
4997,below,below,en,True,True,
4998,dure,during,en,True,True,


In [21]:
rmls_new.to_csv("reverse_mapping_lang_stop_new.csv",sep="\t",index=False)