## (Re)tagging stopwords with a descriptive class
#### In this notebook I redo and refine the previous stopword-tagging process, which results in a far more accurate and Tableau-friendly version.

In [15]:
import pandas as pd

In [16]:
# Load the tab-separated reverse mapping (stem, word, language tag, stopword flag).
rmls = pd.read_csv(
    "Raphael_reverse_mapping_lang_stop.csv",
    delimiter="\t",
)
rmls

Unnamed: 0,stem,word_m,lang_GT_API,stopword
0,día,día,es,False
1,pido,pido,es,False
2,hatr,hatred,en,False
3,pide,pide,es,False
4,yellow,yellow,en,False
...,...,...,...,...
4995,yell,yell,en,False
4996,at,at,,
4997,confess,confess,en,False
4998,sincer,sincere,en,False


In [17]:
# Every row starts as "keep" with an empty removal reason; later cells flip rows on.
rmls = rmls.assign(**{'remove?': False, 'remove_class': ''})
rmls

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?,remove_class
0,día,día,es,False,False,
1,pido,pido,es,False,False,
2,hatr,hatred,en,False,False,
3,pide,pide,es,False,False,
4,yellow,yellow,en,False,False,
...,...,...,...,...,...,...
4995,yell,yell,en,False,False,
4996,at,at,,,False,
4997,confess,confess,en,False,False,
4998,sincer,sincere,en,False,False,


In [18]:
# Rows whose `stopword` column is True are removed with class 'nltk_sw'.
is_stopword = rmls['stopword'].eq(True)
rmls.loc[is_stopword, 'remove?'] = True
rmls.loc[is_stopword, 'remove_class'] = 'nltk_sw'

In [19]:
# Rows whose language tag is anything other than 'en' are removed with class
# 'non_en'. A missing tag also compares unequal to 'en', so untagged rows are included.
not_english = rmls['lang_GT_API'].ne('en')
rmls.loc[not_english, 'remove?'] = True
rmls.loc[not_english, 'remove_class'] = 'non_en'

In [20]:
# Rows whose word consists only of numeric characters are removed with class 'numeric'.
is_number = rmls['word_m'].str.isnumeric()
rmls.loc[is_number, 'remove?'] = True
rmls.loc[is_number, 'remove_class'] = 'numeric'

In [21]:
# Rows whose word is exactly one alphabetic character are removed with class 'letter'.
# The length comparison is parenthesised on purpose: `&` binds tighter than `==`,
# so `len() == 1 & isalpha()` is evaluated as `len() == (1 & isalpha())`, which
# also matches zero-length, non-alphabetic strings.
is_single_letter = (rmls['word_m'].str.len() == 1) & rmls['word_m'].str.isalpha()
rmls.loc[is_single_letter, 'remove?'] = True
rmls.loc[is_single_letter, 'remove_class'] = 'letter'

In [22]:
# Manually tag 8 sign tokens: "(x2)", "(x3)", "(feat.", "", "&", "–" (en dash), "ref:" and "intro:"

sign_tokens = ['(x2)','(x3)','(feat.','','&','–','ref:','intro:']
is_sign = rmls['word_m'].isin(sign_tokens)
rmls.loc[is_sign, 'remove?'] = True
rmls.loc[is_sign, 'remove_class'] = 'sign'

In [23]:
# Inspect the rows that ended up with the 'sign' class.
sign_rows = rmls.loc[rmls['remove_class'].eq('sign')]
print(sign_rows)

       stem  word_m lang_GT_API stopword  remove? remove_class
64      ref    ref:          en    False     True         sign
1733     x2    (x2)          en    False     True         sign
1734     x3    (x3)          en    False     True         sign
2373  intro  intro:          en    False     True         sign
3846      &       &         NaN      NaN     True         sign
4858   feat  (feat.          en    False     True         sign
4881                      NaN      NaN     True         sign
4980      –       –         NaN      NaN     True         sign


##### About the cell below:
The English words below are wrongly tagged as 'non_en' because the Google API does not return a language tag for any word with len( ) < 3. I manually gave them an English tag and assigned each of them a "remove?" and a "remove_class" value according to the nltk-defined stopwords.

Word list: i, a, am, an, as, at, be, by, do, go, hi, if, in, is, it, no, me, my, of, ok, on, or, re, so, to, tv, up, us, we 

In [24]:
# Reference set: the English stopword list shipped with nltk.

import nltk
stopwords = nltk.corpus.stopwords
stopwords_en = {word for word in stopwords.words('english')}
print(stopwords_en)

{'be', "weren't", "mightn't", 'below', 'not', 'aren', "should've", 'wasn', "you've", 'i', 'been', 'theirs', 'all', 'wouldn', 'from', 'himself', "shouldn't", 'should', 'and', 'we', 'is', 'any', 'by', 'can', 'such', 'you', 'down', 'having', "didn't", 'do', 'its', 'as', 'itself', 'the', 'herself', "hadn't", "haven't", 'some', 'me', 'but', 'them', 'didn', 'most', 'while', 'yourself', 'until', 'my', 'had', 'because', 'over', 'no', "couldn't", 'only', 'shouldn', 'needn', 'being', 'his', 'just', "won't", 'more', 'or', "you're", 'to', 'shan', 'own', "aren't", 'other', 'if', 'what', 'mightn', 'which', 'did', 'a', 'through', 'hasn', 'this', 'here', 'o', 'between', "doesn't", 'ourselves', 't', 'an', "isn't", 'then', 'few', 'at', 'don', 'y', 'that', 'they', 'against', 'm', "mustn't", 'now', 'him', 'their', 'too', 'same', 'couldn', 'each', 'her', 'ours', 'are', "wasn't", 'there', "don't", "you'd", 'd', 'very', 'for', 'under', 'haven', 'hadn', 'both', 'who', 'on', 'when', 'am', 'isn', 'he', 'these',

In [25]:
# Manually process 29 English words with len( ) < 3.
#
# These words have no language tag, so the earlier `!= 'en'` pass already set
# remove?=True / remove_class='non_en' on every one of them. All three columns
# are therefore rewritten here for every listed word; relabelling the language
# alone would leave rows that are 'en' yet still carry the 'non_en' class.

short_en_words = ['i','a','am','an','as','at','be','by','do','go','hi','if','in','is','it','no','me','my','of','ok','on','or','re','so','to','tv','up','us','we']
# Short words that stay in the vocabulary; every other listed word is removed.
short_en_keep = ['go','ok','tv']

is_short_en = rmls['word_m'].isin(short_en_words)
is_kept = rmls['word_m'].isin(short_en_keep)
is_removed = is_short_en & ~is_kept

rmls.loc[is_short_en, 'lang_GT_API'] = 'en'

# NOTE(review): 'hi' and 'us' are removed here by manual choice; confirm against
# `stopwords_en` whether 'nltk_sw' is the right class label for them.
rmls.loc[is_removed, 'remove?'] = True
rmls.loc[is_removed, 'remove_class'] = 'nltk_sw'

# Clear the stale 'non_en' tagging on the words that are kept.
rmls.loc[is_kept, 'remove?'] = False
rmls.loc[is_kept, 'remove_class'] = ''

In [26]:
# Display the frame after the tagging cells above.
rmls

Unnamed: 0,stem,word_m,lang_GT_API,stopword,remove?,remove_class
0,día,día,es,False,True,non_en
1,pido,pido,es,False,True,non_en
2,hatr,hatred,en,False,False,
3,pide,pide,es,False,True,non_en
4,yellow,yellow,en,False,False,
...,...,...,...,...,...,...
4995,yell,yell,en,False,False,
4996,at,at,en,,True,nltk_sw
4997,confess,confess,en,False,False,
4998,sincer,sincere,en,False,False,


In [27]:
# Write the tagged mapping as a comma-separated file, leaving out the row index.
rmls.to_csv(
    path_or_buf='reverse_mapping_new.csv',
    index=False,
    sep=',',
)