## This notebook will do the followings:
1. verify the word mapping against the main song file (words_songs_matched.csv). result: one-to-one mapping between words_songs_matched.csv, the mapping file mxm_reverse_mapping.txt, the regular word, and the word stem is confirmed.
2. guess the language of the words w/ Google Translate API
3. tag stop words using nltk.corpus.stopwords

In [127]:
!pip install textblob
!pip install nltk
!pip install iso-639



In [1]:
import pandas as pd
import numpy as np
m = pd.read_csv("mxm_reverse_mapping.txt", sep="<SEP>", engine='python', names=["stem","word_m"])
m
#load wordstem <-> regular word mapping data

Unnamed: 0,stem,word_m
0,día,día
1,pido,pido
2,hatr,hatred
3,pide,pide
4,yellow,yellow
...,...,...
4995,yell,yell
4996,at,at
4997,confess,confess
4998,sincer,sincere


In [72]:
m.groupby('stem').agg('count').shape
#check if stem can be mapped to > 1 regular word
#still 5000 row after groupby, thus the case above does not exist

(5000, 1)

In [74]:
df = pd.read_csv('../merged_data/words_songs_matched.csv', header=0, sep=',')
df

Unnamed: 0,word,count,track_id,song_id,artist_id,title,artist_name,duration,year,artist_hotttnesss
0,i,6,TRAAAAV128F421A322,SOQPWCR12A6D4FB2A3,AR73AIO1187B9AD57B,A Poor Recipe For Civic Cohesion,Western Addiction,118.07302,2005,0.386606
1,the,4,TRAAAAV128F421A322,SOQPWCR12A6D4FB2A3,AR73AIO1187B9AD57B,A Poor Recipe For Civic Cohesion,Western Addiction,118.07302,2005,0.386606
2,you,2,TRAAAAV128F421A322,SOQPWCR12A6D4FB2A3,AR73AIO1187B9AD57B,A Poor Recipe For Civic Cohesion,Western Addiction,118.07302,2005,0.386606
3,to,2,TRAAAAV128F421A322,SOQPWCR12A6D4FB2A3,AR73AIO1187B9AD57B,A Poor Recipe For Civic Cohesion,Western Addiction,118.07302,2005,0.386606
4,and,5,TRAAAAV128F421A322,SOQPWCR12A6D4FB2A3,AR73AIO1187B9AD57B,A Poor Recipe For Civic Cohesion,Western Addiction,118.07302,2005,0.386606
...,...,...,...,...,...,...,...,...,...,...
19045327,easili,1,TRZZZZD128F4236844,SOANRDO12A81C21E36,ART48CP1187B9AE314,Mr. Soul (Album Version),Rush,229.27628,0,0.546253
19045328,disast,1,TRZZZZD128F4236844,SOANRDO12A81C21E36,ART48CP1187B9AE314,Mr. Soul (Album Version),Rush,229.27628,0,0.546253
19045329,frown,1,TRZZZZD128F4236844,SOANRDO12A81C21E36,ART48CP1187B9AE314,Mr. Soul (Album Version),Rush,229.27628,0,0.546253
19045330,teas,1,TRZZZZD128F4236844,SOANRDO12A81C21E36,ART48CP1187B9AE314,Mr. Soul (Album Version),Rush,229.27628,0,0.546253


In [75]:
merged = pd.merge(left=m, right=df, left_on="stem", right_on="word")
(merged['stem'] == merged['word']).value_counts()
# verify if all words(stem) in the main csv exists in the mapping file

True    19045332
dtype: int64

In [36]:
import nltk
stopwords = nltk.corpus.stopwords

In [38]:
stopwords.fileids()

['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [2]:
from textblob import TextBlob
b = TextBlob("ball")
b.detect_language()

HTTPError: HTTP Error 429: Too Many Requests

### Perform Google Translate API language guessing
Guess window is limited to 500 rows for IP due to anonymous API call limit

Result express as ISO 639-1 lang code (https://cloud.google.com/translate/docs/basic/translating-text#language-params)

In [4]:
#split m into list of 10. 500 rows per sub-dataframe
mlist = np.array_split(m, 10)

In [6]:
def language_find(text):
    try:
        return TextBlob(text).detect_language()
    except:
        return None

In [7]:
seg = 0
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#Finland

In [8]:
seg = 1
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#Luxembourg

In [10]:
seg = 2
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#France

In [12]:
seg = 3
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#Sweden

In [14]:
seg = 4
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#Norway

In [16]:
seg = 5
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#Austria

In [18]:
seg = 6
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#Italy

In [22]:
seg = 7
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#Spain

In [28]:
seg = 8
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#UAE

In [30]:
seg = 9
mlist[seg]['lang_GT_API'] = mlist[seg]['word_m'].apply(language_find)
#Japan

In [31]:
mlist[seg][mlist[seg]['lang_GT_API'].isna()]

Unnamed: 0,stem,word_m,lang_GT_API
4525,tv,tv,
4556,as,as,
4559,ar,ar,
4563,ku,ku,
4567,va,va,
4570,ve,ve,
4574,vi,vi,
4583,vu,vu,
4640,i,i,
4648,1,1,


In [35]:
#rebuild complete df from list of sub-df
m3 = pd.concat(mlist)

In [59]:
#store result df into csv
m3.to_csv("mxm_reverse_mapping_lang.csv",sep="\t",index=False)

In [49]:
#count null value among guess language result
m3[m3['lang_GT_API'].isna()].shape

(225, 3)

In [54]:
#check len(word_m) of null value
for i, r in m3[m3['lang_GT_API'].isna()].iterrows():
    if len(r['word_m']) >= 3:
        print(r['word_m'])

In [60]:
#reload df from csv
m = pd.read_csv("mxm_reverse_mapping_lang.csv", sep="\t")
m

Unnamed: 0,stem,word_m,lang_GT_API
0,día,día,es
1,pido,pido,es
2,hatr,hatred,en
3,pide,pide,es
4,yellow,yellow,en
...,...,...,...
4995,yell,yell,en
4996,at,at,
4997,confess,confess,en
4998,sincer,sincere,en


In [85]:
m['lang_GT_API'].value_counts()

en       3168
es        520
fr        241
de        218
pt        153
it        143
sv         60
nl         46
fi         33
ro         16
id         16
no         13
ar         13
da         11
pl         11
vi          9
tr          8
tl          6
hu          6
bs          6
et          5
zh-CN       5
sq          4
lt          4
ja          4
sl          4
ku          4
is          3
la          3
af          3
ca          3
lv          3
cs          3
el          3
az          2
hi          2
bg          2
sk          2
mt          2
ru          2
ms          2
cy          2
so          1
sw          1
eu          1
ht          1
bn          1
eo          1
gl          1
ga          1
mi          1
ha          1
uz          1
Name: lang_GT_API, dtype: int64

## stop-word tagging

In [112]:
#overview of nltk stopwords' supported languages
#trying to see how good or how bad the support is against our list of words and guess languages
from nltk.corpus import stopwords
import iso639

def isoToLabel(code):
    return iso639.languages.get(alpha2=code[0:2]).name.lower()

supportedLang = []
unsupportedLang = []
for lang in m['lang_GT_API'][~m['lang_GT_API'].isna()].unique():
    langLabel = isoToLabel(lang)
    if langLabel in stopwords.fileids():
        supportedLang.append(langLabel)
    else:
        unsupportedLang.append(langLabel)
print("----Stop word list supported Lang----")
for langLabel in supportedLang:
    print(langLabel)
print("----Stop word list unsupported Lang----")
for langLabel in unsupportedLang:
    print(langLabel)

----Stop word list supported Lang----
spanish
english
german
finnish
french
swedish
italian
hungarian
portuguese
dutch
indonesian
arabic
norwegian
danish
turkish
romanian
azerbaijani
russian
----Stop word list unsupported Lang----
bulgarian
latvian
polish
catalan
albanian
malay (macrolanguage)
slovenian
irish
basque
bosnian
afrikaans
czech
modern greek (1453-)
hindi
kurdish
hausa
vietnamese
tagalog
japanese
bengali
slovak
welsh
estonian
chinese
uzbek
lithuanian
latin
galician
maltese
icelandic
esperanto
haitian
swahili (macrolanguage)
somali
maori


In [122]:
#tag stopword according to guess languages
def isStopWord(word, langcode):
    try:
        return word in stopwords.words(isoToLabel(langcode))
    except:
        return np.nan

m['stopword'] = m.apply(lambda row: isStopWord(row['word_m'], row['lang_GT_API']), axis=1)

In [124]:
#showing count of stop words
#503 stop words tagged
m['stopword'].value_counts()

False    4166
True      503
Name: stopword, dtype: int64

In [126]:
#showing words not evaluated
#recall: 225 of them are len() < 3
#the rest are from unsupported language
m['stopword'].isna().value_counts()

False    4669
True      331
Name: stopword, dtype: int64

In [125]:
m.to_csv("mxm_reverse_mapping_lang_stop.csv",sep="\t",index=False)