In [1]:
import pandas as pd
import s3fs
import boto3
from io import StringIO # python3; python2: BytesIO 
from boto3.s3.transfer import TransferConfig
import numpy as np
import re
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import gensim
from gensim import models
from gensim import corpora
import lda
import fasttext
import fasttext.util

unable to import 'smart_open.gcs', disabling that module


In [2]:
# Read the three input splits from S3, in train / val / test order.
train_input, val_input, test_input = (
    pd.read_csv(csv_path)
    for csv_path in (
        's3://recsys-challenge-2020/train_input.csv',
        's3://recsys-challenge-2020/val_input.csv',
        's3://recsys-challenge-2020/test_input.csv',
    )
)

In [3]:
# Preview the first five rows of the training split.
train_input.head(5)

Unnamed: 0,tweet_text,name
0,envivo buenas noches comienza una nueva edició...,pt
1,esta bebé nació con una marca que la ha hecho ...,pt
2,elsalvador el tiempo y el olvido juegan en con...,pt
3,menciona a la mane _ acasore de tu karime _ ac...,pt
4,ddn apocalipsis zombi tras visitar otra agenci...,pt


In [4]:
# Turn each split's row labels into strings tagged with the split they came from,
# so the labels stay distinguishable once the splits are concatenated.
for split_frame, split_suffix in (
    (train_input, '_train'),
    (val_input, '_val'),
    (test_input, '_test'),
):
    split_frame.index = split_frame.index.astype(str) + split_suffix

In [5]:
# Stack the three splits row-wise into a single frame.
all_data = pd.concat((train_input, val_input, test_input), axis=0)

In [6]:
# Preview the first five rows of the combined frame.
all_data.head(5)

Unnamed: 0,tweet_text,name
0_train,envivo buenas noches comienza una nueva edició...,pt
1_train,esta bebé nació con una marca que la ha hecho ...,pt
2_train,elsalvador el tiempo y el olvido juegan en con...,pt
3_train,menciona a la mane _ acasore de tu karime _ ac...,pt
4_train,ddn apocalipsis zombi tras visitar otra agenci...,pt


In [7]:
# Total number of rows across all splits.
all_data.shape[0]

163202921

In [8]:
# Order the combined rows by the language code stored in the `name` column.
full_input = all_data.sort_values('name', ascending=True)

In [9]:
# Distinct language codes, in order of first appearance in the sorted frame.
languages = full_input['name'].unique().tolist()

In [10]:
# Show the list of language codes found in the data.
languages

['af',
 'ar',
 'bg',
 'bn',
 'ca',
 'de',
 'el',
 'en',
 'et',
 'fa',
 'fi',
 'fr',
 'gu',
 'he',
 'hr',
 'hu',
 'id',
 'it',
 'ja',
 'kn',
 'ko',
 'lv',
 'mk',
 'ml',
 'mr',
 'pa',
 'pt',
 'ru',
 'sw',
 'ta',
 'te',
 'tl',
 'tr',
 'ur',
 'vi']

In [11]:
def get_fasttext_sentence_embedding(row, ft):
    """Return the fastText sentence vector for a single tweet text.

    Parameters
    ----------
    row : str or missing value
        The tweet text itself (a single cell value, despite the parameter name).
        Missing values (NaN / None) are accepted.
    ft : fastText model
        A loaded model exposing ``get_dimension()`` and ``get_sentence_vector()``.

    Returns
    -------
    numpy.ndarray
        The sentence vector produced by ``ft``; for a missing text, an all-zero
        vector whose length is the model's current dimension.
    """
    if pd.isna(row):
        # Size the placeholder from the model instead of hard-coding 20, so it
        # always matches whatever dimension the model was reduced to.
        return np.zeros(ft.get_dimension())
    # Guard against non-string cells (e.g. a tweet parsed as a number).
    text = row if isinstance(row, str) else str(row)
    # fastText rejects input containing a newline, so flatten the text first.
    return ft.get_sentence_vector(text.replace('\n', ' '))

In [12]:
# Empty accumulator that the per-language results are appended to.
all_lang_output = pd.DataFrame(data=None)

In [None]:
# Languages this cell skips.
# NOTE(review): presumably these were already embedded in an earlier run - confirm.
SKIPPED_LANGUAGES = {'af', 'ar', 'bg', 'bn', 'ca', 'de', 'el'}
# Target dimension the 300-d pretrained vectors are reduced to.
EMBEDDING_DIM = 20

for language in languages:
    if language in SKIPPED_LANGUAGES:
        continue
    print('starting langage: ' + language)
    lang_input = full_input.loc[full_input.name == language]
    print('number of lines to compute: ' + str(len(lang_input)))
    # Fetch the pretrained vectors for this language unless the file is already present.
    fasttext.util.download_model(language, if_exists='ignore')
    ft = fasttext.load_model('cc.'+language+'.300.bin')
    fasttext.util.reduce_model(ft, EMBEDDING_DIM)
    lang_output = pd.DataFrame()
    # Map over the text column directly: a row-wise DataFrame.apply(axis=1)
    # builds a Series per row and is far slower for this one-column operation.
    lang_output['sentence_embedding'] = lang_input['tweet_text'].apply(
        lambda text: get_fasttext_sentence_embedding(text, ft)
    )
    # Appended per language on purpose, so finished languages survive an
    # interruption of this long-running loop.
    all_lang_output = pd.concat([all_lang_output, lang_output])
    print('finished language: ' + language)

starting langage: en
number of lines to compute: 80296505




In [None]:
# Shell cleanup: delete everything in the current directory matching
# cc.*.300.bin* (the file naming used by the fasttext.load_model calls above).
! rm -rf ./cc.*.300.bin*

In [None]:
# Persist the embeddings together with their row labels (e.g. '0_train').
# The rows were sorted by language and some languages were skipped, so the
# label is the only way to match an embedding back to its tweet.
all_lang_output.to_csv('./all_lang_output.csv', index=True, index_label='row_id')

### Try parallelization with English

In [None]:
def get_fasttext_sentence_embedding_try(row, ft):
    """Return the fastText sentence vector for the ``tweet_text`` field of a row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one DataFrame row)
        Must provide a ``'tweet_text'`` entry; the entry may be missing (NaN / None).
    ft : fastText model
        A loaded model exposing ``get_dimension()`` and ``get_sentence_vector()``.

    Returns
    -------
    numpy.ndarray
        The sentence vector produced by ``ft``; for a missing text, an all-zero
        vector whose length is the model's current dimension.
    """
    text = row['tweet_text']
    if pd.isna(text):
        # Size the placeholder from the model rather than hard-coding 20.
        return np.zeros(ft.get_dimension())
    if not isinstance(text, str):
        text = str(text)
    # fastText rejects input containing a newline, so flatten the text first.
    return ft.get_sentence_vector(text.replace('\n', ' '))

In [None]:
from multiprocessing import cpu_count, Pool

cores = cpu_count()  # number of worker processes
partitions = cores   # number of chunks the input is split into


def parallelize(data, func):
    """Apply ``func`` to row-chunks of ``data`` in a process pool and concatenate.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split into at most ``partitions`` contiguous row-chunks.
    func : callable
        Picklable function taking one chunk (a DataFrame) and returning a
        pandas object; it is passed to ``Pool.map`` uncalled.

    Returns
    -------
    pandas object
        ``pd.concat`` of the per-chunk results in original row order, or an
        empty object Series when ``data`` has no rows.
    """
    # Split by position so every chunk is a plain .iloc slice of the input;
    # empty chunks (fewer rows than partitions) are dropped.
    row_chunks = np.array_split(np.arange(len(data)), partitions)
    data_split = [data.iloc[chunk] for chunk in row_chunks if len(chunk)]
    if not data_split:
        return pd.Series(dtype=object)
    # The context manager tears the pool down even if a worker raises.
    with Pool(cores) as pool:
        results = pool.map(func, data_split)
    return pd.concat(results)


def _embed_partition(partition):
    """Embed every row of one chunk using the module-level ``ft`` model.

    NOTE(review): the workers read ``ft`` as a global rather than receiving it
    as an argument; this presumably relies on the 'fork' start method so the
    loaded model is inherited by the child processes - confirm on the target
    platform.
    """
    return partition.apply(
        lambda row: get_fasttext_sentence_embedding_try(row, ft), axis=1
    )


lang_output = pd.DataFrame()
lang_output['sentence_embedding'] = parallelize(lang_input[['tweet_text']], _embed_partition)

In [None]:
# Show the current contents of `lang_output`.
lang_output

### Try Dask with English

In [None]:
import dask.dataframe as dd
from dask.multiprocessing import get

In [None]:
# Show the current contents of `lang_input`. It is not assigned in this
# section, so it holds whatever an earlier cell last bound to that name.
lang_input

In [None]:
print('starting langage: ' + 'en')
lang_output = pd.DataFrame()
# Select the English rows explicitly instead of relying on whatever
# `lang_input` an earlier cell happened to leave behind.
lang_input = full_input.loc[full_input.name == 'en']
ddata = dd.from_pandas(lang_input, npartitions = 96)
print('number of lines to compute: ' + str(len(lang_input)))
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.'+'en'+'.300.bin')
fasttext.util.reduce_model(ft, 20)
# NOTE(review): scheduler='processes' has to ship the function and the globals
# it references (including `ft`) to the workers - confirm the fastText model
# can be serialized, otherwise load the model inside the worker instead.
# `meta` declares the result as an object Series so dask does not have to
# infer it by running the function on dummy data.
lang_output['sentence_embedding'] = ddata.map_partitions(
    lambda partition: partition.apply(
        lambda tweet: get_fasttext_sentence_embedding(tweet.tweet_text, ft),
        axis = 1,
    ),
    meta=('sentence_embedding', 'object'),
).compute(scheduler='processes')
# res = ddata.map_partitions(lambda df: df.apply((lambda row: myfunc(*row)), axis=1)).compute(get=get)
print('finished en')

In [None]:
# Embed each partition's tweets row by row on the process-based dask scheduler
# and store the result as the `sentence_embedding` column.
lang_output['sentence_embedding'] = ddata.map_partitions(
    lambda partition: partition.apply(
        lambda tweet: get_fasttext_sentence_embedding(tweet.tweet_text, ft),
        axis = 1,
    )
).compute(scheduler='processes')
# res = ddata.map_partitions(lambda df: df.apply((lambda row: myfunc(*row)), axis=1)).compute(get=get)
print('finished en')

In [None]:
# Append the latest per-language result to the accumulated output.
all_lang_output = pd.concat((all_lang_output, lang_output), axis=0)

In [None]:
# Single-process run for English only.
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')
fasttext.util.reduce_model(ft, 20)
lang_output = pd.DataFrame()
lang_input = full_input.loc[full_input.name == 'en']
# The helper needs the model as its second argument (the original call omitted
# it and raised TypeError). Mapping over the text column avoids the much slower
# row-wise DataFrame.apply(axis=1).
lang_output['sentence_embedding'] = lang_input['tweet_text'].apply(
    lambda text: get_fasttext_sentence_embedding(text, ft)
)

In [None]:
# Append the English result to the accumulated output. `concat` is a
# module-level pandas function; DataFrame objects have no `.concat` method.
all_lang_output = pd.concat([all_lang_output, lang_output])