In [None]:
import pandas as pd
import s3fs
import boto3
from io import StringIO # python3; python2: BytesIO 
from boto3.s3.transfer import TransferConfig
import numpy as np
import re
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import gensim
from gensim import models
from gensim import corpora
import lda
import fasttext
import fasttext.util

In [None]:
# Read the train / validation / test inputs of the challenge from the S3 bucket.
train_input, val_input, test_input = map(
    pd.read_csv,
    (
        's3://recsys-challenge-2020/train_input.csv',
        's3://recsys-challenge-2020/val_input.csv',
        's3://recsys-challenge-2020/test_input.csv',
    ),
)

In [None]:
# Peek at the first five training rows to check the columns that were loaded.
train_input.head(n=5)

In [None]:
# Stringify each split's index and append the split name, so every row keeps a
# label that says which split it came from once the frames are stacked.
# Not idempotent: running this cell twice appends the suffix twice.
for _frame, _suffix in (
    (train_input, '_train'),
    (val_input, '_val'),
    (test_input, '_test'),
):
    _frame.index = _frame.index.astype(str) + _suffix

In [None]:
# Stack the three splits row-wise; the suffixed index labels are kept as-is.
all_data = pd.concat((train_input, val_input, test_input), axis=0)

In [None]:
# Sanity check: first five rows of the combined frame.
all_data.head(n=5)

In [None]:
# Total number of rows across train, validation and test.
all_data.shape[0]

In [None]:
# Order the combined rows by the `name` column (its values are used further
# down as fastText language ids).
full_input = all_data.sort_values('name')

In [None]:
# Distinct values of the `name` column, in order of first appearance.
languages = [*full_input['name'].unique()]

In [None]:
# Display the distinct `name` values; the loop below passes each one to
# fastText as a language id.
languages

In [None]:
def get_fasttext_sentence_embedding(row, ft):
    """Return the fastText sentence vector for a single text value.

    Parameters
    ----------
    row : str or missing value
        The text to embed. Despite the name, this is one cell value (callers
        pass the ``tweet_text`` field), not a whole DataFrame row.
    ft : fastText model
        Loaded model; must provide ``get_sentence_vector`` and
        ``get_dimension``.

    Returns
    -------
    numpy.ndarray
        The model's sentence vector, or an all-zero vector of the model's
        dimension when ``row`` is missing (``None`` / ``NaN``).
    """
    if pd.isna(row):
        # Ask the model for its width instead of hard-coding 20, so the
        # placeholder always matches whatever size reduce_model() was given.
        return np.zeros(ft.get_dimension())
    # fastText's get_sentence_vector processes one line at a time and raises
    # ValueError on text containing '\n', so flatten multi-line text first.
    return ft.get_sentence_vector(row.replace('\n', ' '))

In [None]:
# Accumulator for the embeddings of every language; it starts empty and the
# cells below append each language's `lang_output` to it.
all_lang_output = pd.DataFrame()

In [None]:
# For every language id: load that language's pretrained fastText model,
# shrink it to 20 dimensions, embed each tweet_text of that language, and
# append the result to `all_lang_output`.
for language in languages:
    print('starting langage: ' + language)
    lang_input = full_input.loc[full_input.name == language]
    # Fetch the pretrained vectors for this language; if_exists='ignore'
    # reuses a model file that is already on disk instead of downloading again.
    fasttext.util.download_model(language, if_exists='ignore')
    ft = fasttext.load_model('cc.'+language+'.300.bin')
    # Reduce the 300-dimensional model to 20 dimensions before embedding.
    fasttext.util.reduce_model(ft, 20)
    lang_output = pd.DataFrame()
    lang_output['sentence_embedding'] = lang_input.apply(lambda x: get_fasttext_sentence_embedding(x.tweet_text, ft), axis = 1)
    # DataFrame has no `.concat` method (the original call raised
    # AttributeError); concatenation is the module-level `pd.concat`.
    all_lang_output = pd.concat([all_lang_output, lang_output])
    print('finished language: ' + language)

### Try with English

In [None]:
def get_fasttext_sentence_embedding(row, ft=None):
    """Return the fastText sentence vector for a single text value.

    This definition replaces the earlier two-argument one, so it accepts the
    model as an optional second argument: both the per-language loop
    (``f(text, ft)``) and the English-only cell (``f(text)``) keep working
    whichever order the cells are run in.

    Parameters
    ----------
    row : str or missing value
        The text to embed (callers pass the ``tweet_text`` field).
    ft : fastText model, optional
        Model to use. When omitted, the notebook-level global ``ft`` is used.

    Returns
    -------
    numpy.ndarray
        The model's sentence vector, or an all-zero vector of the model's
        dimension when ``row`` is missing (``None`` / ``NaN``).

    Raises
    ------
    NameError
        If no model is passed and no global ``ft`` has been loaded.
    """
    if ft is None:
        # Legacy one-argument call: fall back to the model loaded at notebook level.
        ft = globals().get('ft')
        if ft is None:
            raise NameError("no fastText model given and no global 'ft' is loaded")
    if pd.isna(row):
        # Size the placeholder from the model so it always matches the
        # dimension that reduce_model() was asked for.
        return np.zeros(ft.get_dimension())
    # fastText's get_sentence_vector processes one line at a time and raises
    # ValueError on text containing '\n', so flatten multi-line text first.
    return ft.get_sentence_vector(row.replace('\n', ' '))

In [None]:
# Download (unless the file is already on disk) and load the pretrained
# English fastText vectors. `fasttext.util` is imported in the notebook's
# import cell, so the pasted REPL prompts and the repeated import are dropped.
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')

In [None]:
# Reduce the loaded model to 20 dimensions. The return value is not used; the
# same `ft` object is what the embedding function calls afterwards, and 20
# matches the length of its zero-vector fallback for missing text.
fasttext.util.reduce_model(ft, 20)

In [None]:
# Keep only the rows whose `name` is 'en', and start an empty frame that will
# receive their embeddings.
lang_input = full_input[full_input['name'].eq('en')]
lang_output = pd.DataFrame()

In [None]:
# Embed every English tweet_text row by row; the result is aligned on
# lang_input's index.
lang_output['sentence_embedding'] = lang_input.apply(
    lambda tweet: get_fasttext_sentence_embedding(tweet['tweet_text']),
    axis=1,
)

In [None]:
# Append the English embeddings to the accumulator. DataFrame has no `.concat`
# method (the original call raised AttributeError); use the module-level
# `pd.concat`. Note: re-running this cell appends the same rows again.
all_lang_output = pd.concat([all_lang_output, lang_output])