In [1]:
import pandas as pd
import s3fs
import boto3
from io import StringIO # python3; python2: BytesIO 
from boto3.s3.transfer import TransferConfig
import numpy as np
import re
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import gensim
from gensim import models
from gensim import corpora
import lda

unable to import 'smart_open.gcs', disabling that module


In [2]:
stop_words = ['rt','_']
stop_words

['rt', '_']

In [22]:
def preprocessing(row):
    if pd.isna(row):
        return [""]
    stopword_removed_output = [w for w in row.split(' ') if not w in stop_words]
    return stopword_removed_output

In [4]:
train_input = pd.read_csv('s3://recsys-challenge-2020/train_input.csv')
val_input = pd.read_csv('s3://recsys-challenge-2020/val_input.csv')
test_input = pd.read_csv('s3://recsys-challenge-2020/test_input.csv')

In [5]:
train_input.head()

Unnamed: 0,tweet_text,name
0,envivo buenas noches comienza una nueva edició...,pt
1,esta bebé nació con una marca que la ha hecho ...,pt
2,elsalvador el tiempo y el olvido juegan en con...,pt
3,menciona a la mane _ acasore de tu karime _ ac...,pt
4,ddn apocalipsis zombi tras visitar otra agenci...,pt


In [6]:
train_input.index = train_input.index.astype(str) + '_train'

In [7]:
val_input.head()

Unnamed: 0,tweet_text,name
0,sm 사옥빌딩 앞에 있는 전광판차 사진들입니다 전광판차는 사옥 앞에서 오늘 아침 9...,ko
1,방송 막하지 말라던 최민호는 본인이 막히기 시작하는데 t co wqz58zzz5l,ko
2,rt lovablebh _ 0506 백현이 오늘 심각하게 unk 던지고 가만히 서서...,ko
3,sm 사옥빌딩 앞에 있는 전광판차 사진들입니다 전광판차는 사옥 앞에서 오늘 아침 9...,ko
4,sm 사옥빌딩 앞에 있는 전광판차 사진들입니다 전광판차는 사옥 앞에서 오늘 아침 9...,ko


In [8]:
val_input.index = val_input.index.astype(str) + '_val'

In [9]:
test_input.head()

Unnamed: 0,tweet_text,name
0,funky techno witch t co ydfhit7ncn,en
1,why lesbian couples are more likely to divorce...,en
2,the c e o of acronym the tech consultancy behi...,en
3,people that live under the philosophy of unk i...,en
4,the man is literally barking,en


In [10]:
test_input.index = test_input.index.astype(str) + '_test'

In [11]:
all_data = pd.concat([train_input, val_input, test_input])

In [12]:
all_data

Unnamed: 0,tweet_text,name
0_train,envivo buenas noches comienza una nueva edició...,pt
1_train,esta bebé nació con una marca que la ha hecho ...,pt
2_train,elsalvador el tiempo y el olvido juegan en con...,pt
3_train,menciona a la mane _ acasore de tu karime _ ac...,pt
4_train,ddn apocalipsis zombi tras visitar otra agenci...,pt
...,...,...
15127679_test,გაერთიანებული სამეფოს პარლამენტმა დაასრულა 21 ...,en
15127680_test,ვაშინგტონ პოსტის თანახმად map of the soul 7 ის...,en
15127681_test,cooperaciónyhermandad ههههه unk unk unk nicola...,en
15127682_test,unk unk unk གནས unk ར ར གས unk unk unk unk རང ...,en


In [13]:
full_input = all_data.sort_values(by='name')

In [14]:
languages = list(full_input.name.unique())

In [15]:
full_input.head()

Unnamed: 0,tweet_text,name
127389791_train,coens wil kleinere kieskringen wat een slecht ...,af
127380736_train,omg hahahahhah,af
127380735_train,omg hahahahhah,af
127380734_train,rt lene _ denissen ben aant wachten op iets da...,af
127380733_train,rt vrouwvdvrijheid het aantal asielzoekers dat...,af


In [16]:
languages

['af',
 'ar',
 'bg',
 'bn',
 'ca',
 'de',
 'el',
 'en',
 'et',
 'fa',
 'fi',
 'fr',
 'gu',
 'he',
 'hr',
 'hu',
 'id',
 'it',
 'ja',
 'kn',
 'ko',
 'lv',
 'mk',
 'ml',
 'mr',
 'pa',
 'pt',
 'ru',
 'sw',
 'ta',
 'te',
 'tl',
 'tr',
 'ur',
 'vi']

In [None]:
all_lang_output = pd.DataFrame()
for language in languages:
    print('starting langage: ' + language)
    lang_input = full_input.loc[full_input.name == language]
    lang_input['tweet_tokens'] = lang_input.apply(lambda x: preprocessing(x.tweet_text), axis = 1)
    dictionary = corpora.Dictionary(lang_input.tweet_tokens)
    corpus = [dictionary.doc2bow(text) for text in lang_input.tweet_tokens]
    ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = 10, id2word=dictionary, passes=5, workers=96)
    all_topics = ldamodel.get_document_topics(corpus, minimum_probability=0.0)
    all_topics_csr = gensim.matutils.corpus2csc(all_topics)
    all_topics_numpy = all_topics_csr.T.toarray()
    all_topics_df = pd.DataFrame(all_topics_numpy)
    all_topics_df.index = lang_input.index
    lang_output = pd.concat([lang_input, all_topics_df], axis = 1)
    all_lang_output = pd.concat([all_lang_output, lang_output])
    print('finished language: ' + language)

starting langage: af


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


finished language: af
starting langage: ar


In [None]:
all_lang_output