In [7]:
import pandas as pd
import s3fs
import boto3
from io import StringIO # python3; python2: BytesIO 
from boto3.s3.transfer import TransferConfig
import numpy as np
import re
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import gensim
from gensim import models
from gensim import corpora
import lda

In [None]:
nltk.download('stopwords')

In [8]:
stemmer = FrenchStemmer()

In [9]:
pd.set_option('display.max_colwidth', -1)

  """Entry point for launching an IPython kernel.


In [10]:
val_input = pd.read_csv('s3://recsys-challenge-2020/val_input.csv')

In [11]:
languages = list(val_input.name.unique())

In [41]:
languages

['ko',
 'en',
 'id',
 'pt',
 'de',
 'ja',
 'fr',
 'ru',
 'fa',
 'ar',
 'tr',
 'te',
 'vi',
 'tl',
 'el',
 'ur',
 'it',
 'et',
 'ta',
 'mr',
 'ca',
 'af',
 'fi',
 'gu',
 'he',
 'hr',
 'mk',
 'hu',
 'ml',
 'kn',
 'bn',
 'lv',
 'pa',
 'sw',
 'bg']

In [12]:
val_input.head()

Unnamed: 0,tweet_text,name
0,sm 사옥빌딩 앞에 있는 전광판차 사진들입니다 전광판차는 사옥 앞에서 오늘 아침 9시부터 오후 4시까지 있을 예정입니다 weareoneexo exo 엑소 t co cs70eh6ikt,ko
1,방송 막하지 말라던 최민호는 본인이 막히기 시작하는데 t co wqz58zzz5l,ko
2,rt lovablebh _ 0506 백현이 오늘 심각하게 unk 던지고 가만히 서서 빵싯 웃는 거 unk 뛰어 내려갈 때는 또 얼마나 unk t co j9e1xykief,ko
3,sm 사옥빌딩 앞에 있는 전광판차 사진들입니다 전광판차는 사옥 앞에서 오늘 아침 9시부터 오후 4시까지 있을 예정입니다 weareoneexo exo 엑소 t co cs70eh6ikt,ko
4,sm 사옥빌딩 앞에 있는 전광판차 사진들입니다 전광판차는 사옥 앞에서 오늘 아침 9시부터 오후 4시까지 있을 예정입니다 weareoneexo exo 엑소 t co cs70eh6ikt,ko


In [13]:
val_input.sort_values(by='name', inplace=True)

In [15]:
val_input.head()

Unnamed: 0,tweet_text,name
14618031,een ss monster in de europese unie van de j t co pspcrigsrh,af
14609658,vorige week toevallig keertje meegedaan aan top5over9 van skyradio101fm en mijn top 5 werd afgelopen maandag uitgezonden sliep zelf vanwege nachtdiensten maar dutchelevation hoorde het gister ontving ik mijn kado heel blij mee dank skyradio101fm t co uqgigyiyom,af
14609659,aberdeen to psv for 1p unk,af
14609660,rt apink _ 2011 에이핑크 naeun x chicor x bazaar fashion film unk t co jtkbl112te apink 손나은 나은 t co mcc4gxhdce,af
14609661,apple laat zijn apple maps plattegronddienst dit jaar in europa een flinke metamorfose ondergaan en geeft de navigatiekaarten meer details en functies domende maanden begint de fabrikant met de aanpassingen nadat deze eerst in de vs zijn doorgevoerd t co kl1nioo2dp,af


In [None]:
for language in languages:
    val_input.loc[val_input.name == language]

In [14]:
en_val_input = val_input.loc[val_input.name == 'en']

In [None]:
en_val_input.head()

In [16]:
fr_val_input = val_input.loc[val_input.name == 'fr']

In [17]:
stop_words = list(get_stop_words('fr'))         #About 900 stopwords
nltk_words = list(stopwords.words('french')) #About 150 stopwords
stop_words.extend(nltk_words)
stop_words.extend(['rt'])
stop_words.extend(['_'])

In [19]:
def stop_word_removal(row):
    stopword_removed_output = [w for w in row.split(' ') if not w in stop_words]
    lemmatized_output = [stemmer.stem(w) for w in stopword_removed_output]
    return lemmatized_output

In [20]:
fr_val_input['tweet_tokens'] = fr_val_input.apply(lambda x: stop_word_removal(x.tweet_text), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
fr_val_input.head()

Unnamed: 0,tweet_text,name,tweet_tokens
10507192,rt _ momo58 les fléchettes aux states c unk est un autre délire t co pwtg8pezun,fr,"[momo58, fléchet, stat, unk, délir, co, pwtg8pezun]"
10507191,rt paulineolg moi tu me dis ça j unk te traite de mytho en te regardant droit dans les yeux faut que j unk adoucisse mon coeur un peu,fr,"[paulineolg, dis, unk, trait, mytho, regard, droit, yeux, faut, unk, adouc, coeur]"
10507190,rt _ momo58 les fléchettes aux states c unk est un autre délire t co pwtg8pezun,fr,"[momo58, fléchet, stat, unk, délir, co, pwtg8pezun]"
10507189,rt biboo _ r6 retrouvez moi dimanche 2 février à 19h30 dans l émission radio esix présenté par le magnifique r3siak unk je compte sur vous unk,fr,"[biboo, r6, retrouv, dimanch, 2, févri, 19h30, émiss, radio, esix, présent, magnif, r3siak, unk, compt, unk]"
10507188,rt nayonek _ pour le dire plus poliment que certains tu as percé grâce à ta victimisation sur les réseaux et sur tes vidéos le fait que unk,fr,"[nayonek, dir, plus, pol, certain, perc, grâc, victimis, réseau, vidéos, unk]"


In [23]:
dictionary = corpora.Dictionary(fr_val_input.tweet_tokens)
corpus = [dictionary.doc2bow(text) for text in fr_val_input.tweet_tokens]

In [53]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(6, 2),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(6, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1)],
 [(6, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1)],
 [(6, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1)],
 [(47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1)],
 [(58, 1), (59, 1), (60, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(6, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1)],
 [(6, 2), (41, 1), (42, 1), (43, 1), 

In [33]:
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = 100, id2word=dictionary, passes=5, workers=96)

Process ForkPoolWorker-459:
Process ForkPoolWorker-454:
Process ForkPoolWorker-456:
Process ForkPoolWorker-455:
Process ForkPoolWorker-458:
Process ForkPoolWorker-460:
Traceback (most recent call last):
Process ForkPoolWorker-452:
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Process ForkPoolWorker-451:
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line

  File "/home/ubuntu/anaconda3/lib/python3.7/threading.py", line 1060, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 334, in worker_e_step
    chunk_no, chunk, worker_lda = input_queue.get()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._ta

KeyboardInterrupt
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/queues.py", line 93, in get
    with self._rlock:
  File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 334, in worker_e_step
    chunk_no, chunk, worker_lda = input_queue.get()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/queues.py", line 93, in get
    with self._rlock:
  File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 334, in worker_e_step
    chunk_no, chunk, worker_lda = input_queue.get()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/queues.py", line 93, in get
    with self._rlock:
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/queues.py", line 93, in get
    with self._rlock:
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/queues.py", line 93, in get
    with self._rlock:
  File "/home/ubu

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 334, in worker_e_step
    chunk_no, chunk, worker_lda = input_queue.get()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 337, in _exit_function
    _run_finalizers()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/queues.py", line 93, in get
    with self._rlock:
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/process.py", line 300, in _bootstrap
    util._exit_function()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/queues.py", line 93, in get
    with self._rlock:
  File "/home/ubuntu/anaconda3/lib/py

  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 201, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 201, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 201, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 201, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 201, in __call__
    res = self._callback(*self._args, **self._kwargs)

During handling of the above exception, another exception occurred:

  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 201

  File "/home/ubuntu/anaconda3/lib/python3.7/threading.py", line 1060, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
KeyboardInterrupt
  File "/home/ubuntu/anaconda3/lib/python3.7/threading.py", line 1060, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/home/ubuntu/anaconda3/lib/python3.7/threading.py", line 1060, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/home/ubuntu/anaconda3/lib/python3.7/threading.py", line 1060, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/home/ubuntu/anaconda3/lib/python3.7/multiprocessing/util.py", line 337, in _exit_function
    _run_finalizers()
  File "/home/ubuntu/anaconda3/lib/python3.7/threading.py", line 1060, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/home/ubuntu/anaconda3/lib/python3.7/threading.py", line 1060,

In [35]:
print(len(topics))

20


In [34]:
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(94, '0.079*"unk" + 0.031*"co" + 0.021*"messag" + 0.020*"plaidoi" + 0.014*"supprim" + 0.013*"qd" + 0.012*"directric" + 0.011*"va" + 0.010*"sent" + 0.009*"ignor"')
(0, '0.066*"unk" + 0.029*"co" + 0.012*"fcnant" + 0.012*"1" + 0.009*"agress" + 0.008*"mdrrrr" + 0.008*"2" + 0.008*"vendr" + 0.007*"bien" + 0.007*"plus"')
(66, '0.177*"unk" + 0.033*"co" + 0.018*"derb" + 0.017*"halamadrid" + 0.015*"derby" + 0.014*"attitud" + 0.009*"rien" + 0.009*"femm" + 0.007*"assist" + 0.007*"el"')
(47, '0.056*"unk" + 0.053*"co" + 0.012*"6" + 0.007*"1" + 0.007*"extraordinair" + 0.006*"chaussur" + 0.006*"3" + 0.005*"plus" + 0.005*"4" + 0.005*"thiem"')
(49, '0.113*"unk" + 0.034*"co" + 0.014*"ouais" + 0.011*"non" + 0.008*"ptdrrr" + 0.008*"prendr" + 0.007*"5" + 0.007*"awbach" + 0.007*"oubli" + 0.007*"plus"')
(63, '0.130*"unk" + 0.050*"co" + 0.023*"ptdrrr" + 0.012*"lebron" + 0.012*"poussin" + 0.009*"merveil" + 0.007*"adn" + 0.006*"préfer" + 0.006*"cloch" + 0.006*"anim"')
(92, '0.098*"unk" + 0.055*"co" + 0.045*"prêt

In [72]:
ldamodel.get_document_topics(corpus[0])

[(0, 0.0016676844),
 (1, 0.0016676844),
 (2, 0.0016676844),
 (3, 0.0016676844),
 (4, 0.0016676844),
 (5, 0.0016676844),
 (6, 0.0016676844),
 (7, 0.0016676844),
 (8, 0.0016676844),
 (9, 0.0016676844),
 (10, 0.0016676844),
 (11, 0.0016676844),
 (12, 0.0016676844),
 (13, 0.0016676844),
 (14, 0.0016676844),
 (15, 0.0016676844),
 (16, 0.0016676844),
 (17, 0.0016676844),
 (18, 0.0016676844),
 (19, 0.0016676844),
 (20, 0.0016676844),
 (21, 0.0016676844),
 (22, 0.0016676844),
 (23, 0.0016676844),
 (24, 0.0016676844),
 (25, 0.0016676844),
 (26, 0.0016676844),
 (27, 0.0016676844),
 (28, 0.0016676844),
 (29, 0.0016676844),
 (30, 0.0016676844),
 (31, 0.0016676844),
 (32, 0.0016676844),
 (33, 0.0016676844),
 (34, 0.0016676844),
 (35, 0.0016676844),
 (36, 0.0016676844),
 (37, 0.0016676844),
 (38, 0.0016676844),
 (39, 0.0016676844),
 (40, 0.0016676844),
 (41, 0.0016676844),
 (42, 0.0016676844),
 (43, 0.0016676844),
 (44, 0.0016676844),
 (45, 0.0016676844),
 (46, 0.0016676844),
 (47, 0.0016676844),
 (

In [82]:
all_topics = ldamodel.get_document_topics(corpus, per_word_topics=True)

for doc_topics, word_topics, phi_values in all_topics:
    print('New Document \n')
    print('Document topics:', doc_topics)
    print('Word topics:', word_topics)
    print('Phi values:', phi_values)
    print(" ")
    print('-------------- \n')

New Document 

Document topics: [(0, 0.001667684), (1, 0.001667684), (2, 0.001667684), (3, 0.001667684), (4, 0.001667684), (5, 0.001667684), (6, 0.001667684), (7, 0.001667684), (8, 0.001667684), (9, 0.001667684), (10, 0.001667684), (11, 0.001667684), (12, 0.001667684), (13, 0.001667684), (14, 0.001667684), (15, 0.001667684), (16, 0.001667684), (17, 0.001667684), (18, 0.001667684), (19, 0.001667684), (20, 0.001667684), (21, 0.001667684), (22, 0.001667684), (23, 0.001667684), (24, 0.001667684), (25, 0.001667684), (26, 0.001667684), (27, 0.001667684), (28, 0.001667684), (29, 0.001667684), (30, 0.001667684), (31, 0.001667684), (32, 0.001667684), (33, 0.001667684), (34, 0.001667684), (35, 0.001667684), (36, 0.001667684), (37, 0.001667684), (38, 0.001667684), (39, 0.001667684), (40, 0.001667684), (41, 0.001667684), (42, 0.001667684), (43, 0.001667684), (44, 0.001667684), (45, 0.001667684), (46, 0.001667684), (47, 0.001667684), (48, 0.001667684), (49, 0.001667684), (50, 0.001667684), (51, 0.0

New Document 

Document topics: [(0, 0.0025012307), (1, 0.0025012307), (2, 0.0025012307), (3, 0.0025012307), (4, 0.0025012307), (5, 0.0025012307), (6, 0.0025012307), (7, 0.0025012307), (8, 0.0025012307), (9, 0.0025012307), (10, 0.0025012307), (11, 0.0025012307), (12, 0.0025012307), (13, 0.0025012307), (14, 0.0025012307), (15, 0.0025012307), (16, 0.0025012307), (17, 0.0025012307), (18, 0.0025012307), (19, 0.0025012307), (20, 0.0025012307), (21, 0.0025012307), (22, 0.0025012307), (23, 0.0025012307), (24, 0.0025012307), (25, 0.0025012307), (26, 0.0025012307), (27, 0.75237817), (28, 0.0025012307), (29, 0.0025012307), (30, 0.0025012307), (31, 0.0025012307), (32, 0.0025012307), (33, 0.0025012307), (34, 0.0025012307), (35, 0.0025012307), (36, 0.0025012307), (37, 0.0025012307), (38, 0.0025012307), (39, 0.0025012307), (40, 0.0025012307), (41, 0.0025012307), (42, 0.0025012307), (43, 0.0025012307), (44, 0.0025012307), (45, 0.0025012307), (46, 0.0025012307), (47, 0.0025012307), (48, 0.0025012307),

New Document 

Document topics: [(0, 0.0005884072), (1, 0.0005884072), (2, 0.0005884072), (3, 0.0005884072), (4, 0.0005884072), (5, 0.10528291), (6, 0.38119242), (7, 0.0005884072), (8, 0.0005884072), (9, 0.0005884072), (10, 0.27801082), (11, 0.17902675), (12, 0.0005884072), (13, 0.0005884072), (14, 0.0005884072), (15, 0.0005884072), (16, 0.0005884072), (17, 0.0005884072), (18, 0.0005884072), (19, 0.0005884072), (20, 0.0005884072), (21, 0.0005884072), (22, 0.0005884072), (23, 0.0005884072), (24, 0.0005884072), (25, 0.0005884072), (26, 0.0005884072), (27, 0.0005884072), (28, 0.0005884072), (29, 0.0005884072), (30, 0.0005884072), (31, 0.0005884072), (32, 0.0005884072), (33, 0.0005884072), (34, 0.0005884072), (35, 0.0005884072), (36, 0.0005884072), (37, 0.0005884072), (38, 0.0005884072), (39, 0.0005884072), (40, 0.0005884072), (41, 0.0005884072), (42, 0.0005884072), (43, 0.0005884072), (44, 0.0005884072), (45, 0.0005884072), (46, 0.0005884072), (47, 0.0005884072), (48, 0.0005884072), (49, 

New Document 

Document topics: [(0, 0.00083350495), (1, 0.00083350495), (2, 0.00083350495), (3, 0.00083350495), (4, 0.00083350495), (5, 0.00083350495), (6, 0.00083350495), (7, 0.09262548), (8, 0.00083350495), (9, 0.00083350495), (10, 0.13238639), (11, 0.00083350495), (12, 0.00083350495), (13, 0.00083350495), (14, 0.00083350495), (15, 0.09999642), (16, 0.00083350495), (17, 0.00083350495), (18, 0.00083350495), (19, 0.00083350495), (20, 0.00083350495), (21, 0.00083350495), (22, 0.00083350495), (23, 0.00083350495), (24, 0.00083350495), (25, 0.00083350495), (26, 0.00083350495), (27, 0.00083350495), (28, 0.00083350495), (29, 0.00083350495), (30, 0.00083350495), (31, 0.00083350495), (32, 0.00083350495), (33, 0.00083350495), (34, 0.00083350495), (35, 0.38457543), (36, 0.00083350495), (37, 0.00083350495), (38, 0.00083350495), (39, 0.00083350495), (40, 0.00083350495), (41, 0.00083350495), (42, 0.00083350495), (43, 0.00083350495), (44, 0.00083350495), (45, 0.00083350495), (46, 0.00083350495), (4

New Document 

Document topics: [(0, 0.12527187), (1, 0.0012542927), (2, 0.0012542927), (3, 0.0012542927), (4, 0.0012542927), (5, 0.0012542927), (6, 0.0012542927), (7, 0.0012542927), (8, 0.0012542927), (9, 0.0012542927), (10, 0.0012542927), (11, 0.0012542927), (12, 0.0012542927), (13, 0.0012542927), (14, 0.0012542927), (15, 0.0012542927), (16, 0.0012542927), (17, 0.0012542927), (18, 0.0012542927), (19, 0.0012542927), (20, 0.0012542927), (21, 0.0012542927), (22, 0.0012542927), (23, 0.0012542927), (24, 0.0012542927), (25, 0.0012542927), (26, 0.0012542927), (27, 0.0012542927), (28, 0.14063121), (29, 0.0012542927), (30, 0.0012542927), (31, 0.0012542927), (32, 0.0012542927), (33, 0.0012542927), (34, 0.0012542927), (35, 0.0012542927), (36, 0.0012542927), (37, 0.0012542927), (38, 0.0012542927), (39, 0.0012542927), (40, 0.0012542927), (41, 0.0012542927), (42, 0.0012542927), (43, 0.0012542927), (44, 0.0012542927), (45, 0.0012542927), (46, 0.0012542927), (47, 0.0012542927), (48, 0.0012542927), (

New Document 

Document topics: [(0, 0.0014287417), (1, 0.0014287417), (2, 0.0014287417), (3, 0.0014287417), (4, 0.0014287417), (5, 0.0014287417), (6, 0.0014287417), (7, 0.0014287417), (8, 0.0014287417), (9, 0.0014287417), (10, 0.0014287417), (11, 0.0014287417), (12, 0.0014287417), (13, 0.0014287417), (14, 0.0014287417), (15, 0.0014287417), (16, 0.0014287417), (17, 0.0014287417), (18, 0.0014287417), (19, 0.0014287417), (20, 0.0014287417), (21, 0.0014287417), (22, 0.0014287417), (23, 0.0014287417), (24, 0.0014287417), (25, 0.0014287417), (26, 0.0014287417), (27, 0.0014287417), (28, 0.0014287417), (29, 0.0014287417), (30, 0.0014287417), (31, 0.0014287417), (32, 0.0014287417), (33, 0.0014287417), (34, 0.0014287417), (35, 0.0014287417), (36, 0.0014287417), (37, 0.0014287417), (38, 0.0014287417), (39, 0.0014287417), (40, 0.0014287417), (41, 0.0014287417), (42, 0.0014287417), (43, 0.0014287417), (44, 0.0014287417), (45, 0.0014287417), (46, 0.0014287417), (47, 0.0014287417), (48, 0.0014287417

New Document 

Document topics: [(0, 0.0011115726), (1, 0.0011115726), (2, 0.0011115726), (3, 0.0011115726), (4, 0.0011115726), (5, 0.0011115726), (6, 0.0011115726), (7, 0.0011115726), (8, 0.0011115726), (9, 0.0011115726), (10, 0.12066383), (11, 0.0011115726), (12, 0.0011115726), (13, 0.0011115726), (14, 0.0011115726), (15, 0.0011115726), (16, 0.0011115726), (17, 0.0011115726), (18, 0.0011115726), (19, 0.0011115726), (20, 0.0011115726), (21, 0.0011115726), (22, 0.0011115726), (23, 0.0011115726), (24, 0.0011115726), (25, 0.0011115726), (26, 0.0011115726), (27, 0.0011115726), (28, 0.0011115726), (29, 0.0011115726), (30, 0.0011115726), (31, 0.0011115726), (32, 0.0011115726), (33, 0.0011115726), (34, 0.0011115726), (35, 0.0011115726), (36, 0.0011115726), (37, 0.0011115726), (38, 0.0011115726), (39, 0.0011115726), (40, 0.0011115726), (41, 0.0011115726), (42, 0.0011115726), (43, 0.0011115726), (44, 0.0011115726), (45, 0.0011115726), (46, 0.0011115726), (47, 0.0011115726), (48, 0.0011115726),

New Document 

Document topics: [(0, 0.0011111139), (1, 0.0011111139), (2, 0.0011111139), (3, 0.0011111139), (4, 0.0011111139), (5, 0.0011111139), (6, 0.0011111139), (7, 0.0011111139), (8, 0.0011111139), (9, 0.0011111139), (10, 0.0011111139), (11, 0.0011111139), (12, 0.0011111139), (13, 0.0011111139), (14, 0.0011111139), (15, 0.88999975), (16, 0.0011111139), (17, 0.0011111139), (18, 0.0011111139), (19, 0.0011111139), (20, 0.0011111139), (21, 0.0011111139), (22, 0.0011111139), (23, 0.0011111139), (24, 0.0011111139), (25, 0.0011111139), (26, 0.0011111139), (27, 0.0011111139), (28, 0.0011111139), (29, 0.0011111139), (30, 0.0011111139), (31, 0.0011111139), (32, 0.0011111139), (33, 0.0011111139), (34, 0.0011111139), (35, 0.0011111139), (36, 0.0011111139), (37, 0.0011111139), (38, 0.0011111139), (39, 0.0011111139), (40, 0.0011111139), (41, 0.0011111139), (42, 0.0011111139), (43, 0.0011111139), (44, 0.0011111139), (45, 0.0011111139), (46, 0.0011111139), (47, 0.0011111139), (48, 0.0011111139),

New Document 

Document topics: [(0, 0.0020009773), (1, 0.0020009773), (2, 0.0020009773), (3, 0.0020009773), (4, 0.0020009773), (5, 0.0020009773), (6, 0.0020009773), (7, 0.0020009773), (8, 0.0020009773), (9, 0.0020009773), (10, 0.0020009773), (11, 0.0020009773), (12, 0.0020009773), (13, 0.0020009773), (14, 0.0020009773), (15, 0.0020009773), (16, 0.0020009773), (17, 0.0020009773), (18, 0.0020009773), (19, 0.0020009773), (20, 0.0020009773), (21, 0.0020009773), (22, 0.0020009773), (23, 0.0020009773), (24, 0.0020009773), (25, 0.0020009773), (26, 0.0020009773), (27, 0.0020009773), (28, 0.0020009773), (29, 0.0020009773), (30, 0.0020009773), (31, 0.0020009773), (32, 0.0020009773), (33, 0.0020009773), (34, 0.0020009773), (35, 0.3762474), (36, 0.0020009773), (37, 0.0020009773), (38, 0.0020009773), (39, 0.0020009773), (40, 0.0020009773), (41, 0.0020009773), (42, 0.0020009773), (43, 0.0020009773), (44, 0.0020009773), (45, 0.0020009773), (46, 0.0020009773), (47, 0.0020009773), (48, 0.2210852), (49

New Document 

Document topics: [(0, 0.0011165966), (1, 0.0011165966), (2, 0.0011165966), (3, 0.0011165966), (4, 0.27261433), (5, 0.0011165966), (6, 0.0011165966), (7, 0.0011165966), (8, 0.0011165966), (9, 0.0011165966), (10, 0.1448488), (11, 0.0011165966), (12, 0.0011165966), (13, 0.0011165966), (14, 0.0011165966), (15, 0.101680495), (16, 0.0011165966), (17, 0.18322448), (18, 0.0011165966), (19, 0.0011165966), (20, 0.0011165966), (21, 0.0011165966), (22, 0.0011165966), (23, 0.0011165966), (24, 0.0011165966), (25, 0.0011165966), (26, 0.0011165966), (27, 0.0011165966), (28, 0.0011165966), (29, 0.0011165966), (30, 0.0011165966), (31, 0.0011165966), (32, 0.0011165966), (33, 0.0011165966), (34, 0.0011165966), (35, 0.0011165966), (36, 0.0011165966), (37, 0.0011165966), (38, 0.0011165966), (39, 0.0011165966), (40, 0.0011165966), (41, 0.0011165966), (42, 0.0011165966), (43, 0.0011165966), (44, 0.0011165966), (45, 0.0011165966), (46, 0.0011165966), (47, 0.0011165966), (48, 0.0011165966), (49, 

New Document 

Document topics: [(0, 0.00052772666), (1, 0.31264547), (2, 0.00052772666), (3, 0.00052772666), (4, 0.00052772666), (5, 0.00052772666), (6, 0.00052772666), (7, 0.03988783), (8, 0.00052772666), (9, 0.00052772666), (10, 0.00052772666), (11, 0.00052772666), (12, 0.00052772666), (13, 0.00052772666), (14, 0.00052772666), (15, 0.00052772666), (16, 0.00052772666), (17, 0.00052772666), (18, 0.00052772666), (19, 0.19111449), (20, 0.00052772666), (21, 0.00052772666), (22, 0.00052772666), (23, 0.00052772666), (24, 0.00052772666), (25, 0.00052772666), (26, 0.00052772666), (27, 0.00052772666), (28, 0.00052772666), (29, 0.00052772666), (30, 0.00052772666), (31, 0.00052772666), (32, 0.110657856), (33, 0.00052772666), (34, 0.00052772666), (35, 0.00052772666), (36, 0.00052772666), (37, 0.00052772666), (38, 0.00052772666), (39, 0.00052772666), (40, 0.00052772666), (41, 0.00052772666), (42, 0.00052772666), (43, 0.00052772666), (44, 0.00052772666), (45, 0.22914879), (46, 0.00052772666), (47,

New Document 

Document topics: [(0, 0.0016666924), (1, 0.0016666924), (2, 0.0016666924), (3, 0.0016666924), (4, 0.0016666924), (5, 0.0016666924), (6, 0.0016666924), (7, 0.0016666924), (8, 0.0016666924), (9, 0.0016666924), (10, 0.0016666924), (11, 0.0016666924), (12, 0.0016666924), (13, 0.0016666924), (14, 0.0016666924), (15, 0.0016666924), (16, 0.0016666924), (17, 0.0016666924), (18, 0.0016666924), (19, 0.0016666924), (20, 0.0016666924), (21, 0.0016666924), (22, 0.0016666924), (23, 0.0016666924), (24, 0.0016666924), (25, 0.0016666924), (26, 0.0016666924), (27, 0.0016666924), (28, 0.0016666924), (29, 0.0016666924), (30, 0.0016666924), (31, 0.0016666924), (32, 0.0016666924), (33, 0.0016666924), (34, 0.0016666924), (35, 0.0016666924), (36, 0.0016666924), (37, 0.0016666924), (38, 0.0016666924), (39, 0.0016666924), (40, 0.66831696), (41, 0.0016666924), (42, 0.0016666924), (43, 0.0016666924), (44, 0.0016666924), (45, 0.0016666924), (46, 0.0016666924), (47, 0.0016666924), (48, 0.0016666924),

KeyboardInterrupt: 

In [78]:
all_topics_csr = gensim.matutils.corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()

In [92]:
all_topics_numpy

array([[0.00166769, 0.00166769, 0.00166769, ..., 0.00166769, 0.00166769,
        0.00166769],
       [0.00076925, 0.00076925, 0.00076925, ..., 0.00076925, 0.00076925,
        0.00076925],
       [0.00166774, 0.00166774, 0.00166774, ..., 0.00166774, 0.00166774,
        0.00166774],
       ...,
       [0.00066668, 0.00066668, 0.00066668, ..., 0.00066668, 0.00066668,
        0.00066668],
       [0.00250003, 0.00250003, 0.00250003, ..., 0.00250003, 0.00250003,
        0.00250003],
       [0.00083339, 0.00083339, 0.00083339, ..., 0.00083339, 0.00083339,
        0.00083339]])

In [109]:
topic_distribution_fr = pd.DataFrame(all_topics_numpy)

In [115]:
all_topics_numpy[0][89]

0.3045954704284668

In [117]:
topic_distribution_fr.loc[0][89]

0.3045954704284668

In [118]:
topic_distribution_fr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,...,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668
1,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,...,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769
2,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,...,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668
3,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,...,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588
4,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,...,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833


In [121]:
topic_distribution_fr.loc[10507192][89]

0.3045954704284668

In [97]:
len(topic_distribution_fr)

405160

In [105]:
fr_val_input.head()

Unnamed: 0,tweet_text,name,tweet_tokens
10507192,rt _ momo58 les fléchettes aux states c unk est un autre délire t co pwtg8pezun,fr,"[momo58, fléchet, stat, unk, délir, co, pwtg8pezun]"
10507191,rt paulineolg moi tu me dis ça j unk te traite de mytho en te regardant droit dans les yeux faut que j unk adoucisse mon coeur un peu,fr,"[paulineolg, dis, unk, trait, mytho, regard, droit, yeux, faut, unk, adouc, coeur]"
10507190,rt _ momo58 les fléchettes aux states c unk est un autre délire t co pwtg8pezun,fr,"[momo58, fléchet, stat, unk, délir, co, pwtg8pezun]"
10507189,rt biboo _ r6 retrouvez moi dimanche 2 février à 19h30 dans l émission radio esix présenté par le magnifique r3siak unk je compte sur vous unk,fr,"[biboo, r6, retrouv, dimanch, 2, févri, 19h30, émiss, radio, esix, présent, magnif, r3siak, unk, compt, unk]"
10507188,rt nayonek _ pour le dire plus poliment que certains tu as percé grâce à ta victimisation sur les réseaux et sur tes vidéos le fait que unk,fr,"[nayonek, dir, plus, pol, certain, perc, grâc, victimis, réseau, vidéos, unk]"


In [122]:
fr_val_feature_df = pd.concat([fr_val_input, topic_distribution_fr], axis = 1)

In [119]:
topic_distribution_fr.index = fr_val_input.index

In [123]:
fr_val_feature_df.head()

Unnamed: 0,tweet_text,name,tweet_tokens,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
10507192,rt _ momo58 les fléchettes aux states c unk est un autre délire t co pwtg8pezun,fr,"[momo58, fléchet, stat, unk, délir, co, pwtg8pezun]",0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,...,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668
10507191,rt paulineolg moi tu me dis ça j unk te traite de mytho en te regardant droit dans les yeux faut que j unk adoucisse mon coeur un peu,fr,"[paulineolg, dis, unk, trait, mytho, regard, droit, yeux, faut, unk, adouc, coeur]",0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,...,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769,0.000769
10507190,rt _ momo58 les fléchettes aux states c unk est un autre délire t co pwtg8pezun,fr,"[momo58, fléchet, stat, unk, délir, co, pwtg8pezun]",0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,...,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668,0.001668
10507189,rt biboo _ r6 retrouvez moi dimanche 2 février à 19h30 dans l émission radio esix présenté par le magnifique r3siak unk je compte sur vous unk,fr,"[biboo, r6, retrouv, dimanch, 2, févri, 19h30, émiss, radio, esix, présent, magnif, r3siak, unk, compt, unk]",0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,...,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588
10507188,rt nayonek _ pour le dire plus poliment que certains tu as percé grâce à ta victimisation sur les réseaux et sur tes vidéos le fait que unk,fr,"[nayonek, dir, plus, pol, certain, perc, grâc, victimis, réseau, vidéos, unk]",0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,...,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833,0.000833


In [65]:
corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]

In [61]:
ldamodel[corpus[0] minimum_probability=0.0]

SyntaxError: invalid syntax (<ipython-input-61-21abbc921170>, line 1)

In [None]:
univectorizer = CountVectorizer(analyzer = "word", min_df = 0.0, ngram_range = (1,1), strip_accents= None, tokenizer = None)

In [None]:
unicorpus = univectorizer.fit_transform(fr_val_input["tweet_tokens"])

In [None]:
unigrams = univectorizer.get_feature_names()

In [None]:
unigrams

In [None]:
print(unicorpus[0,:])

In [None]:
unicorpus.shape

In [None]:
unigrams[10204]

In [None]:
lda_model = lda.LDA(n_topics = 10)

In [None]:
lda_model.fit(unicorpus)

In [None]:
lda_model.

In [None]:
doc_topic = lda_model.doc_topic_

In [None]:
len(doc_topic)

In [None]:
topic_word = lda_model.topic_word_  # model.components_ also works
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_word_indexes = [np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_words = np.array(unigrams)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    print(topic_word_indexes)

In [None]:
print(topic_word.shape)

In [None]:
len(fr_val_input)