In [7]:
import pandas as pd
import s3fs
import boto3
from io import StringIO # python3; python2: BytesIO 
from boto3.s3.transfer import TransferConfig
import numpy as np
import re
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import gensim
from gensim import models
from gensim import corpora
import lda

In [4]:
stop_words = ['rt','_']
stop_words

['rt', '_']

In [None]:
def preprocessing(row):
    stopword_removed_output = [w for w in row.split(' ') if not w in stop_words]
    return stopword_removed_output

In [10]:
train_input = pd.read_csv('s3://recsys-challenge-2020/train_input.csv')
val_input = pd.read_csv('s3://recsys-challenge-2020/val_input.csv')
test_input = pd.read_csv('s3://recsys-challenge-2020/test_input.csv')

In [None]:
all_data = pd.concat(train_input, val_input, test_input)

In [13]:
full_input = all_input.sort_values(by='name')

In [None]:
languages = list(full_input.name.unique())

In [None]:
all_lang_output = pd.DataFrame()
for language in languages:
    lang_input = full_input.loc[full_input.name == language]
    lang_input['tweet_tokens'] = lang_input.apply(lambda x: preprocessing(x.tweet_text), axis = 1)
    dictionary = corpora.Dictionary(lang_input.tweet_tokens)
    corpus = [dictionary.doc2bow(text) for text in lang_input.tweet_tokens]
    ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = 100, id2word=dictionary, passes=5, workers=96)
    all_topics = ldamodel.get_document_topics(corpus, minimum_probability=0.0)
    all_topics_csr = gensim.matutils.corpus2csc(all_topics)
    all_topics_numpy = all_topics_csr.T.toarray()
    all_topics_df = pd.DataFrame(all_topics_numpy)
    all_topics_df.index = lang_input.index
    lang_output = pd.concat([lang_input, all_topics_df], axis = 1)
    all_lang_output = pd.concat(all_lang_output, lang_output)

In [21]:
fr_val_input.head()

Unnamed: 0,tweet_text,name,tweet_tokens
10507192,rt _ momo58 les fléchettes aux states c unk est un autre délire t co pwtg8pezun,fr,"[momo58, fléchet, stat, unk, délir, co, pwtg8pezun]"
10507191,rt paulineolg moi tu me dis ça j unk te traite de mytho en te regardant droit dans les yeux faut que j unk adoucisse mon coeur un peu,fr,"[paulineolg, dis, unk, trait, mytho, regard, droit, yeux, faut, unk, adouc, coeur]"
10507190,rt _ momo58 les fléchettes aux states c unk est un autre délire t co pwtg8pezun,fr,"[momo58, fléchet, stat, unk, délir, co, pwtg8pezun]"
10507189,rt biboo _ r6 retrouvez moi dimanche 2 février à 19h30 dans l émission radio esix présenté par le magnifique r3siak unk je compte sur vous unk,fr,"[biboo, r6, retrouv, dimanch, 2, févri, 19h30, émiss, radio, esix, présent, magnif, r3siak, unk, compt, unk]"
10507188,rt nayonek _ pour le dire plus poliment que certains tu as percé grâce à ta victimisation sur les réseaux et sur tes vidéos le fait que unk,fr,"[nayonek, dir, plus, pol, certain, perc, grâc, victimis, réseau, vidéos, unk]"


In [23]:
dictionary = corpora.Dictionary(fr_val_input.tweet_tokens)
# corpus = [dictionary.doc2bow(text) for text in text_data]

In [24]:
corpus = [dictionary.doc2bow(text) for text in fr_val_input.tweet_tokens]

In [25]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(6, 2),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(6, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1)],
 [(6, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1)],
 [(6, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1)],
 [(47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1)],
 [(58, 1), (59, 1), (60, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(6, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1)],
 [(6, 2), (41, 1), (42, 1), (43, 1), 

In [None]:
dictionary

In [32]:
import gensim
NUM_TOPICS = 100

In [33]:
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=5, workers=96)

In [35]:
print(len(topics))

20


In [34]:
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(94, '0.079*"unk" + 0.031*"co" + 0.021*"messag" + 0.020*"plaidoi" + 0.014*"supprim" + 0.013*"qd" + 0.012*"directric" + 0.011*"va" + 0.010*"sent" + 0.009*"ignor"')
(0, '0.066*"unk" + 0.029*"co" + 0.012*"fcnant" + 0.012*"1" + 0.009*"agress" + 0.008*"mdrrrr" + 0.008*"2" + 0.008*"vendr" + 0.007*"bien" + 0.007*"plus"')
(66, '0.177*"unk" + 0.033*"co" + 0.018*"derb" + 0.017*"halamadrid" + 0.015*"derby" + 0.014*"attitud" + 0.009*"rien" + 0.009*"femm" + 0.007*"assist" + 0.007*"el"')
(47, '0.056*"unk" + 0.053*"co" + 0.012*"6" + 0.007*"1" + 0.007*"extraordinair" + 0.006*"chaussur" + 0.006*"3" + 0.005*"plus" + 0.005*"4" + 0.005*"thiem"')
(49, '0.113*"unk" + 0.034*"co" + 0.014*"ouais" + 0.011*"non" + 0.008*"ptdrrr" + 0.008*"prendr" + 0.007*"5" + 0.007*"awbach" + 0.007*"oubli" + 0.007*"plus"')
(63, '0.130*"unk" + 0.050*"co" + 0.023*"ptdrrr" + 0.012*"lebron" + 0.012*"poussin" + 0.009*"merveil" + 0.007*"adn" + 0.006*"préfer" + 0.006*"cloch" + 0.006*"anim"')
(92, '0.098*"unk" + 0.055*"co" + 0.045*"prêt

In [39]:
ldamodel.get_document_topics(corpus[89])

[(34, 0.20132528), (49, 0.39939395), (54, 0.20507498)]

In [40]:
ldamodel[corpus[5]]

[(15, 0.88999975)]

In [None]:
univectorizer = CountVectorizer(analyzer = "word", min_df = 0.0, ngram_range = (1,1), strip_accents= None, tokenizer = None)

In [None]:
unicorpus = univectorizer.fit_transform(fr_val_input["tweet_tokens"])

In [None]:
unigrams = univectorizer.get_feature_names()

In [None]:
unigrams

In [None]:
print(unicorpus[0,:])

In [None]:
unicorpus.shape

In [None]:
unigrams[10204]

In [None]:
lda_model = lda.LDA(n_topics = 10)

In [None]:
lda_model.fit(unicorpus)

In [None]:
lda_model.

In [None]:
doc_topic = lda_model.doc_topic_

In [None]:
len(doc_topic)

In [None]:
topic_word = lda_model.topic_word_  # model.components_ also works
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_word_indexes = [np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_words = np.array(unigrams)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    print(topic_word_indexes)

In [None]:
print(topic_word.shape)

In [None]:
len(fr_val_input)