In [None]:
import glob
import os
from collections import Counter, defaultdict
from time import time

import lda
import lda.datasets
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Gather the text of every French-language row from the transcript CSV exports.
path = 'datas/Transcript/'
# glob returns files in filesystem order; sort so that positional look-ups
# such as dataset[0] give the same document on every run.
allFiles = sorted(glob.glob(os.path.join(path, '*.csv')))

dataset = []
for file_ in allFiles:
    data = pd.read_csv(file_)
    # Keep rows whose Text is present (not NaN) and whose language tag is 'FR'.
    is_french_text = data['Text'].notna() & (data['LanguageOfText'] == 'FR')
    # extend() appends in place instead of re-copying the whole list per file.
    dataset.extend(data.loc[is_french_text, 'Text'].tolist())

print(len(dataset))
dataset[0]

In [None]:
# Spot-check a single document. The index 301 is hard-coded, so this raises
# IndexError when fewer than 302 documents were loaded.
print(dataset[301])

In [None]:
# Settings for the scikit-learn topic-model cells.
# n_samples: nothing in the visible cells sub-samples the documents to this size.
n_samples = 2000
# n_features: passed to CountVectorizer as max_features.
n_features = 1000
# n_topics: number of topics requested from LatentDirichletAllocation.
n_topics = 10
# n_top_words: number of terms print_top_words shows per topic.
n_top_words = 20

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to show for each topic.

    For every topic a ``Topic #<idx>:`` header line is printed, followed by
    one line with the terms ordered from highest to lowest weight. A single
    blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_terms))
    print()

# All loaded documents are vectorised; no sub-sampling happens here.
data_samples = dataset

# Words excluded from the vocabulary before counting.
stop_words = [
    'de', 'il', 'et', 'je', 'nous', 'sur', 'par', 'der', 'die', 'und', 'le',
    'la', 'les', 'qui', 'que', 'du', 'un', 'une', 'dans', 'ne', 'on', 'au',
    'ce', 'ans', 'vs', 'commission', 'se', 'leur', 'ont', 'si', 'aux',
    'votre', 'qu',
]

# The LDA model is fitted on raw term counts (tf), hence CountVectorizer.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words=stop_words,
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
# Only the fit/transform step is timed.
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

In [None]:
# NOTE: this re-defines print_top_words, which is already defined in the
# feature-extraction cell; after a top-to-bottom run this later definition is
# the one in effect.
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to show for each topic.

    For every topic a ``Topic #<idx>:`` header line is printed, followed by
    one line with the terms ordered from highest to lowest weight. A single
    blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in ranked]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_terms))
    print()

# Fit scikit-learn's online LDA on the term-count matrix `tf` and print the
# top terms of each topic.
#
# The log line reports the actual shape of `tf`: every document is vectorised,
# so the n_samples constant does not describe what is being fitted.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (tf.shape[0], tf.shape[1]))
# NOTE: assigning to `lda` rebinds the name of the `lda` package imported in
# the first cell. The name is kept because the next cell passes `lda` to
# print_top_words.
# n_components is the current keyword for the topic count; the older n_topics
# keyword was deprecated and later removed from scikit-learn.
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

In [None]:

    
# Show the fitted topics again without refitting the model.
print_top_words(model=lda,
                feature_names=tf_feature_names,
                n_top_words=n_top_words)

In [None]:
import logging, gensim, bz2

# Remove common words and tokenize.
# This is the same word list as `stop_words` in the scikit-learn cell.
stoplist = ['de', 'il', 'et', 'je', 'nous', 'sur', 'par', 'der', 'die', 'und', 'le', 'la', 'les', 'qui', 'que', 'du', 'un', 'une',
             'dans', 'ne', 'on', 'au', 'ce', 'ans', 'vs', 'commission', 'se', 'leur', 'ont', 'si', 'aux', 'votre', 'qu']
# Constant-time membership tests; `stoplist` itself stays a list.
_stopset = frozenset(stoplist)

# NOTE(review): str.split() only splits on whitespace, so punctuation stays
# attached to tokens (e.g. "de," is not matched by the stop list). Confirm
# whether that is intended before comparing with the CountVectorizer topics.
texts = [[word for word in document.lower().split() if word not in _stopset]
         for document in dataset]

# Remove words that appear only once in the whole corpus.
frequency = Counter(token for text in texts for token in text)

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

In [None]:
from gensim import corpora, models

# Build the gensim dictionary from the tokenised documents, then convert each
# document with dictionary.doc2bow (one corpus entry per document, same order
# as `texts`).
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))


In [None]:
# Fit gensim's LDA on the bag-of-words corpus (10 topics, single pass).
# random_state pins the seed so repeated runs give the same topics, matching
# the random_state=0 used for the scikit-learn model.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary,
                                           passes=1, random_state=0)

In [None]:
# Print a short preview (3 topics, 3 words each), then leave the default
# print_topics() result as the cell's displayed output.
print(ldamodel.print_topics(num_topics=3, num_words=3))
ldamodel.print_topics()