notebook: https://github.com/some-labs-24/data-science/blob/master/python_notebooks/SoMe_NLP_Topic_Modeling.ipynb

In [1]:
!pip install emoji --upgrade
!pip install pandas-profiling==2.*
!pip install plotly==4.*
!pip install pyldavis
!pip install gensim
!pip install chart_studio
!pip install --upgrade autopep8

# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download fr_core_news_lg







In [2]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
#from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
#from wordcloud import STOPWORDS
#stopwords = set(STOPWORDS)


`scipy.sparse.sparsetools` is deprecated!
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.



In [None]:
!pip install demoji

import demoji
demoji.download_codes()

In [9]:
tweets_df = pd.read_json('../input/iphone-12-tweets-fr.json', lines=True)
# garder que les colonnes importantes
cols = ['date', 'content', 'lang']
tweets_df = tweets_df[cols]
# pour etre sur que tout les tweets sont en français
tweets_df = tweets_df[tweets_df['lang'] == 'fr']
tweets_df['lang'].unique()
# supprimer les tweets dupliqués, 1K ont été supprimés

tweets_df = tweets_df.sort_values("content") 
  
# dropping ALL duplicte values 
tweets_df = tweets_df.drop_duplicates(subset ="content", keep = 'first')

In [12]:
# selectionner que les tweets qui répondent au requetes de l'utilisateur

data = tweets_df['content']

keywords = ['stockage', 'écran', 'autonomie', 'réseau', 'alimentation',
           'appareil', 'appareil photo', 'processeur', 'multimédia', 'résolution',
           'batterie', 'système d\'exploitation', 'os', 'pièces', 'couleur',
           'Communication', 'sans fil', 'synchronisation', 'coloris', 'poids',
           'dimensions']

data = data[data.str.contains('|'.join(keywords), case=False)]

ads_words = [
    '#concours', '#jeuconcours', '#giveaway',
    '#gagne' ,'#gangner', '#promo', '#promotion', '#publicité',
    '#contest', '#ad', '#pub', '#réduction']

data = data[~data.str.contains('|'.join(ads_words), case=False)]

In [98]:
df = pd.DataFrame(data)
df.columns = ['original_tweets']

In [99]:
def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    return demoji.replace(text, '').strip()

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'(?:\@|https?\://)\S+', '', text)
    return text

# remplacement des abréviations

with open('../input/abrivot_fr.json', encoding='utf-8') as f:
    abrivot = json.load(f)
    
def replace_abrivot(text):
    '''
    input: string
    output: string
    '''
    words = text.lower().split()
    text_out = [abrivot[word] if word in abrivot else word for word in words]
    return ' '.join(text_out)

In [100]:
# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['original_tweets'].apply(give_emoji_free_text)

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

df['abrivots_free_tweets'] = df['url_free_tweets'].apply(replace_abrivot)

In [101]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
nlp = spacy.load('fr_core_news_lg')

In [102]:
# mettre à jour la liste d'aprés le fichies des mots vides
with open('../input/fr_stopwords.txt', encoding='utf-8') as f:
    fr_stopwords = f.read().splitlines()
    
# Custom stopwords
custom_stopwords = ['\n','\n\n', '&amp;', ' ', '.', '-','$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(fr_stopwords)

In [103]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

tokens = []

for doc in tokenizer.pipe(df['abrivots_free_tweets'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in ALL_STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

In [105]:
# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

In [106]:
# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$



In [107]:
df

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets,abrivots_free_tweets,tokens,tokens_back_to_text,lemmas,lemmas_back_to_text,lemma_tokens
21493,!!!! Jsuis en désaccord avec mon poto : votez ...,!!!! Jsuis en désaccord avec mon poto : votez ...,!!!! Jsuis en désaccord avec mon poto : votez ...,!!!! je suis en désaccord avec mon poto : vote...,"[!!!!, désaccord, :, votez, belle, couleur, d’...",!!!! désaccord : votez belle couleur d’iphone ...,"[désaccord, voter, bel, couleur, iphone, 12, p...",désaccord voter bel couleur iphone 12 pro max,"[désaccord, voter, bel, couleur, iphone, 12, p..."
13326,"""\n \n Actualité : L...","""\n \n Actualité : L...","""\n \n Actualité : L...",""" actualité : le format apple proraw disponibl...","["", actualité, :, format, apple, proraw, dispo...",""" actualité : format apple proraw disponible i...","[actualité, format, apple, proraw, disponible,...",actualité format apple proraw disponible iphon...,"[actualité, format, apple, proraw, disponible,..."
27727,"""Apple : certains iPhone 12 mini souffrent d'u...","""Apple : certains iPhone 12 mini souffrent d'u...","""Apple : certains iPhone 12 mini souffrent d'u...","""apple : certains iphone 12 mini souffrent d'u...","[""apple, :, iphone, 12, mini, souffrent, d'un,...","""apple : iphone 12 mini souffrent d'un problèm...","[apple, iphone, 12, mini, souffrir, problème, ...",apple iphone 12 mini souffrir problème réactiv...,"[apple, iphone, 12, mini, souffrir, problème, ..."
25119,"""Apple publie une mise à jour d'iOS pour corri...","""Apple publie une mise à jour d'iOS pour corri...","""Apple publie une mise à jour d'iOS pour corri...","""apple publie une mise à jour d'ios pour corri...","[""apple, publie, mise, jour, d'ios, corriger, ...","""apple publie mise jour d'ios corriger bugs d'...","[apple, publier, mettre, jour, io, corriger, b...",apple publier mettre jour io corriger bug écra...,"[apple, publier, mettre, jour, io, corriger, b..."
22736,"""Apple souhaiterait que le texte ne l'expose p...","""Apple souhaiterait que le texte ne l'expose p...","""Apple souhaiterait que le texte ne l'expose p...","""apple souhaiterait que le texte ne l'expose p...","[""apple, souhaiterait, texte, l'expose, conséq...","""apple souhaiterait texte l'expose conséquence...","[apple, souhaiter, texte, expose, conséquence,...",apple souhaiter texte expose conséquence cas l...,"[apple, souhaiter, texte, expose, conséquence,..."
...,...,...,...,...,...,...,...,...,...
34338,🤩 iPhone 12 🤩\n\nCraquez pour le nouvel #iPhon...,iPhone 12 \n\nCraquez pour le nouvel #iPhone12...,iPhone 12 \n\nCraquez pour le nouvel #iPhone12...,iphone 12 craquez pour le nouvel #iphone12 et ...,"[iphone, 12, craquez, nouvel, #iphone12, desig...",iphone 12 craquez nouvel #iphone12 design incr...,"[iphone, 12, craquer, nouveau, iphone12, desig...",iphone 12 craquer nouveau iphone12 design incr...,"[iphone, 12, craquer, nouveau, iphone12, desig..."
32618,🤩iPhone 12🤩\n\nOn craque pour le rouge de l'iP...,iPhone 12\n\nOn craque pour le rouge de l'iPho...,iPhone 12\n\nOn craque pour le rouge de l'iPho...,iphone 12 on craque pour le rouge de l'iphone ...,"[iphone, 12, craque, rouge, l'iphone, 12,, ?, ...","iphone 12 craque rouge l'iphone 12, ? venez dé...","[iphone, 12, craque, rouge, iphone, 12, venir,...",iphone 12 craque rouge iphone 12 venir découvr...,"[iphone, 12, craque, rouge, iphone, 12, venir,..."
49871,🤪 Bon avec tout ce buzz autour de Macron et le...,Bon avec tout ce buzz autour de Macron et le #...,Bon avec tout ce buzz autour de Macron et le #...,bon avec tout ce buzz autour de macron et le #...,"[buzz, autour, macron, #covid19, ,, sortie, l’...","buzz autour macron #covid19 , sortie l’iphone ...","[buzz, autour, macron, covid19, sortir, iphone...",buzz autour macron covid19 sortir iphone 12 mi...,"[buzz, autour, macron, covid19, sortir, iphone..."
62347,🦄 iPhone 12 : une autonomie sensiblement plus ...,iPhone 12 : une autonomie sensiblement plus im...,iPhone 12 : une autonomie sensiblement plus im...,iphone 12 : une autonomie sensiblement plus im...,"[iphone, 12, :, autonomie, sensiblement, impor...",iphone 12 : autonomie sensiblement importante ...,"[iphone, 12, autonomie, sensiblement, importan...",iphone 12 autonomie sensiblement important media,"[iphone, 12, autonomie, sensiblement, importan..."


# Topic Modeling

In [108]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

12474


In [109]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

5742


In [110]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

In [111]:
# Instantiating a Base LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)


In [112]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

In [113]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [114]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
12 iphone apple iphone12 mini accessoire test coqu protection iphone12mini

------ Topic 1 ------
12 iphone pro max euro photo batterie accessoire nouveau iphon

------ Topic 2 ------
12 iphone photo prendre iphone12 couleur pro nouveau apple écran

------ Topic 3 ------
iphone 12 pro io apple écran max mini 5 batterie

------ Topic 4 ------
iphone 12 5 apple gramme mini pro iphone12 écran batterie



In [115]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -6.638487390526969

Coherence Score:  0.2627778941383277


In [116]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(base_model, corpus, id2word)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





# Hypermarameter Tuning

In [117]:
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])

In [118]:


# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
             iid=True, n_jobs=1,
             param_grid={'n_topics': [10, 15, 20, 30], 
                         'learning_decay': [0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)

GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method=None,
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=1,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_

In [119]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -251964.004068782
Model Perplexity:  1030.5597887572637


In [120]:
#Defining a function to loop over number of topics to be used to find an 
#optimal number of tipics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the 
    LDA model with respective number of topics
    """
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list_topic.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    return model_list_topic, coherence_values_topic

In [121]:
# Can take a long time to run.
model_list_topic, coherence_values_topic = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=df['lemma_tokens'],
                                                        start=2, limit=200, step=6)

In [125]:
model_5_2 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=68,
                       random_state=42,
                       chunksize=2000,
                       passes=25,
                       decay=0.5,
                       iterations=70)

In [126]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = model_5_2.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=model_5_2, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -6.770005863053338

Coherence Score:  0.34552276209578503


In [127]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_5_2, corpus, id2word)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.



