In [19]:
import pickle

# load the model
lda_model = pickle.load(open("../saved models/ldamodel.pkl", "rb"))

In [43]:
from tensorflow import keras
gru_model = keras.models.load_model('../saved models/gru_model.h5')

# Prepare the topics

In [13]:
# load the corpus and the words dictionnary

with open("../saved models/corpus.txt", "rb") as fp:
    corpus = pickle.load(fp)

with open('../saved models/id2word.pkl', 'rb') as fp:
    id2word = pickle.load(fp)

In [20]:
lda_model.num_topics

27

In [21]:
from pprint import pprint
# display topics
pprint(lda_model.show_topics(formatted=True))

[(9,
  '0.611*"couleur" + 0.048*"fonds" + 0.048*"impression" + 0.041*"coque_etui" + '
  '0.040*"bonplan" + 0.038*"esr" + 0.035*"belkin" + 0.035*"chargement" + '
  '0.034*"histoire" + 0.026*"vitre"'),
 (25,
  '0.163*"main" + 0.090*"carte" + 0.088*"minute" + 0.078*"ligne" + '
  '0.078*"reduction" + 0.044*"nouveaute" + 0.043*"interet" + 0.041*"rumeur" + '
  '0.038*"antenne" + 0.036*"photographe"'),
 (0,
  '0.390*"qualite" + 0.252*"photo" + 0.095*"suite" + 0.085*"rapport" + '
  '0.047*"gnt" + 0.039*"pote" + 0.029*"enfant" + 0.024*"fabrication" + '
  '0.017*"achat" + 0.007*"part"'),
 (18,
  '0.742*"prix" + 0.049*"baisse" + 0.045*"lien" + 0.032*"demande" + '
  '0.015*"affaire" + 0.014*"mettre_fin" + 0.012*"deal" + 0.011*"entreprise" + '
  '0.011*"plan" + 0.010*"conseil"'),
 (5,
  '0.327*"taille" + 0.124*"forfait" + 0.067*"pouce" + 0.064*"souci" + '
  '0.060*"fonction" + 0.059*"stockage" + 0.046*"post" + 0.039*"chance" + '
  '0.032*"dollar" + 0.030*"android"'),
 (20,
  '0.137*"commande" + 0.1

In [22]:
# distribution of topics for each document

tm_results = lda_model[corpus]

# We can get the most dominant topic of each document as below:
corpus_topics = [sorted(topics, key=lambda record: -record[1])[0] for topics in tm_results]

In [35]:
# get most probable words for the given topicis

num_keywords = 10
topics = [[(term,
            round(wt, 3)) for term,
           wt in lda_model.show_topic(n, topn=num_keywords)] for n in range(0, lda_model.num_topics)]

In [41]:
# set column width
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame([', '.join([term for term, wt in topic]) for topic in topics],
                         columns = ['Terme par Sujet'],
                         index=['Sujet '+str(t) for t in range(1, lda_model.num_topics+1)])

In [28]:
import pandas as pd

data_free_punct = pd.read_pickle("../saved models/data_free_punct.pkl")

In [39]:
# Dominant Topic for each Tweet

corpus_topic_df = pd.DataFrame()

# get the Titles from the original dataframe
corpus_topic_df['Tweet_id'] = data_free_punct.index
corpus_topic_df['Topic'] = [item[0]+1 for item in corpus_topics]
#corpus_topic_df['Contribution %'] = [round(item[1]*100, 2) for item in corpus_topics]
corpus_topic_df['Keywords'] = [topics_df.iloc[t[0]]['Terme par Sujet'] for t in corpus_topics]
corpus_topic_df.head(10)

Unnamed: 0,Tweet_id,Topic,Keywords
0,19381,22,"coque, protection, cote, format, chute, chaine, choix, bouton, exemple, etui_protection"
1,6029,25,"telephone, demontage, film, marque, balle, coloris, mere, reponse, verre, interesse"
2,61267,8,"chargeur, boite, ecouteur, charge, fil, livre, voiture, usb, secteur, absence"
3,61645,8,"chargeur, boite, ecouteur, charge, fil, livre, voiture, usb, secteur, absence"
4,53240,4,"vie, pixel, face, mode, bug, raison, concours, min, tete, option"
5,11631,19,"prix, baisse, lien, demande, affaire, mettre_fin, deal, entreprise, plan, conseil"
6,2833,1,"qualite, photo, suite, rapport, gnt, pote, enfant, fabrication, achat, part"
7,53242,8,"chargeur, boite, ecouteur, charge, fil, livre, voiture, usb, secteur, absence"
8,32661,27,"charge, reseau, probleme, compte, serie, attention, message, precommande, piece, maison"
9,48756,1,"qualite, photo, suite, rapport, gnt, pote, enfant, fabrication, achat, part"


In [50]:
# Topic distribution over the corpus
import numpy as np

topics_dist = corpus_topic_df.groupby('Topic').agg(
                                  Number_of_Documents = ('Tweet_id', np.size),
                                  Topic_Contribution = ('Tweet_id', np.size)).reset_index()

topics_dist['Topic_Contribution'] = topics_dist['Topic_Contribution'].\
                                        apply(lambda row: round((row*100) / len(corpus), 2))

topics_dist

Unnamed: 0,Topic,Number_of_Documents,Topic_Contribution
0,1,3535,18.36
1,2,363,1.89
2,3,261,1.36
3,4,257,1.34
4,5,872,4.53
5,6,463,2.41
6,7,553,2.87
7,8,1556,8.08
8,9,1370,7.12
9,10,647,3.36


# Measuring the sentiments

In [57]:
# load the tweets

df = pd.read_json('../input/iphone-12-tweets-fr.json', lines=True)
df = df[df['lang'] == 'fr']
df = df.sort_values("content") 
  
# dropping ALL duplicte values 
df = df.drop_duplicates(subset ="content", keep = 'first')

In [58]:
with open('../input/keywords.txt', encoding='utf-8') as f:
    keywords = f.read().splitlines()

df = df[df.content.str.contains('|'.join(keywords), case=False)]

# supprimer les tweets inutiles (publicité, concours ..)

with open('../input/ads_words.txt', encoding='utf-8') as f:
    ads_words = f.read().splitlines()

df = df[~df.content.str.contains('|'.join(ads_words), case=False)]

In [60]:
df.content.head()

19381    "\n            \n                Actualité : Le format Apple ProRAW disponible sur iPhone 12 Pro et 12 Pro Max avec iOS 14.3            \n        " https://t.co/PBtLEm2s9C #AnglohaTech #Anglohasys
6029                              " Apple est une marque de snobinards " vient de commander son Iphone 12 pro chez nous 🥱\n\nContactez nous pour vos achats avec ou sans reprise de téléphone 😊\n\n#AboRijaal
61267                                                                                                      " Pour des raisons écologiques, les iPhone 12 sont livrés sans chargeur. " arrête un peu ma gueule
61645                                                                                                      " Pour des raisons écologiques, les iPhone 12 sont livrés sans chargeur. " https://t.co/OCEJpZd011
53240                                                                                                       "5G : en quoi consiste le Smart Data Mode de l’iPhone 12 ?" \n      

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gensim.utils import simple_preprocess

import spacy

import re
import json
from html.parser import HTMLParser
from io import StringIO

In [61]:
tweets = df['content'].values.tolist()

In [62]:
import re
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
        
tweets = normalize_texts(tweets)

In [65]:
from tensorflow.python.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [64]:
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(tweets)
tweets = tokenizer.texts_to_sequences(tweets)

In [66]:
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

tweets = pad_sequences(tweets, padding='post', maxlen=maxlen)

In [138]:
# predict the sentiments
preds = gru_model.predict(tweets)

In [139]:
sentiments = np.argmax(preds, axis=1)

In [140]:
unique, counts = np.unique(sentiments, return_counts=True)

In [141]:
df_senti = pd.DataFrame(index = unique, data=counts, columns=['Total'])
df_senti['Pourcentage %'] = round((df_senti.Total / len(sentiments)) * 100)
df_senti

Unnamed: 0,Total,Pourcentage %
0,11419,59.0
1,1219,6.0
2,6611,34.0


# Aggregation

In [142]:
df_agg = corpus_topic_df
df_agg['sentiment'] = sentiments

In [143]:
df_satisfaction = df_agg[df_agg['sentiment'] == 2]

In [144]:
df_satisfaction = df_satisfaction[['Topic', 'Tweet_id']].groupby('Topic').agg('count').reset_index()

In [147]:
df_satisfaction.columns = [['Topic','Client Satisfaction']]

In [148]:
df_satisfaction

Unnamed: 0,Topic,Client Satisfaction
0,1,1319
1,2,99
2,3,89
3,4,63
4,5,439
5,6,145
6,7,168
7,8,552
8,9,582
9,10,224


In [157]:
topics_dist['Client Satisfaction'] = df_satisfaction['Client Satisfaction']
topics_dist['Client Satisfaction'] = round(topics_dist['Client Satisfaction'] / topics_dist['Number_of_Documents'], 2) * 100

In [158]:
topics_dist

Unnamed: 0,Topic,Number_of_Documents,Topic_Contribution,Client Satisfaction
0,1,3535,18.36,37.0
1,2,363,1.89,27.0
2,3,261,1.36,34.0
3,4,257,1.34,25.0
4,5,872,4.53,50.0
5,6,463,2.41,31.0
6,7,553,2.87,30.0
7,8,1556,8.08,35.0
8,9,1370,7.12,42.0
9,10,647,3.36,35.0
