In [1]:
from multiprocessing import cpu_count

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN, KMeans

# Imports
import os,sys
import re
import string
import math
import pickle

import re, numpy as np, pandas as pd
from pprint import pprint
from collections import OrderedDict


# Gensim
import gensim,  logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.summarization import summarize,keywords
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel,TfidfModel

# Plotting & graphs
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
from wordcloud import WordCloud, STOPWORDS

# Spacy 
import spacy

#import Utils
from utils import get_corpus_dataframe

# Basic configuration 
%matplotlib inline
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from gensim/pandas/sklearn;
# the narrower filter below is kept commented out as an alternative.
warnings.filterwarnings('ignore') 
#warnings.filterwarnings("ignore",category=DeprecationWarning)
# Emit only log records at ERROR level or above, formatted as "time : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
# Notebook-wide settings.

# Part-of-speech tags passed to get_corpus_dataframe when the corpus is loaded.
allowed_postags_filter = ['NOUN', 'ADJ', 'VERB', 'ADV']

# Plotting palette: the Tableau colour values in their declared order
# (mcolors.XKCD_COLORS offers a larger palette if more colours are needed).
word_plotting_colors = list(mcolors.TABLEAU_COLORS.values())

def my_tf_color_func(dictionary):
    """Build a WordCloud-style colour callback driven by per-word scores.

    ``dictionary`` maps each word to a number; the returned callback looks the
    word up, scales the score by 360 to obtain a hue, and returns an
    ``hsl(...)`` string with fixed 80% saturation and 50% lightness.
    All callback arguments other than ``word`` are accepted and ignored.
    """
    def _color_for_word(word, font_size, position, orientation, random_state=None, **kwargs):
        hue = 360 * dictionary[word]
        # %d truncates a fractional hue to an integer.
        return "hsl(%d, 80%%, 50%%)" % hue

    return _color_for_word

# Shared WordCloud renderer: white background, 2500x1800 canvas, at most 10
# words, horizontal text only.
# NOTE(review): the default color_func reads a global `i` that is not defined in
# this cell. The per-topic plotting cells reassign `cloud.color_func` (and
# `cloud.max_words`) inside their loops before generating, so this default looks
# like a placeholder; using `cloud` before such a loop has run would fail.
cloud = WordCloud(
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: word_plotting_colors[i],
                  prefer_horizontal=1.0)

In [3]:
# Load data
# Import the dataset into a pandas DataFrame via the project helper; its index
# is reused below to label the per-document result tables.
df = get_corpus_dataframe(eu_only=False,allowed_postags_filter=allowed_postags_filter)

# Load the pre-processed documents. Downstream cells feed `data_ready` to
# corpora.Dictionary / doc2bow, i.e. it is treated as a list of token lists.
# SECURITY: pickle.load can execute arbitrary code; only open files you created
# yourself or otherwise trust.
with open("data_ready.sav", "rb") as fp:   # Unpickling
    data_ready = pickle.load(fp)

In [4]:

# Flatten the tokenised documents into one whitespace-separated text.
# The previous version joined the *repr* of every token list
# ("['state', 'issue', ...]") with no separator, so brackets, quotes and commas
# were fed to the keyword extractor and documents ran into each other.
all_docs = ' '.join(' '.join(map(str, doc)) for doc in data_ready)

# Extract the top 30 keywords with their scores (when `words` is given it takes
# precedence over `ratio`).
keywords(all_docs, ratio=0.2, words=30, scores=True, lemmatize=True, deacc=False)

#summarize(' '.join(df['content']), ratio=0.2,word_count=200)
#df['content'].values



[('datum', 0.20033834497027106),
 ('developing', 0.18288463293870558),
 ('technologically', 0.17909528256991225),
 ('researchs', 0.16639039748638557),
 ('artificial_intelligence', 0.14295933300793906),
 ('governing', 0.1164498766750751),
 ('working', 0.10934899002159822),
 ('innovator', 0.10084061334103553),
 ('sectoral', 0.09785866444246584),
 ('provider', 0.09756421138688329),
 ('industrie', 0.09439412456336659),
 ('supportive', 0.09415559832754866),
 ('examples', 0.09412512086489833),
 ('publically', 0.09369604760953412),
 ('applicative', 0.0936724739210218),
 ('digitalize', 0.092249759567319),
 ('servicing', 0.09193316219793946),
 ('company', 0.08402169847491403),
 ('include', 0.08290865930878577),
 ('required', 0.0827378399843652),
 ('possibly', 0.08200488950294771),
 ('timing', 0.08092746266533535),
 ('busy', 0.0804488165759748),
 ('base', 0.08023060612164333),
 ('area', 0.07925845589093031),
 ('generously', 0.07839037425982268),
 ('initiate', 0.0782450175589548),
 ('create', 0.0

In [5]:
def remove_common_words(dictionary,corpus,low_value=0.01):
    """Drop low-information terms from every bag-of-words document in ``corpus``.

    A TF-IDF model is fitted on ``corpus``. A term is removed from a document
    when its TF-IDF weight in that document is below ``low_value``, or when the
    model returns no weight for it at all (terms scoring 0 are absent from the
    TF-IDF output).

    Parameters
    ----------
    dictionary : id-to-word mapping handed to ``TfidfModel`` as ``id2word``.
    corpus : list of documents, each a list of ``(token_id, count)`` pairs.
        The list is modified in place.
    low_value : float, TF-IDF threshold below which a term is discarded.

    Returns
    -------
    The same ``corpus`` list object, with every document filtered.
    """
    # Use the dictionary supplied by the caller. The earlier version ignored
    # this parameter and silently read a module-level `id2word` instead.
    tfidf = TfidfModel(corpus, id2word=dictionary)

    for i, bow in enumerate(corpus):
        # token_id -> TF-IDF weight for this document; a dict gives O(1) lookups
        # instead of the repeated list scans used previously.
        weights = dict(tfidf[bow])
        corpus[i] = [
            entry for entry in bow
            if entry[0] in weights and not weights[entry[0]] < low_value
        ]
    return corpus

In [None]:
# Number of LDA topics; also drives the number of columns/subplots in later cells.
N_TOPICS = 4 #or 7
# Upper document-frequency bound passed to filter_extremes as `no_above`.
# NOTE(review): gensim documents `no_above` as a fraction of the corpus, so a
# value of 1 would disable the upper cut entirely - confirm this is intended.
NO_ABOVE=1

# Create Dictionary
id2word = corpora.Dictionary(data_ready)


# Keep tokens that occur in at least 2 documents and in no more than the
# NO_ABOVE share of documents, then keep at most the 5000 most frequent ones.
id2word.filter_extremes(no_below=2, no_above=NO_ABOVE,keep_n=5000)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# Drop, per document, the terms whose TF-IDF weight is below 0.15
# (remove_common_words edits the corpus list in place and returns it).
corpus= remove_common_words(id2word,corpus,low_value = 0.15)
# Build LDA model
# random_state fixes the seed so repeated runs give the same topics.
# minimum_probability=0.15 means `lda_model[bow]` omits topics below 15%.
# per_word_topics=True makes `lda_model[bow]` return a tuple whose first
# element is the document-topic list (later cells index it with [0]).
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=N_TOPICS, 
                                           random_state=100,
                                           update_every=0,
                                           passes=20,
                                           alpha='auto',
                                           minimum_probability=0.15,
                                           per_word_topics=True)
                                           
print('Number of unique tokens: %d' % len(id2word))
print('Number of documents: %d' % len(corpus))

# Top 30 words of every topic, followed by the model's log perplexity on the
# training corpus.
pprint(lda_model.print_topics(num_words=30))
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score (c_v) against the original tokenised texts
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_ready, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

Number of unique tokens: 3659
Number of documents: 12
[(0,
  '0.032*"occupation" + 0.025*"public_authorities" + 0.025*"exception" + '
  '0.016*"competence" + 0.015*"ethical_principles" + 0.012*"recommend" + '
  '0.010*"business_community" + 0.008*"interview" + 0.008*"harness" + '
  '0.007*"freely_available" + 0.006*"automatable" + 0.006*"oxford_insights" + '
  '0.006*"safe_secure" + 0.004*"higher_education_institutions" + '
  '0.003*"county" + 0.002*"testbed" + 0.000*"adoption" + 0.000*"marketplace" + '
  '0.000*"annotation" + 0.000*"poor" + 0.000*"foster" + 0.000*"realize" + '
  '0.000*"maintenance" + 0.000*"utilize" + 0.000*"human_resources" + '
  '0.000*"industry_academia" + 0.000*"utilization" + 0.000*"personnel" + '
  '0.000*"attachment" + 0.000*"cluster"'),
 (1,
  '0.088*"intelligent" + 0.044*"theory" + 0.035*"utilisation" + '
  '0.025*"utilise" + 0.016*"construct" + 0.016*"vision" + '
  '0.015*"labour_market" + 0.014*"sensing" + 0.012*"accelerator" + '
  '0.012*"comprehensively"

## PRINT STATISTICS

### Print Top Words per Topic

In [38]:
from operator import itemgetter
from collections import OrderedDict
from collections import defaultdict

# Report, per document, the terms whose TF-IDF weight is below `low_value`,
# sorted by ascending weight.
# the higher the TF*IDF score (weight), the rarer the term and vice versa
tfidf = TfidfModel(corpus, id2word=id2word)
d = {}
low_value = 0.005
low_value_words = []
sorted_x = OrderedDict()  # defined up front so the final print works for an empty corpus
for bow in corpus:
    weights = tfidf[bow]  # computed once per document and reused below
    d = {id2word[key]: value for (key, value) in weights if value < low_value}
    sorted_x = OrderedDict(sorted(d.items(), key=lambda t: t[1]))
    print(sorted_x)
    # Collect the ids of the low-weight terms. The comparison used to be `>`,
    # which gathered the high-weight terms under this name instead.
    low_value_words += [token_id for token_id, value in weights if value < low_value]


# print low value words
# for x in [id2word[word_id ] for word_id  in low_value_words]:
#     print(x)

# Low-weight terms of the last document processed.
print(sorted_x)

OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()
OrderedDict()


In [None]:
def evaluate_graph(texts, limit_start,limit_end,up_limit,topn_words,low_values,processes=4):
    """Grid-search LDA settings and report the c_v coherence of every model.

    For every combination of ``up_limit`` x ``topn_words`` x ``low_values`` a
    dictionary and bag-of-words corpus are built from ``texts``, filtered, and
    one LDA model is trained for each topic count in
    ``range(limit_start, limit_end)``. One summary line is printed per model.

    Parameters
    ----------
    texts : list of tokenised documents (list of lists of tokens)
    limit_start : first number of topics to try (inclusive)
    limit_end : last number of topics to try (exclusive)
    up_limit : iterable of ``no_above`` values for ``Dictionary.filter_extremes``
    topn_words : iterable of ``keep_n`` values for ``Dictionary.filter_extremes``
    low_values : iterable of TF-IDF thresholds for ``remove_common_words``
    processes : number of worker processes handed to ``CoherenceModel``

    Returns
    -------
    lm_list : list of trained LDA models, in the order they were trained
    c_v : list of coherence values, aligned with ``lm_list``
    """
    c_v = []
    lm_list = []

    for above_limit in up_limit:
        for keepn in topn_words:
            for low_cut_limit in low_values:
                # Dictionary and corpus are rebuilt for every combination because
                # filter_extremes and remove_common_words both modify their
                # inputs in place.
                id2word = corpora.Dictionary(texts)
                id2word.filter_extremes(no_below=2, no_above=above_limit, keep_n=keepn)

                # Term-document frequencies, then drop low TF-IDF terms.
                corpus = [id2word.doc2bow(text) for text in texts]
                corpus = remove_common_words(id2word, corpus, low_value=low_cut_limit)

                for num_topics in range(limit_start, limit_end):
                    lm = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                         id2word=id2word,
                                                         num_topics=num_topics,
                                                         random_state=100,
                                                         update_every=1,
                                                         passes=20,
                                                         alpha='auto',
                                                         minimum_probability=0.05,
                                                         per_word_topics=True)
                    lm_list.append(lm)
                    cm = CoherenceModel(model=lm, texts=texts, dictionary=id2word,
                                        coherence='c_v', processes=processes)
                    coherence_value = cm.get_coherence()
                    s = f'num_topics={num_topics} , above_limit={above_limit} ,keep_n={keepn}, low_cut={low_cut_limit} , coherence = {coherence_value}'
                    c_v.append(coherence_value)
                    print(s)

    return lm_list, c_v

model_list, coherence_values = evaluate_graph(texts=data_ready, limit_start=3,limit_end=8,up_limit=[0.95,1],topn_words=[4600],low_values=[0.01,0.05,0.1,0.15])

# Show graph
# x = range(2, 10)
# plt.plot(x, coherence_values)
# plt.xlabel("num_topics")
# plt.ylabel("Coherence score")
# plt.legend(("c_v"), loc='best')
# plt.show()

## Dominant topic and its percentage contribution in each document

In [46]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data_ready):
    """Tabulate the dominant topic of every document.

    Parameters
    ----------
    ldamodel : trained gensim LDA model (required despite the ``None`` default)
    corpus : bag-of-words corpus; the default is the module-level ``corpus`` as
        it existed when this function was defined
    texts : tokenised documents, one per row of the module-level ``df``

    Returns
    -------
    DataFrame indexed like the module-level ``df`` with the columns
    ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4 decimals),
    ``Keywords`` (comma-separated top words of the dominant topic) and ``Text``.
    Documents for which the model reports no topic get NaN in the first three
    columns.

    Raises
    ------
    ValueError : if ``ldamodel`` is not supplied.
    """
    if ldamodel is None:
        raise ValueError("format_topics_sentences requires a trained `ldamodel`")

    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first element is
        # the document-topic list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            records.append((np.nan, np.nan, np.nan))
            continue

        # Highest-probability topic; on ties the first one listed wins, matching
        # a stable descending sort.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records,
                                  index=df.index[:len(records)],
                                  columns=['Dominant_Topic', 'Perc_Contribution', 'Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts, index=df.index, name='Text')
    return pd.concat([sent_topics_df, contents], axis=1)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

df_topic_sents_keywords

Unnamed: 0_level_0,Dominant_Topic,Perc_Contribution,Keywords,Text
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
china,2.0,0.9999,"platform, intelligent, model, core, smart, ado...","[state, issue, seminal, document, entitle, gen..."
denmark,0.0,0.9999,"digital, public_sector, example, citizen, heal...","[national, artificial_intelligence, finance, i..."
finland,3.0,0.9999,"centre, digital, programme, european, job, fun...","[ensure, frontrunner, country, apply, artifici..."
france,1.0,1.0,"citizen, european, stakeholder, context, certa...","[meaningful, artificial_intelligence, european..."
germany,3.0,1.0,"centre, digital, programme, european, job, fun...","[artificial_intelligence, status, ki_strategie..."
india,2.0,1.0,"platform, intelligent, model, core, smart, ado...","[discussion_paper, national, artificial_intell..."
italy,1.0,0.9999,"citizen, european, stakeholder, context, certa...","[artificial_intelligence, service, citizen, ar..."
japan,0.0,0.9998,"digital, public_sector, example, citizen, heal...","[artificial_intelligence, technology, technolo..."
luxembourg,1.0,0.9998,"citizen, european, stakeholder, context, certa...","[artificial_intelligence, vision, content, for..."
mexico,3.0,0.9999,"centre, digital, programme, european, job, fun...","[harness, revolution, commission, embassy, fun..."


## Documents - Topics Distribution

In [47]:
def color_positive_red(val):
    """Return the CSS ``color`` property used to style one table cell.

    Values above 0.5 are shown in red, other values above 0 in green, and
    everything else in black.
    """
    if val > 0.5:
        return 'color: red'
    if val > 0:
        return 'color: green'
    return 'color: black'

# Topic distribution of every document, requested with no probability cut-off.
get_document_topics = [
    lda_model.get_document_topics(item, minimum_probability=0.0)
    for item in corpus
]

# One column per topic: Topic0, Topic1, ...
df_dt_dist_columns = ["Topic{}".format(str(topic_id).zfill(1)) for topic_id in range(N_TOPICS)]

# Keep only the probability of each (topic_id, probability) pair.
df_dt_dist = pd.DataFrame(
    np.array(get_document_topics)[..., 1],
    index=df.index,
    columns=df_dt_dist_columns,
)

# Zero out negligible weights, round for display, then colour the cells.
df_dt_dist = (
    df_dt_dist
    .mask(df_dt_dist < 0.01, 0)
    .round(3)
    .style.applymap(color_positive_red)
)

df_dt_dist

Unnamed: 0_level_0,Topic0,Topic1,Topic2,Topic3
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
china,0,0,1,0
denmark,1,0,0,0
finland,0,0,0,1
france,0,1,0,0
germany,0,0,0,1
india,0,0,1,0
italy,0,1,0,0
japan,1,0,0,0
luxembourg,0,1,0,0
mexico,0,0,0,1


In [48]:


def highlight_max(data, color='yellow'):
    '''
    Styler callback: give a background colour to the largest entry that is
    strictly below 1. Returns a list of CSS strings for a Series and a
    DataFrame of CSS strings (same index/columns) otherwise.
    '''
    css = 'background-color: {}'.format(color)
    below_one = data[data < 1]  # entries >= 1 are excluded from the maximum

    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        peak = below_one.max()
        return [css if hit else '' for hit in data == peak]

    # DataFrame from .apply(axis=None): one maximum for the whole table
    peak = below_one.max().max()
    return pd.DataFrame(np.where(data == peak, css, ''),
                        index=data.index, columns=data.columns)

# Dense (n_documents, N_TOPICS) matrix of topic probabilities. Each weight is
# placed by its topic id, so a document for which the model omits a topic still
# yields a full row (the missing topic counts as 0) instead of a ragged array.
array = np.zeros((len(get_document_topics), N_TOPICS))
for doc_idx, doc_topics in enumerate(get_document_topics):
    for topic_id, weight in doc_topics:
        array[doc_idx, topic_id] = weight

# Pairwise cosine similarity between all documents in one vectorised call
# (replaces the row-by-row loop that filled a pre-allocated matrix).
sim = cosine_similarity(array)

df_doc_similarities = pd.DataFrame(sim, index=df.index, columns=df.index)
# Zero out negligible similarities and round for display.
df_doc_similarities = df_doc_similarities.mask(df_doc_similarities < 0.01, 0).round(3)
# Per row, highlight the best match below 1 (highlight_max excludes values >= 1).
df_doc_similarities_highlighted = df_doc_similarities.style.apply(highlight_max, color='darkorange', axis=1)

df_doc_similarities_highlighted


country,china,denmark,finland,france,germany,india,italy,japan,luxembourg,mexico,sweden,uk
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
china,1,0,0,0,0,1,0,0,0,0,0,0
denmark,0,1,0,0,0,0,0,1,0,0,1,0
finland,0,0,1,0,1,0,0,0,0,1,0,1
france,0,0,0,1,0,0,1,0,1,0,0,0
germany,0,0,1,0,1,0,0,0,0,1,0,1
india,1,0,0,0,0,1,0,0,0,0,0,0
italy,0,0,0,1,0,0,1,0,1,0,0,0
japan,0,1,0,0,0,0,0,1,0,0,1,0
luxembourg,0,0,0,1,0,0,1,0,1,0,0,0
mexico,0,0,1,0,1,0,0,0,0,1,0,1


In [72]:
# Raw similarity values as a NumPy array. DataFrame.as_matrix() was removed in
# pandas 1.0; `.values` works on old and new versions alike.
similarities_cluster_matrix = df_doc_similarities.values

# Cluster the documents on their topic-probability vectors.
dbscan = DBSCAN().fit_predict(array) # ,metric='cityblock'
kmeans2 = KMeans(n_clusters=2, random_state=100).fit(array)
kmeans3 = KMeans(n_clusters=3, random_state=100).fit(array)
kmeans4 = KMeans(n_clusters=4, random_state=100).fit(array)


# Cluster label of every document for k = 2, 3 and 4
print(kmeans2.labels_)
print (kmeans3.labels_)
print (kmeans4.labels_)


[0 0 0 1 1 0 1 0 1 0 0 0]
[2 1 1 0 0 0 0 2 0 1 1 1]
[2 1 1 0 0 3 0 2 0 1 1 1]


 ## The most representative sentence for each topic

## Wordcloud Per Topic

In [None]:
 # 1. Wordcloud of Top N words in each topic


# Ask for every topic explicitly: show_topics returns at most 10 by default.
topics = lda_model.show_topics(num_topics=lda_model.num_topics, formatted=False)

# squeeze=False always yields a 2-D array of axes, so the loop also works when
# N_TOPICS == 1 (plt.subplots would otherwise return a bare Axes).
fig, axes = plt.subplots(1, N_TOPICS, figsize=(18,20), sharex=True, sharey=True, squeeze=False)

cloud.max_words = 15
for i, ax in enumerate(axes.flatten()):
    topic_words = dict(topics[i][1])
    # Bind the colour now rather than reading the loop variable when the cloud
    # is drawn; wrap around if there are more topics than palette entries.
    topic_color = word_plotting_colors[i % len(word_plotting_colors)]
    cloud.color_func = lambda *args, _color=topic_color, **kwargs: _color
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
    ax.axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()



## Wordcloud (N-GRAM) Per Topic

In [1]:
topic_ngrams_filter = lambda dict_topic:  {k: v for k, v in dict_topic.items() if '_' in k}
topics = lda_model.show_topics(num_words=1000,formatted=False)


fig, axes = plt.subplots(1, N_TOPICS, figsize=(18,20), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = topic_ngrams_filter(dict(topics[i][1]))
    cloud.max_words=10
    cloud.color_func = lambda *args, **kwargs: word_plotting_colors[i]
    cloud.generate_from_frequencies(topic_words, max_font_size=200)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

print(topic_ngrams_filter(dict(topics[i][1])))

NameError: name 'lda_model' is not defined

In [None]:
# Get topic weights and dominant topics ------------
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

# Dense (n_documents, n_topics) matrix of topic weights. Each weight is stored
# under its topic id: the model omits topics below its minimum_probability, so
# collecting weights by position (as before) put them in the wrong columns and
# made argmax report the wrong dominant topic.
topic_weights = np.zeros((len(corpus), lda_model.num_topics))
for doc_idx, row_list in enumerate(lda_model[corpus]):
    doc_topics = row_list[0] if lda_model.per_word_topics else row_list
    for topic_id, weight in doc_topics:
        topic_weights[doc_idx, topic_id] = weight

# Array of topic weights
arr = topic_weights

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
# Perplexity must stay below the number of samples; the default of 30 is too
# large for a corpus of a dozen documents.
tsne_perplexity = min(30.0, max(1.0, len(arr) - 1.0))
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca',
                  perplexity=tsne_perplexity)
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh
output_notebook()
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(N_TOPICS), plot_width=900, plot_height=700)
# Wrap the colour index so more topics than palette entries cannot overflow.
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num % len(mycolors)])
show(plot)

In [None]:
# [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]
from collections import Counter

# Top 5 words per topic together with their raw frequency in the whole corpus.
topics = lda_model.show_topics(formatted=False,num_words=5)
counter = Counter(w for w_list in data_ready for w in w_list)

out = [[word, topic_id, weight, counter[word]]
       for topic_id, topic in topics
       for word, weight in topic]

# Stored under its own name: this table used to be assigned to `df`, which
# replaced the corpus DataFrame and broke re-running the earlier cells that
# rely on `df.index`.
df_topic_keywords = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords
# squeeze=False keeps `axes` an array even when N_TOPICS == 1.
fig, axes = plt.subplots(1, N_TOPICS, figsize=(20,14), sharey=True, dpi=160, squeeze=False)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    topic_rows = df_topic_keywords.loc[df_topic_keywords.topic_id==i, :]
    topic_color = cols[i % len(cols)]  # wrap around if topics outnumber colours
    # Wide, faint bars: corpus word count (left axis)
    ax.bar(x='word', height="word_count", data=topic_rows, color=topic_color, width=0.5, alpha=0.3, label='Word Count')
    # Narrow, solid bars: weight of the word within the topic (right axis)
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=topic_color, width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=topic_color)
    ax_twin.set_ylim(0, 0.030); ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=topic_color, fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)    
plt.show()

## pyLDAvis

In [None]:
# Interactive topic browser rendered inline in the notebook.
# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
# sort_topics=False is passed so the topics are not re-ordered by pyLDAvis.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word,sort_topics=False)
vis