In [1]:
from multiprocessing import cpu_count

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Imports
import pandas as pd
import numpy as np
import spacy
import glob
import os
import re
import string
import math
import pickle
import collections
from pathlib import Path

In [2]:
nlp = spacy.load("en", disable=['tagger', 'ner'])

In [3]:
def load_corpus(eu_only=False):
    eu_list = ['denmark','france','germany','italy','sweden','uk']
    file_list = glob.glob(os.path.join(os.getcwd(), ".\\txts\\clean\\"+ "*.txt"))
            
    corpus = {}

    for file_path in file_list:
        filename = Path(Path(file_path).stem).stem
        if eu_only and filename in eu_list:
            with open(file_path,encoding='utf-8',newline=None) as f_input:
                text = f_input.read() #.replace("\n", "")
                clean_text = "".join([c for c in text if c.isprintable()])
                corpus[filename]=clean_text

    return corpus  

strategies = load_corpus(eu_only=True)
#strategies.keys()

In [4]:
##HELPERS ##

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

remove_special_characters("Well this was fun! What do you \nthink? 123#@!", 
                          remove_digits=True)

def contains_digit_or_num(i): return any(
    j.isdigit() or j in string.punctuation for j in i)

def prevent_sentence_boundaries(doc):
    for token in doc:
        if not can_be_sentence_start(token):
            token.is_sent_start = False
    return doc


def can_be_sentence_start(token):
    if token.i == 0:
        return True
    # We're not checking for is_title here to ignore arbitrary titlecased
    # tokens within sentences
    # elif token.is_title:
    #    return True
    elif token.nbor(-1).is_punct:
        return True
    elif token.nbor(-1).is_space:
        return True
    else:
        return False

In [5]:
lem = nlp("run runs running runner")
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

run run
runs run
running run
runner runner


In [6]:
doc = nlp(strategies['uk'])
sentences = [sent.text.strip() for sent in doc.sents]
print("Before:", sentences)

nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
nlp.add_pipe(prevent_sentence_boundaries, before="parser")
doc = nlp(strategies['uk'])
sentences = [sent.text.strip() for sent in doc.sents]
print("After:", sentences)

Before: ['GOV.UK 1.', 'Home (https://www.gov.uk/)2.', 'Artificial Intelligence Sector Deal (https://www.gov.uk/government/publications/artificial-intelligence-sector-deal)1.', 'Department forBusiness, Energy& Industrial Strategy (https://www.gov.uk/government/organisations/department-for-business-energy-and-industrial-strategy)2.', 'Department forDigital, Culture,Media & Sport (', 'https://www.gov.uk/government/organisations/department-for-digital-culture-media-sport)Policy paperAI', 'Sector DealUpdated 21 May 2019ContentsForewordIndustrial Strategy at a glanceExecutive summaryGrand', 'ChallengeKey commitmentsIdeasPeopleInfrastructureBusiness environmentPlacesFurther information', '© Crown copyright', '2019This publication is licensed under the terms of the Open Government Licence v3.0 except whereotherwise stated.', 'To view this licence, visit nationalarchives.gov.uk/doc/open-government-licence/version/3 (https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3) or w




After: ['GOV.UK 1.', 'Home (https://www.gov.uk/)2.', 'Artificial Intelligence Sector Deal (https://www.gov.uk/government/publications/artificial-intelligence-sector-deal)1.', 'Department forBusiness, Energy& Industrial Strategy (https://www.gov.uk/government/organisations/department-for-business-energy-and-industrial-strategy)2.', 'Department forDigital, Culture,Media & Sport (https://www.gov.uk/government/organisations/department-for-digital-culture-media-sport)Policy paperAI Sector DealUpdated 21 May 2019ContentsForewordIndustrial Strategy at a glanceExecutive summaryGrand ChallengeKey commitmentsIdeasPeopleInfrastructureBusiness environmentPlacesFurther information© Crown copyright 2019This publication is licensed under the terms of the Open Government Licence v3.0 except whereotherwise stated.', 'To view this licence, visit nationalarchives.gov.uk/doc/open-government-licence/version/3 (https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3) or write to theInforma

In [13]:
def build_data_samples(min_size=3):
    data_samples = []
    indices = {}
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']

    i = 0
    for country,txt in strategies.items():
        tmp = nlp(txt)
        corpus = []
        for j, word in enumerate(tmp):
            if contains_digit_or_num(word.text) or len(word.text) <= min_size:
                continue
            try:
                corpus.append(word.lemma_)
            except BaseException:
                corpus.append(str(word))
        corpus = ' '.join(corpus)

        data_samples.append(corpus)
        indices[i] = country
        i += 1

    return data_samples, indices

def process_topics(
        H,
        W,
        feature_names,
        data_samples,
        no_top_words,
        no_top_data_samples,
        indices):
    graph = {}
    topics = {}

    for topic_idx, topic in enumerate(H):

        print("Topic %d:" % (topic_idx))
        topics[topic_idx] = [feature_names[i]
                             for i in topic.argsort()[:-no_top_words - 1:-1]]

        print(" ".join(topics[topic_idx]))
        top_doc_indices = np.argsort(W[:, topic_idx])[
            ::-1][0:no_top_data_samples]

        similar = []
        for doc_index in top_doc_indices:
            print(indices[doc_index])
            similar.append(indices[doc_index])
            graph[doc_index] = list(
                filter(
                    lambda x: x != doc_index,
                    top_doc_indices))

        s = {
            '_id': topic_idx,
            'keywords': topics[topic_idx],
            'statutes': similar

        }

        
    print(graph)
    print(topics)
    return graph, topics, top_doc_indices

def build_gg_stoplist(data_samples, stopwords, gg_most_common=500):
    words = []
    for x in data_samples:
        words.extend(x.split(' '))
    print('Counting words')

    try:
        counter = pickle.load(open('gg_stoplist.pickle', 'rb'))
    except BaseException:
        counter = collections.Counter(words)
        pickle.dump(counter, open('gg_stoplist.pickle', 'wb'))
    finally:
        for w in counter.most_common(gg_most_common):
            stopwords.append(w[0])
    print('Done Counting')
    #exclude country names 
    country_names_words = ['european','europe','french', 'france','china', 'italy' ,'italian', 'india', 'mexico', 'japan' ,'denmark', 'uk', 'germany','german', 'sweden']
    special_excluded_words = ['strategy','strategic','federal','council','ministry']
    stopwords = stopwords + country_names_words + special_excluded_words
    return stopwords, words

def build_topics():
    spacy_stop_words = list(spacy.lang.en.stop_words.STOP_WORDS)
    #print('Number of stop words: %d' % len(stop_words))

    data_samples, indices = build_data_samples()
    stopwords, words = build_gg_stoplist(data_samples, spacy_stop_words)

    # Initial Parameters
    no_features = 1000  # Number of features
    n_samples = len(data_samples)  # Len of data samples
    no_top_words = 25  # Number of top words in each topic
    n_components = 2  # Number of topics
    # How many correlations under each topic
    no_top_data_samples = math.ceil(n_samples / n_components)

    # LDA can only use raw term counts for LDA because it is a probabilistic
    # graphical model
    tf_vectorizer = CountVectorizer(
        max_df=0.95,
        min_df=2,
        max_features=no_features,
        stop_words=stopwords)
    tf = tf_vectorizer.fit_transform(data_samples)
    tf_feature_names = tf_vectorizer.get_feature_names()

    lda_model = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10,
                                          learning_method='online',
                                          learning_offset=50.,
                                          verbose=1,
                                          n_jobs=cpu_count() - 1,
                                          random_state=0)
    lda_model.fit(tf)

    print("Best Perplexity Score: ", lda_model.perplexity(tf))

    lda_W = lda_model.transform(tf)
    lda_H = lda_model.components_

    graph_lda, topics, top_doc_indices = process_topics(
        lda_H,
        lda_W,
        tf_feature_names,
        data_samples,
        no_top_words,
        no_top_data_samples,
        indices)

    pickle.dump(lda_model, open('lda_model.pickle', 'wb'))
    pickle.dump(tf, open('tf.pickle', 'wb'))

In [15]:
build_topics()

Counting words
Done Counting
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Best Perplexity Score:  1007.2084943865559
Topic 0:
eu commission center today ecological regulation space gender lack factor discrimination meet agreement actor diversity stage guarantee operate database importance worker text adapt skilled excellence
france
germany
italy
Topic 1:
commission center eu text factor today gender worker collaboration organization page day culture assessment balance intelligent eur chain adapt class stage fully face equality awareness
sweden
uk
denmark
{1: [2, 3], 2: [1, 3], 3: [1, 2], 4: [5, 0], 5: [4, 0], 0: [4, 5]}
{0: ['eu', 'commission', 'center', 'today', 'ecological', 'regulation', 'space', 'gender', 'lack', 'factor', 'discrimination',

In [14]:
spacy_stop_words = list(spacy.lang.en.stop_words.STOP_WORDS)
#print('Number of stop words: %d' % len(stop_words))

data_samples, indices = build_data_samples()
stopwords, words = build_gg_stoplist(data_samples, spacy_stop_words)

# Initial Parameters
no_features = 1000  # Number of features
n_samples = len(data_samples)  # Len of data samples
no_top_words = 25  # Number of top words in each topic
n_components = 2  # Number of topics
# How many correlations under each topic
no_top_data_samples = math.ceil(n_samples / n_components)

Counting words
Done Counting


In [15]:
#no_top_data_samples
data_samples[0]

'MARCH Strategy Artificial IntelligenceMinistry Finance Ministry Industry Business Financial AffairsMARCH Strategy Artificial IntelligenceMinistry Finance Ministry Industry Business Financial Vision goal Good start point challenge good Focus area responsible foundation artificial intelligence More well datum Strong competence knowledge Increased investment artificial intelligence Priority area Vision goal Denmark front runner responsible development artificial intelligence Over year technology many advance globally technological development forge significantly well live condition many people Denmark be quick grasp opportunity bring technology generate economic growth welfare everyone There have be rapid development within artificial intelligence recent year major breakthrough with artificial intelligence that enable computer learn without human intervention Artificial intelligence use datum algorithm mathematical formula mimic human learn make decision solve problem technology already 

In [16]:
# LDA can only use raw term counts for LDA because it is a probabilistic
# graphical model
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    lowercase=True,
    stop_words=stopwords)
tf = tf_vectorizer.fit_transform(data_samples)
tf_feature_names = tf_vectorizer.get_feature_names()

lda_model = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10,
                                      learning_method='online',
                                      learning_offset=50.,
                                      verbose=1,
                                      n_jobs=cpu_count() - 1,
                                      random_state=0)
lda_model.fit(tf)

print("Best Perplexity Score: ", lda_model.perplexity(tf))

lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

graph_lda, topics, top_doc_indices = process_topics(
    lda_H,
    lda_W,
    tf_feature_names,
    data_samples,
    no_top_words,
    no_top_data_samples,
    indices)


iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Best Perplexity Score:  975.363088703203
Topic 0:
commission paper expand white labour privacy cluster guarantee excellence space venture skilled analyse office capability relationship consultation transparency shape importance smes connect organisation talent assessment
germany
uk
italy
Topic 1:
ecological center today gender actor page agreement regulation competence vital collective organization consumption factor association lack collaboration chain career member stage nature database assistance states
france
denmark
sweden
{2: [5, 3], 5: [2, 3], 3: [2, 5], 1: [0, 4], 0: [1, 4], 4: [1, 0]}
{0: ['commission', 'paper', 'expand', 'white', 'labour', 'privacy', 'cluster', 'guarantee', 'excellence', 's

In [17]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, tf, tf_vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
