In [1]:
# Basic Python modules
import pickle
import time
import math
import os
import re
import string
import random
import threading
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import STOPWORDS

#Plotting Tools
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.patheffects as PathEffects
import seaborn as sns
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})
#NLTK
import nltk

#Scikit learn models 
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Import pandas and Numpy
import pandas as pd
import numpy as np

# Chemdataextractor
from chemdataextractor.doc import Document, Heading, Paragraph

In [2]:
# Load the abstracts table; the path is relative to the notebook's working directory.
MDF=pd.read_csv('Final_Abstracts_lda.csv')

In [3]:
# Keep an independent copy of only the PII, title and abstract columns so that
# later column additions do not touch MDF.
NDF=MDF[['PII','Title_x','Abstract_x']].copy()

In [4]:
def process_data(DF, column_name):
    """Build a gensim bag-of-words corpus and dictionary from one text column.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame holding the documents.
    column_name : str
        Name of the column whose values are tokenised (each value is passed
        through ``str`` by ``sent_to_words``).

    Returns
    -------
    corpus : list
        One ``doc2bow`` result per row of ``DF``, in row order.
    id2word : gensim.corpora.Dictionary
        Dictionary built from the stop-word-filtered, bigram-merged tokens.
    """
    # Read from the DF argument (the previous version read a notebook-global
    # variable, so the argument was silently ignored).
    data = DF[column_name].tolist()
    data_words = list(sent_to_words(data))

    # Higher threshold -> fewer phrases. Only the bigram model is used below,
    # so no trigram model is trained.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # data_words already holds lists of tokens, so stop words can be filtered
    # directly without stringifying and re-tokenising each document.
    data_words_nostops = [
        [word for word in doc if word not in STOPWORDS]
        for doc in data_words
    ]
    data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]

    id2word = corpora.Dictionary(data_words_bigrams)
    corpus = [id2word.doc2bow(text) for text in data_words_bigrams]

    return corpus, id2word

In [5]:
def sent_to_words(sentences):
    """Lazily yield one token list per input item.

    Each item is converted with ``str`` and tokenised by gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
        
def preprocess(text):
    """Tokenise ``text`` and keep tokens that are not stop words and are longer than 3 characters.

    Whitespace runs are collapsed to single spaces before tokenising with
    ``simple_preprocess``. Returns a list of tokens.
    """
    collapsed = " ".join(text.split())
    return [
        tok
        for tok in simple_preprocess(collapsed)
        if len(tok) > 3 and tok not in STOPWORDS
    ]

def sim_calculator(DF, column_name):
    """Compute a document-by-document TF-IDF similarity matrix for one column.

    Steps: tokenise every value of ``DF[column_name]`` with ``preprocess``,
    build a filtered dictionary, convert to bag-of-words, weight with TF-IDF
    and query a ``MatrixSimilarity`` index with the whole corpus.

    Returns the similarity matrix produced by the index. Progress is printed
    along the way.
    """
    documents = DF[column_name]
    print("Number of {}: {}".format(column_name, len(documents)))

    # Tokenise each document.
    print('\nCreating Dictionary...')
    token_lists = documents.map(preprocess)

    # Vocabulary: drop tokens seen in fewer than 100 documents or in more than
    # 90% of them, keeping at most 100000 tokens.
    vocab = corpora.Dictionary(token_lists)
    vocab.filter_extremes(no_below=100, no_above=0.9, keep_n=100000)
    print('Dictionary created')
    print("Size of vocabularly: ", len(vocab))

    # Bag-of-words representation of every document.
    bow_docs = [vocab.doc2bow(tokens) for tokens in token_lists]

    # TF-IDF weighting.
    print('\nRunning TFIDF vectorization...')
    tfidf_model = TfidfModel(bow_docs)
    weighted_docs = tfidf_model[bow_docs]
    print('TFIDF complete')

    # Pairwise similarities of all documents against the index.
    print('\nCalculating Similarity Matrix...')
    sim_index = similarities.MatrixSimilarity(weighted_docs)
    similarity_matrix = sim_index[weighted_docs]
    print("size of similarity matrix: ", similarity_matrix.shape)

    return similarity_matrix

def tsne(sims,title,DF,i):
    """Reduce ``sims`` with PCA then t-SNE and store the 2-D coordinates in ``DF``.

    Writes the columns ``'x-label-<i>'`` and ``'y-label-<i>'`` into ``DF`` in
    place. ``title`` is accepted but not used by this function.

    Returns the cumulative explained variance of the PCA step, as a percentage
    rounded to two decimals and converted to ``str``.
    """
    # PCA down to a fixed number of components before t-SNE.
    print("\nBeginning clustering...")
    component_count = 100
    reducer = PCA(n_components=component_count)
    reduced = reducer.fit_transform(sims)
    explained = np.sum(reducer.explained_variance_ratio_)
    print("\nPCA calculated")
    print('Cumulative explained variation for {} principal components: {}'.format(component_count, explained))

    # 2-D t-SNE embedding of the PCA output.
    print("Creating TSNE labels...")
    embedder = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    embedding = embedder.fit_transform(reduced)
    print("TSNE complete")

    suffix = str(i)
    DF['x-label-' + suffix] = embedding[:, 0]
    DF['y-label-' + suffix] = embedding[:, 1]

    return str(np.round(explained * 100, 2))

def dictionarizer(DF,topic_num):
    """Map each topic label to a font size proportional to its document count.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain the column ``'<topic_num>-topic'``.
    topic_num : int or str
        Number of topics, used only to build the column name.

    Returns
    -------
    dict
        ``{str(topic): size}`` ordered by ascending size, where
        ``size = round(count * 20 / largest_count + 7, 2)``. The most populated
        topic therefore gets 27. Missing (NaN) topic values are ignored. An
        empty column yields an empty dict.
    """
    topic_col = str(topic_num) + '-topic'
    counts = DF[topic_col].value_counts()
    if counts.empty:
        return {}

    # Take the normalising maximum ONCE, from the raw counts. Recomputing it
    # while the sizes are being overwritten mixes raw counts with already
    # rescaled values and makes the result depend on iteration order.
    largest = counts.max()
    size_dict = {
        str(tag): np.around(count * 20 / largest + 7, 2)
        for tag, count in counts.items()
    }

    # Smallest first, so the largest labels come last.
    return dict(sorted(size_dict.items(), key=lambda pair: pair[1]))

def scatter(DF,sorted_dict,topic_num,i,title,session_id,var='62.3'):
    """Draw and save the t-SNE scatter plot coloured by dominant topic.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain ``'x-label-<i>'``, ``'y-label-<i>'`` and
        ``'<topic_num>-topic'`` columns.
    sorted_dict : dict
        ``{topic label as str: font size}``; each key must be parseable by
        ``float`` and is drawn at the median position of its topic's points.
    topic_num : int or str
        Number of topics; used for the column name and the plot title.
    i : int
        Index of the coordinate columns to plot.
    title : str
        Base name of the output file.
    session_id : str
        Appended to ``title`` to form the file name ``<title>_<session_id>.png``.
    var : str
        Explained-variance text shown in the annotation.

    Returns
    -------
    None
        The figure is written to the current working directory and shown.
    """
    sample_size = len(DF)
    topic_num = str(topic_num)
    topic_col = topic_num + '-topic'
    x_col = 'x-label-' + str(i)
    y_col = 'y-label-' + str(i)
    print("\nCreating LDA plot...")

    # Smaller markers for very large samples.
    s = 10 if sample_size >= 50000 else 40

    # The builtin int replaces the np.int alias, which no longer exists in
    # current NumPy releases.
    topic_ids = DF[topic_col].astype(int).to_numpy()

    # Make sure the palette covers the highest topic index even when only a
    # few topics are actually populated (otherwise indexing would fail).
    highest_topic = int(topic_ids.max()) if len(topic_ids) else 0
    n_colours = max(len(sorted_dict) + 5, highest_topic + 1)
    palette = np.array(sns.color_palette("hls", n_colours))

    f = plt.figure(figsize=(12, 12))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(DF[x_col], DF[y_col], s, lw=0, c=palette[topic_ids])
    ax.axis('off')
    ax.axis('tight')

    txts = []
    for item in sorted_dict.keys():
        # Position of each label: median of the points assigned to the topic.
        members = DF[DF[topic_col] == float(item)][[x_col, y_col]]
        xtext, ytext = np.median(members, axis=0)
        # A topic without points yields NaN coordinates; skip its label.
        if not math.isnan(xtext) and not math.isnan(ytext):
            txt = ax.text(xtext, ytext, item, fontsize=sorted_dict[item])
            # White outline keeps the label readable on top of the markers.
            txt.set_path_effects([
                PathEffects.Stroke(linewidth=5, foreground="w"),
                PathEffects.Normal()])
            txts.append(txt)

    PCA_text = 'PCA: 100 \nTotal Variance: ' + var

    plt.title('Cluster plot of '+str(sample_size)+' Abstracts with '+topic_num+' topics')
    plt.annotate(PCA_text, xy=(0,0), xytext=(12, 80), va='top',xycoords='axes fraction', textcoords='offset points')

    # Report the file name that is actually written (it includes the session id).
    file_name = title + '_' + session_id + '.png'
    plt.savefig(file_name)
    print("\nLDA plot saved as {} at {}".format(file_name, os.getcwd()))
    plt.show()

    return None

In [6]:
class Document:
    """A set of documents on which LDA topic models can be fitted and plotted.

    NOTE: this class has the same name as the ``Document`` imported from
    ``chemdataextractor.doc`` at the top of the notebook and therefore shadows
    that import from this cell onwards.

    Attributes
    ----------
    session_id : random 8-character alphanumeric tag used in output file names.
    DF         : the pandas DataFrame holding the documents. A DataFrame passed
                 to the constructor is stored by reference, so ``run_LDA`` and
                 ``generate_LDA_plot`` add columns to the caller's frame.
    size       : number of rows in ``DF`` at construction time.
    lda_model  : list of ``(topic_num, model, session_id)`` tuples, one per
                 ``run_LDA`` call.
    """

    def __init__(self,address):
        """Create the document set.

        Parameters
        ----------
        address : str, os.PathLike or pandas.DataFrame
            Path of a CSV file to read, or an already loaded DataFrame.

        Raises
        ------
        TypeError
            If ``address`` is neither a path nor a DataFrame.
        """
        self.session_id=self.randomString()

        if isinstance(address, (str, os.PathLike)):
            self.DF=pd.read_csv(address)
        elif isinstance(address, pd.DataFrame):
            # Check the argument itself, not a notebook-global frame.
            self.DF=address
        else:
            raise TypeError(
                "address must be a CSV path or a pandas DataFrame, got {}".format(type(address).__name__))

        self.size=len(self.DF)

        self.lda_model=[]

    def randomString(self,stringLength=8):
        """Return a random string of ASCII letters and digits of length ``stringLength``."""
        lettersAndDigits = string.ascii_letters + string.digits
        return ''.join(random.choice(lettersAndDigits) for _ in range(stringLength))

    def run_LDA(self,column_name, topic_num, passes=10):
        """Fit an LDA model and store each row's dominant topic in ``self.DF``.

        Adds the columns ``'<topic_num>-topic'`` (dominant topic index) and
        ``'<topic_num>-topic-score'`` (its score) to ``self.DF`` and appends
        ``(topic_num, model, session_id)`` to ``self.lda_model``.

        Parameters
        ----------
        column_name : str
            Text column to model.
        topic_num : int
            Number of topics.
        passes : int
            Number of training passes handed to ``LdaMulticore``.
        """
        print("Preparing data to run LDA...")
        corpus, id2word=process_data(self.DF, column_name)

        print('Running LDA...')

        start=time.time()
        lda_model = gensim.models.LdaMulticore(corpus,num_topics=topic_num, id2word=id2word, passes=passes, workers=None,
                                               chunksize=1000)
        stop=time.time()

        print("LDA complete")
        print("Total time: {}".format(stop-start))

        print("\nStoring data in dataframe")

        # session_id is an instance attribute; there is no such local or global.
        self.lda_model.append((topic_num,lda_model, self.session_id))

        # Highest-scoring (topic, score) pair per document. A document for which
        # the model reports no topic gets NaN for both values.
        dominant = [
            max(lda_model[bow], key=lambda pair: pair[1], default=(np.nan, np.nan))
            for bow in corpus
        ]

        # corpus is in row order, so assign whole columns positionally instead
        # of writing one cell at a time. Both columns are stored as float, which
        # is what cell-by-cell assignment into a new column produced.
        self.DF[str(topic_num)+'-topic']=np.asarray([topic for topic, _ in dominant], dtype=float)
        self.DF[str(topic_num)+'-topic-score']=np.asarray([score for _, score in dominant], dtype=float)

        print("Done")

        return None

    def generate_LDA_plot(self,column_name,topic_num, repeats=1, pca=True):
        """Create and save ``repeats`` cluster plots for an already fitted topic count.

        Parameters
        ----------
        column_name : str
            Text column used to compute document similarities.
        topic_num : int
            Topic count of a model previously fitted with ``run_LDA``.
        repeats : int
            Number of independent similarity/t-SNE/plot rounds; round ``k``
            writes the columns ``'x-label-<k>'`` and ``'y-label-<k>'``.
        pca : bool
            Currently unused; kept so existing calls keep working.
        """
        if len(self.lda_model)>0:

            if str(topic_num)+'-topic' not in self.DF.columns:
                print("No topic assignments found for {} topics. Please run DF.run_LDA() with this topic number".format(topic_num))
                return None

            for i in range(repeats):
                # column_name comes from the caller and is no longer overwritten.
                title='LDA Plot for '+str(self.size)+' '+column_name+' for '+str(topic_num)+'topics-'+str(i)
                sims=sim_calculator(self.DF, column_name)
                var=tsne(sims,title,self.DF,i)
                sorted_dict=dictionarizer(self.DF,topic_num)
                # Plot this instance's frame, not a notebook-global one.
                scatter(self.DF,sorted_dict,topic_num,i,title,self.session_id,var)

        else:
            print("No lda models found. Please run DF.run_LDA()")

        return None

In [13]:
# Draw 10000 random rows as the working sample. No random_state is passed, so
# every execution of this cell yields a different sample.
# NOTE(review): the execution count of this cell is higher than that of the
# cells below it, i.e. it was re-run after them - confirm the notebook still
# works with Restart & Run All.
d=NDF.sample(10000)

In [8]:
d.head()

Unnamed: 0,PII,Title_x,Abstract_x
57770,S0925400511006022,Developing a new method of 4-(2-pyridylazo)-re...,An optical sensor responsive to gallium (III) ...
41030,S1566119911002886,Meta-linked CBP-derivatives as host materials ...,"We present four derivatives of 4,4â²-bis(9-ca..."
2867,S2214785320307732,Influence of defect related oxygen vacancies i...,"In this work, SnO2 based precursor sol is subj..."
93231,S1350448712000510,The role of opacifiers in the luminescence of ...,Thermoluminescence (TL) and radioluminescence ...
14304,S0167273815000843,Improving thermal stability and its effects on...,We investigated electrochemical substitution i...


In [9]:
# Wrap the sample in the notebook's own Document class (the one defined in this
# notebook, not the chemdataextractor class of the same name imported at the top).
D=Document(d)

In [10]:
D.session_id

'NGwXJ0S0'

In [11]:
D.size

10000

In [None]:
# Fit a 15-topic LDA model on the abstract text.
# NOTE(review): this cell has no execution count, so it appears not to have
# been run in the saved session.
D.run_LDA('Abstract_x',15)

In [12]:
# Plot the clusters for the 15-topic model. This only produces a plot after
# D.run_LDA has stored at least one model; otherwise a hint is printed.
D.generate_LDA_plot('Abstract_x',15)

No lda models found. Please run DF.run_LDA()
