In [93]:
import numpy as np
import pandas as pd
import pickle
import os
import time
import re
import random
import string

In [94]:
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [95]:
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA

In [96]:
import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as pyplot
import seaborn as sns
# Global seaborn look for every figure drawn later in the notebook:
# dark-grid background, the "muted" colour palette, and the "notebook"
# context with enlarged fonts and thicker lines.
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})
# Silence NumPy floating-point warnings for divide-by-zero and invalid
# operations. np.seterr returns the settings that were active *before* the
# call, and because it is the last expression of the cell that dict is what
# the notebook echoes as the cell output.
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [97]:
def preprocess(text):
    """Turn a raw string into a list of filtered tokens.

    Whitespace runs in ``text`` are collapsed to single spaces, the result is
    tokenised with gensim's ``simple_preprocess``, and only tokens that are
    longer than three characters and not in gensim's ``STOPWORDS`` are kept.

    Parameters
    ----------
    text : str
        The document to tokenise.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    normalised = " ".join(text.split())
    return [
        word
        for word in simple_preprocess(normalised)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [98]:
class Documents:
    """A tokenised text corpus plus the models fitted on it.

    On construction the series is tokenised, a gensim ``Dictionary`` is built
    and filtered, and a one-line summary is printed. The vectoriser methods
    (`vectorize_by_tfidf`, `vectorize_by_doc2vec`, `run_word2vec`) are run on
    demand and store their fitted models on the instance.

    Parameters
    ----------
    PDseries : pandas.Series
        One raw text document per row.
    save : bool, default False
        If true, `pickle_files` is called at the end of construction.
    """

    def __init__(self, PDseries, save=False):
        self.session_id = self.randomString()
        self.PDseries = PDseries
        self.size = PDseries.shape[0]
        self.preprocessed_text = PDseries.map(self.preprocess)
        self.vocabulary_size = 0
        self.dictionary = None
        self.create_dictionary()
        self.flag_creation_signal()
        self.word2vec = None
        self.word2vec_vector_size = None
        self.tfidf = None
        self.tfidf_vector_matrix = None
        self.doc2vec = None
        self.doc2vec_vector_matrix = None
        if save:
            self.pickle_files()

    def randomString(self, stringLength=8):
        """Return a random string of ASCII letters and digits.

        Used as the session id that prefixes every file this class writes.
        """
        alphabet = string.ascii_letters + string.digits
        return ''.join(random.choice(alphabet) for _ in range(stringLength))

    @staticmethod
    def preprocess(text):
        """Tokenise ``text`` and drop stopwords and tokens of length <= 3.

        Whitespace is collapsed first, then gensim's ``simple_preprocess``
        produces the tokens. Returns the kept tokens as a list, in order.
        """
        collapsed = " ".join(text.split())
        return [
            token
            for token in simple_preprocess(collapsed)
            if token not in STOPWORDS and len(token) > 3
        ]

    def create_dictionary(self, no_below=100, no_above=0.9, keep_n=100000):
        """Build the gensim dictionary from the tokenised corpus and prune it.

        Parameters
        ----------
        no_below, no_above, keep_n
            Passed straight to ``Dictionary.filter_extremes``. The defaults
            are the values this notebook has always used; call again with a
            smaller ``no_below`` for small corpora, where the default would
            leave an empty vocabulary.

        Updates ``self.dictionary`` and ``self.vocabulary_size``.
        """
        self.dictionary = corpora.Dictionary(self.preprocessed_text)
        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
        self.vocabulary_size = len(self.dictionary)

    def vectorize_by_tfidf(self):
        """Fit TF-IDF on the bag-of-words corpus and return document similarities.

        Stores the fitted model in ``self.tfidf`` and the transformed corpus
        in ``self.tfidf_vector_matrix``, then queries a ``MatrixSimilarity``
        index with that same corpus and returns the result.
        """
        bow_corpus = [self.dictionary.doc2bow(doc) for doc in self.preprocessed_text]
        self.tfidf = TfidfModel(bow_corpus)
        self.tfidf_vector_matrix = self.tfidf[bow_corpus]

        print('Tfidf complete')
        index = similarities.MatrixSimilarity(self.tfidf_vector_matrix)
        similarity_matrix = index[self.tfidf_vector_matrix]
        print('Similarity matrix computed')

        return similarity_matrix

    def vectorize_by_doc2vec(self, vector_size=10, window=2):
        """Fit Doc2Vec on the corpus and return a pairwise cosine matrix.

        Stores the fitted model in ``self.doc2vec`` and the document vectors
        in ``self.doc2vec_vector_matrix``.

        NOTE(review): the returned matrix comes from
        ``pairwise_distances(metric='cosine')``, i.e. it holds cosine
        *distances*, whereas `vectorize_by_tfidf` returns similarities.
        The value is left unchanged here because callers may rely on it;
        confirm which convention is intended.
        """
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(self.preprocessed_text.values)]
        self.doc2vec = Doc2Vec(documents, vector_size=vector_size, window=window, min_count=1, workers=4)
        # NOTE(review): `docvecs.vectors_docs` looks like the gensim 3.x
        # attribute path (4.x exposes `dv.vectors`) -- confirm the pinned version.
        self.doc2vec_vector_matrix = self.doc2vec.docvecs.vectors_docs

        print('doc2vec complete')
        similarity_matrix = pairwise_distances(self.doc2vec_vector_matrix, self.doc2vec_vector_matrix, metric='cosine', n_jobs=-1)
        print('Similarity matrix computed')

        return similarity_matrix

    def flag_creation_signal(self):
        """Print the session id, document count and vocabulary size."""
        print("Session {} created for document size {} with processed vocabulary of {}".format(self.session_id, self.size, self.vocabulary_size))

    def run_word2vec(self, vector_size=10, window=5, save=True):
        """Train a Word2Vec model on the tokenised corpus.

        Stores the model in ``self.word2vec`` and the requested dimension in
        ``self.word2vec_vector_size``; calls `pickle_files` when ``save`` is
        true.
        """
        self.word2vec_vector_size = vector_size
        # NOTE(review): `size=` is the gensim 3.x keyword (4.x renamed it to
        # `vector_size`) -- confirm the pinned version before upgrading.
        self.word2vec = gensim.models.Word2Vec(self.preprocessed_text.values, size=vector_size, window=window, min_count=2, workers=10)
        # NOTE(review): the constructor is documented to train already when it
        # is given sentences, so this adds further epochs on top -- confirm
        # the extra training is intended.
        self.word2vec.train(self.preprocessed_text.values, total_examples=self.size, epochs=10)
        print('word2vec generated')
        if save:
            self.pickle_files()

    def pickle_files(self):
        """Pickle the dictionary and the word2vec model into the working directory.

        Writes ``<session_id>_dictionary.sav`` and
        ``<session_id>_<size>_word2vec.sav``. ``self.word2vec`` is pickled
        as-is, so it is ``None`` if `run_word2vec` has not been called yet.
        Files are opened with ``with`` so the handles are closed even if
        pickling raises.
        """
        with open(self.session_id + '_dictionary.sav', 'wb') as handle:
            pickle.dump(self.dictionary, handle)
        print("Dictionary {} saved".format(self.session_id))

        with open(self.session_id + '_' + str(self.size) + '_word2vec.sav', 'wb') as handle:
            pickle.dump(self.word2vec, handle)
        print("word2vec {} saved".format(self.session_id))
    

In [99]:
class PCA_TSNE():
    """Reduce a document matrix to 2-D with optional PCA followed by t-SNE.

    Both steps run during construction; the resulting coordinates are left
    in ``x_axis`` and ``y_axis``.

    Parameters
    ----------
    similarity_matrix : array-like with a ``shape`` attribute
        One row per document.
    session_id : str
        Identifier carried along for later file naming.
    if_pca : bool, default True
        When false the PCA step is skipped: t-SNE runs directly on
        ``similarity_matrix`` and ``cum_pca_variance`` stays ``None``.
    pca_dimensions : int, default 20
        Number of principal components kept when PCA runs.
    random_state : int or None, default None
        Seed forwarded to both PCA and t-SNE. ``None`` keeps the
        library defaults (non-deterministic t-SNE).
    """

    def __init__(self, similarity_matrix, session_id, if_pca=True, pca_dimensions=20, random_state=None):
        self.session_id = session_id
        self.if_pca = if_pca
        self.pca_dimensions = pca_dimensions
        self.random_state = random_state
        self.similarity_matrix = similarity_matrix
        self.size = similarity_matrix.shape[0]
        self.pca_result = None
        self.cum_pca_variance = None
        self.x_axis = None
        self.y_axis = None
        # `var` duplicates `cum_pca_variance`; kept for existing callers.
        self.pca_result, self.var = self.run_pca()
        self.x_axis, self.y_axis = self.run_tsne()

    def run_pca(self):
        """Project the matrix onto ``pca_dimensions`` principal components.

        Returns
        -------
        (result, variance)
            The projected data and the summed explained-variance ratio.
            When ``if_pca`` is false the input matrix is returned untouched
            together with ``None``.
        """
        if not self.if_pca:
            self.cum_pca_variance = None
            return self.similarity_matrix, None

        pca = PCA(n_components=self.pca_dimensions, random_state=self.random_state)
        pca_result = pca.fit_transform(self.similarity_matrix)
        self.cum_pca_variance = np.sum(pca.explained_variance_ratio_)
        print('Cumulative explained variation for {} principal components: {}'.format(self.pca_dimensions, self.cum_pca_variance))

        return pca_result, self.cum_pca_variance

    def run_tsne(self):
        """Embed ``self.pca_result`` in two dimensions with t-SNE.

        Returns the two embedding columns as ``(x, y)``.
        """
        tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=self.random_state)
        tsne_results = tsne.fit_transform(self.pca_result)

        return tsne_results[:, 0], tsne_results[:, 1]


In [100]:
class Plotter():
    """Scatter-plot the 2-D embedding held by a ``PCA_TSNE`` instance.

    Parameters
    ----------
    PCA_TSNE_instance : object
        Source of ``size``, ``x_axis``, ``y_axis``, ``cum_pca_variance`` and
        ``pca_dimensions``; the values are copied at construction time.
    session_id : str
        Prefix for the saved image file name.
    s : int, default 20
        Marker size passed to ``scatter``.
    """

    def __init__(self, PCA_TSNE_instance, session_id, s=20):
        self.session_id = session_id
        self.size = PCA_TSNE_instance.size
        self.x_axis = PCA_TSNE_instance.x_axis
        self.y_axis = PCA_TSNE_instance.y_axis
        self.pixel_size = s
        self.cum_pca_variance = PCA_TSNE_instance.cum_pca_variance
        self.pca_dimensions = PCA_TSNE_instance.pca_dimensions

    def _plot_filename(self, title):
        """Build the PNG file name; the title segment is omitted when ``title`` is None."""
        parts = [self.session_id]
        if title is not None:
            parts.append(title)
        parts.extend([str(self.size), 'plot.png'])
        return '-'.join(parts)

    def plotter(self, title=None):
        """Draw the scatter plot, save it as a PNG, show it and close the figure.

        Parameters
        ----------
        title : str or None
            Used in the figure title and in the file name. With ``None`` the
            figure has no title and the file is named
            ``<session_id>-<size>-plot.png`` (previously this case raised a
            ``TypeError`` while building the file name).
        """
        fig = pyplot.figure(figsize=(12, 12))
        ax = fig.add_subplot(1, 1, 1, aspect='equal')

        ax.scatter(self.x_axis, self.y_axis, lw=0, s=self.pixel_size)
        ax.axis('off')
        ax.axis('tight')

        # Only annotate when PCA actually ran and reported a variance.
        if self.cum_pca_variance is not None:
            PCA_text = 'PCA:{} \nTotal Variance: {} %'.format(self.pca_dimensions, np.round(self.cum_pca_variance*100, 2))
            ax.annotate(PCA_text, xy=(0, 0), xytext=(12, 80), va='top', xycoords='axes fraction', textcoords='offset points')

        if title is not None:
            ax.set_title(title + " of " + str(self.size) + " Articles")

        fig.savefig(self._plot_filename(title))

        pyplot.show()
        pyplot.close(fig)
        