In [76]:
import os
import pandas as pd

import transformers
import torch
import numpy as np
from simpletransformers.language_representation import RepresentationModel
from tqdm import tqdm

import umap.umap_ as umap
import matplotlib.pyplot as plt
import sklearn.cluster as cluster
import json

In [77]:
# Move the working directory one level above the directory the notebook was
# started in. The start directory is recorded only once, so re-running this
# cell does not climb another level each time (a bare os.chdir('..') would).
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [85]:
# Sanity check: show what lives inside the `lea` directory under the current
# working directory.
os.listdir(os.path.join(os.curdir, 'lea'))

['.DS_Store',
 'LICENSE',
 'config',
 'deploy',
 'learningmachines',
 'node_modules',
 'webpack.config.js',
 'buildspec.yaml',
 'README.md',
 '.gitignore',
 'package-lock.json',
 'package.json',
 'README.npm.text',
 'nginx',
 'docker-compose.yml',
 '.git',
 'assets.js']

In [86]:
from searcher.es_search import SearchResults_ES

In [69]:
#https://simpletransformers.ai/docs/lm-model/

#https://www.sbert.net/examples/applications/computing-embeddings/README.html

#https://umap-learn.readthedocs.io/en/latest/clustering.html

In [131]:
class EmbGenerator:
    """Embed, project, cluster and export the documents returned by a search.

    Typical usage is ``generate_embs()`` -> ``reduce_dims()`` ->
    ``cluster_embs()`` -> ``dump_embs()``. All results accumulate in
    ``self.doc_embs``, a dict of parallel lists (one entry per document):

    * ``emb``      - document embedding vectors
    * ``title``    - document titles with double quotes and newlines removed
    * ``id``       - document ids
    * ``dates``    - document dates
    * ``x``, ``y`` - 2-D projection coordinates (filled by ``reduce_dims``)
    * ``clusters`` - cluster label per document (filled by ``cluster_embs``)
    """

    def __init__(self, model_name, qry_obj, max_length=512, proj_method='umap', cluster_method='kmeans'):
        """Load the representation model and open the search result iterator.

        Args:
            model_name: Name/path of the BERT checkpoint handed to
                ``RepresentationModel``.
            qry_obj: Query dict; must contain a ``'database'`` key, which is
                used both for the search and for the output file name.
            max_length: Maximum number of whitespace-separated words per text
                chunk (see ``split_sentences``).
                NOTE(review): this counts words, not model tokens - confirm
                how the encoder treats chunks that tokenize to more tokens
                than it accepts.
            proj_method: Projection method; only ``'umap'`` is implemented.
            cluster_method: ``'kmeans'`` to run KMeans, or a list of
                precomputed cluster labels (one per document) to use as-is.
        """
        self.model = RepresentationModel(model_type="bert", model_name=model_name, use_cuda=False)
        self.max_length = max_length
        self.qry_obj = qry_obj
        self.es = SearchResults_ES(qry_obj['database'], qry_obj, cleaned=False, rand=False)
        self.proj_method = proj_method
        self.cluster_method = cluster_method
        self.doc_embs = {'x': [],
                         'y': [],
                         'emb': [],
                         'title': [],
                         'dates': [],
                         'id': [],
                         'clusters': []}

    def create_doc_obj(self, emb, doc_info):
        """Append one document's embedding and metadata to ``self.doc_embs``.

        Args:
            emb: Embedding vector for the document.
            doc_info: Object exposing ``article_title``, ``doc_id`` and
                ``date`` attributes.
        """
        # A missing title is stored as an empty string instead of crashing.
        title = doc_info.article_title or ""
        self.doc_embs['emb'].append(emb)
        self.doc_embs['title'].append(title.replace('"', "").replace("\n", ""))
        self.doc_embs['id'].append(doc_info.doc_id)
        self.doc_embs['dates'].append(doc_info.date)

    def split_sentences(self, doc):
        """Split ``doc`` into chunks of at most ``self.max_length`` words.

        Words are whitespace-delimited; each chunk is re-joined with single
        spaces. Empty or ``None`` input yields an empty list.

        Returns:
            list[str]: The chunks, in document order.
        """
        if not doc:
            return []
        words = doc.split()
        chunk_size = self.max_length
        return [" ".join(words[start:start + chunk_size])
                for start in range(0, len(words), chunk_size)]

    def generate_embs(self):
        """Embed every document from the search iterator.

        Each document is split into chunks, every chunk is encoded with the
        ``"mean"`` combine strategy, and the chunk vectors are averaged into
        a single document vector. Documents without any text are skipped,
        because there is nothing to encode or average for them.
        """
        for document in tqdm(self.es):
            chunks = self.split_sentences(document.text)
            if not chunks:
                continue
            chunk_vectors = self.model.encode_sentences(chunks, combine_strategy="mean")
            doc_vector = np.mean(chunk_vectors, axis=0)
            self.create_doc_obj(doc_vector, document)

    def reduce_dims(self, plot=False):
        """Project the document embeddings to 2-D and store them as x / y.

        Only ``proj_method == 'umap'`` is implemented; any other value leaves
        ``self.doc_embs`` untouched.

        Args:
            plot: Currently unused; kept for interface compatibility.
        """
        if self.proj_method == 'umap':
            reducer = umap.UMAP(
                n_neighbors=15,
                min_dist=.1,
                n_components=2,
                metric='cosine')

            coords = reducer.fit_transform(np.asarray(self.doc_embs['emb']))
            self.doc_embs['x'] = coords[:, 0].tolist()
            self.doc_embs['y'] = coords[:, 1].tolist()

    def cluster_embs(self, num_clusters=10, reduced=False):
        """Assign a cluster label to every document.

        Args:
            num_clusters: Number of KMeans clusters (ignored for custom
                labels).
            reduced: If true, cluster on the 2-D projection (both x and y,
                so ``reduce_dims`` must have run); otherwise cluster on the
                full embeddings.
        """
        if isinstance(self.cluster_method, list):
            # Caller supplied the labels directly: store them (the original
            # code compared with `==` here and silently discarded them).
            self.doc_embs['clusters'] = list(self.cluster_method)
            return

        if self.cluster_method == 'kmeans':
            if reduced:
                # KMeans needs a 2-D (n_samples, n_features) array, so use
                # both projection coordinates rather than x alone.
                features = np.column_stack((self.doc_embs['x'], self.doc_embs['y']))
            else:
                features = np.asarray(self.doc_embs['emb'])
            labels = cluster.KMeans(n_clusters=num_clusters, random_state=0).fit_predict(features)
            self.doc_embs['clusters'] = labels.tolist()

    def dump_embs(self):
        """Write ``self.doc_embs`` as JSON for the searcher front end.

        The file name is ``<database>_<proj_method>_<cluster>.json`` where
        ``<cluster>`` is the cluster method name, or ``'custom'`` when labels
        were supplied as a list. The raw embeddings are written as an empty
        list; the in-memory embeddings are left intact so clustering can be
        re-run after an export.
        """
        cc = self.cluster_method if isinstance(self.cluster_method, str) else 'custom'
        out_path = f"learningmachines/searcher/static/searcher/bert_method_data/{self.qry_obj['database']}_{self.proj_method}_{cc}.json"
        # Shallow copy with the embeddings blanked out: same JSON as before,
        # without destroying self.doc_embs['emb'].
        payload = dict(self.doc_embs, emb=[])
        # `with` guarantees the handle is closed even if serialization fails.
        with open(out_path, 'w') as f:
            json.dump(payload, f)
        
                
            
        
        
        
  
# Match-all query against the Reddit index, capped at 40k hits, embedded with
# the stock uncased BERT checkpoint in chunks of at most 100 words.
model_name = 'bert-base-uncased'
test_qry_obj = dict(qry='', maximum_hits='40000', database='Reddit')
eg = EmbGenerator(model_name=model_name, qry_obj=test_qry_obj, max_length=100)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTextRepresentation: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Run the full pipeline in order: embed -> project -> cluster -> export.
for pipeline_stage in (eg.generate_embs, eg.reduce_dims, eg.cluster_embs, eg.dump_embs):
    pipeline_stage()



0it [00:00, ?it/s]

40000
{'qry': '', 'maximum_hits': '40000', 'database': 'Reddit', 'f_start': -1, 'f_end': -1, 'min_occurrence': -1, 'max_occurrence': -1}
search query {'size': 1000, 'query': {'match_all': {}}}


1000it [01:57,  6.75it/s]

40000
{'qry': '', 'maximum_hits': '40000', 'database': 'Reddit', 'f_start': -1, 'f_end': -1, 'min_occurrence': -1, 'max_occurrence': -1}
search query {'size': 1000, 'query': {'match_all': {}}}


1001it [01:58,  1.96it/s]

scroll 0 1000


2000it [03:52,  8.41it/s]

40000
{'qry': '', 'maximum_hits': '40000', 'database': 'Reddit', 'f_start': -1, 'f_end': -1, 'min_occurrence': -1, 'max_occurrence': -1}
search query {'size': 1000, 'query': {'match_all': {}}}
scroll 1 1000


2999it [05:48,  9.37it/s]

40000
{'qry': '', 'maximum_hits': '40000', 'database': 'Reddit', 'f_start': -1, 'f_end': -1, 'min_occurrence': -1, 'max_occurrence': -1}
search query {'size': 1000, 'query': {'match_all': {}}}


3001it [05:49,  3.61it/s]

scroll 2 1000


3707it [07:16,  6.67it/s]

In [130]:
# Print every collected document title, one per line.
collected_titles = eg.doc_embs['title']
for doc_title in collected_titles:
    print(doc_title)

Atmosphere       Launch Audio in a New Window
from    “An Attempt at Jealousy”
Attempted Assassination of the Queen
The Attic
Aubade
Aubade Ending with the Death of a Mosquito
Auguries Cast Aside
Auguries of Innocence
August 1914
The Aureole       Launch Audio in a New Window
Australasian Darters
The Author of Torah
The Author Reflects on His 35th Birthday
Author’s Prayer
Auto-Lullaby
Autobiography
Autobiography
Autumn
Autumn Psalm
Autumn's Way
Ave Maria
Avising the Bright Beams
Baby Ate a Microchip
Baby Wrens’ Voices
Back Road
Bad Year Anthem
Bait Goat
Balcony Scene
The Ballad which Anne Askew made and sang when she was in Newgate
Ballade of Modest Confession
Bantams in Pine-Woods       Launch Audio in a New Window
Bar Napkin Sonnet #11
Barber of the Pea       Launch Audio in a New Window
The Bard: A Pindaric Ode
Barry
Basal Cell
The Bath
Baudelaire
Bavaria
'Be Music, Night'
from    Beachy Head
the bear and the salmon
The Bear
Bearings
Beautiful Signor
The Beauty Shell
“Because he swi