In [3]:
import pickle as pickle
from pprint import pprint
import pandas as pd
import re
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

In [3]:
# Load Model
with open("./data/saved_results/model/lda_k=5.pickle", "rb") as fp:
    lda_5 = pickle.load(fp)
    
# datawords
with open("./data/saved_results/data_words/ner=20210205_q=0.005.pickle", "rb") as fp:
    data_word = pickle.load(fp)

# load corpus
with open("./data/saved_results/data_words/ner=20210205_q=0.005_corpus.pickle", "rb") as fp:
    corpus = pickle.load(fp)
    
# load doc_lda
with open("./data/saved_results/model/lda_k=5_abstract_doc_lda.pickle", "rb") as fp:
    doc_lda = pickle.load(fp)

In [5]:
doc_lda

[[(0, 0.34567901234567905),
  (1, 0.10789049919484703),
  (2, 0.3332566520972319),
  (3, 0.16455793267387472),
  (4, 0.04861590368836746)],
 [(0, 0.2620721987810595),
  (1, 0.03234880450070323),
  (2, 0.01915477864844953),
  (3, 0.6599022168642421),
  (4, 0.026522001205545504)],
 [(0, 0.7624306527712965),
  (1, 0.09571213770933787),
  (2, 0.06175143879296934),
  (3, 0.032975579405817385),
  (4, 0.047130191320578614)],
 [(0, 0.020822397200349955),
  (1, 0.023272090988626423),
  (2, 0.931350247885681),
  (3, 0.014756488772236804),
  (4, 0.009798775153105862)],
 [(0, 0.03267195767195767),
  (1, 0.5948853615520282),
  (2, 0.014109347442680775),
  (3, 0.136331569664903),
  (4, 0.22200176366843033)],
 [(0, 0.048171966178397044),
  (1, 0.03924020483506014),
  (2, 0.18065975943789447),
  (3, 0.12355603191616053),
  (4, 0.6083720376324878)],
 [(0, 0.04344418318217445),
  (1, 0.8630239241592954),
  (2, 0.012018064419811146),
  (3, 0.05617138804911732),
  (4, 0.025342440189601766)],
 [(0, 0.01630

In [3]:

# article
with open("./data/saved_results/data_words/article_sq.pickle", "rb") as fp:
    articles = pickle.load(fp)

In [2]:
def topics_df(model, bar=0.003, num_words=40):
    """
    Use this method to display topics in one DataFrame

    @param
    model: input model 
    bar: threshold of word frequency 
    num_words: number of words for each topic
    """
    topics = {}
    for idx, item in model.print_topics(num_words=num_words):
        idx += 1
        topic = []
        for num, j in zip(re.findall(r'\s(0.0[0-9]{2})\*', item),re.findall('\"([a-zA-Z]+)\"', item)):
            
            # set the threshold
            if float(num) >= bar:
                topic.append(j)
        topics[f'Topic{idx}'] = ', '.join(topic)

    # create the dataframe 
    df = pd.DataFrame(topics.values(), columns=['Terms per Topic'], index=topics.keys())
    return df

In [8]:
import re
pd.set_option('display.max_colwidth', None)

In [17]:
lda_5.alpha

array([10., 10., 10., 10., 10.])

In [11]:
lda_5.alpha

array([1., 1., 1., 1., 1.])

In [18]:
topics_df(lda_5)

Unnamed: 0,Terms per Topic
Topic1,"mrna, terminator, repression, GFP, strength, ribosome, rb, membrane, IPTG, strains, RBS, supplementary"
Topic2,"light, domain, activation, ligand, ion, fusion, switch, sensor, receptor, ed, GFP, mCherry, intensity, mammalian, membrane"
Topic3,"biosynthesis, titer, metabolite, carbon, fermentation, flux, compound, deletion, biosynthetic, biosensor, strains, fatty, heterologous, intracellular, recombinant, overexpression, supernatant, precursor"
Topic4,"circuit, network, output, input, behavior, gate, strand, device, simulation, module, algorithm, noise, distribution, degradation, population, logic, switch, domain"
Topic5,"yeast, cassette, tRNA, residue, bp, CRISPR, recombination, cluster, DNA, incorporation, transformation, ligation"


In [11]:
lda_5.get_topics()[0]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       1.53210683e-05, 0.00000000e+00, 6.12842732e-06])

In [12]:
lda_5.print_topics()

[(0,
  '0.013*"circuit" + 0.007*"input" + 0.007*"network" + 0.006*"output" + 0.005*"behavior" + 0.005*"gate" + 0.004*"strand" + 0.004*"simulation" + 0.004*"device" + 0.004*"module"'),
 (1,
  '0.006*"°c" + 0.006*"ion" + 0.006*"membrane" + 0.005*"residue" + 0.005*"peptide" + 0.004*"surface" + 0.003*"affinity" + 0.003*"fusion" + 0.003*"secretion" + 0.003*"E."'),
 (2,
  '0.011*"E." + 0.007*"S." + 0.007*"°c" + 0.005*"biosynthesis" + 0.005*"titer" + 0.004*"carbon" + 0.004*"cluster" + 0.004*"metabolite" + 0.004*"fermentation" + 0.004*"flux"'),
 (3,
  '0.008*"°c" + 0.007*"E." + 0.005*"terminator" + 0.004*"mrna" + 0.004*"bp" + 0.004*"GFP" + 0.004*"repression" + 0.003*"tRNA" + 0.003*"cassette" + 0.003*"rb"'),
 (4,
  '0.006*"sensor" + 0.005*"light" + 0.005*"activation" + 0.005*"E." + 0.005*"ligand" + 0.005*"GFP" + 0.004*"switch" + 0.004*"biosensor" + 0.003*"riboswitch" + 0.003*"receptor"')]

In [83]:
data_word[9][667:]

['aroc-',
 'MenF',
 'AroD',
 'AroZ',
 'arod-',
 'AroZ',
 'examined',
 'linker',
 'MA',
 'aroc-',
 'MenF',
 'arod-',
 'AroZ',
 'MA',
 'linker',
 'flexible',
 'linker',
 'rigid',
 'linker',
 'supplementary',
 'flexible',
 'linker',
 'suitable',
 'aroc-',
 'MenF',
 'arod-',
 'AroZ',
 'supplementary',
 'display',
 'MA',
 'transfor',
 'man',
 'cultivation',
 'CFT51c',
 'aroc-',
 'MenF',
 'CFT53c',
 'arod-',
 'AroZ',
 'MA',
 'MA',
 'CFT51c',
 '16-fold',
 'CFT51b',
 'express',
 'unfused',
 'AroC',
 'MenF',
 'CFT51b',
 'CFT51c',
 'MenF',
 'fused',
 'aroc',
 'MenF',
 'fusion',
 'western',
 'blotting',
 'supplementary',
 'aroc-',
 'MenF',
 'fusion',
 'placed',
 'close',
 'proximity',
 'thereby',
 'apparent',
 'intermediate',
 'chorismate',
 'around',
 'MenF',
 'nonfusion',
 'carbon',
 'flux',
 'MA',
 'biosynthesis',
 'competing',
 'branched',
 'compound',
 'channeling',
 'fusion',
 'arod-',
 'AroZ',
 'fusion',
 'improvement',
 'MA',
 'noticed',
 'CFT53b',
 'AroD',
 'AroZ',
 'overexpressed',
 'se

In [84]:
data_word_v2[9][667:]

['AroC',
 '-menf',
 'AroD',
 'AroZ',
 'arod-',
 'AroZ',
 'examined',
 'linker',
 'MA',
 'AroC',
 '-menf',
 'arod-',
 'AroZ',
 'MA',
 'linker',
 'flexible',
 'linker',
 'rigid',
 'linker',
 'supplementary',
 'flexible',
 'linker',
 'suitable',
 'AroC',
 '-menf',
 'arod-',
 'AroZ',
 'supplementary',
 'display',
 'MA',
 'transfor',
 'man',
 'cultivation',
 'CFT51c',
 'AroC',
 '-menf',
 'CFT53c',
 'arod-',
 'AroZ',
 'MA',
 'MA',
 'CFT51c',
 '16-fold',
 'CFT51b',
 'express',
 'unfused',
 'AroC',
 'MenF',
 'CFT51b',
 'CFT51c',
 'MenF',
 'fused',
 'AroC',
 'menf',
 'fusion',
 'western',
 'blotting',
 'supplementary',
 'AroC',
 '-menf',
 'fusion',
 'placed',
 'close',
 'proximity',
 'thereby',
 'apparent',
 'intermediate',
 'chorismate',
 'around',
 'MenF',
 'nonfusion',
 'carbon',
 'flux',
 'MA',
 'biosynthesis',
 'competing',
 'branched',
 'compound',
 'channeling',
 'fusion',
 'arod-',
 'AroZ',
 'fusion',
 'improvement',
 'MA',
 'noticed',
 'CFT53b',
 'AroD',
 'AroZ',
 'overexpressed',
 'se

In [85]:
jsonfiles[corpus[9]]

'The global market for adipic acid, an important aliphatic dicarboxylic acid, is estimated to reach £8 billion by 2022. More than half of the total adipic acid production is from cyclohexane, and the remainder is from cyclohexene, uncoupled cyclohexanol/cyclohexanone (KA oil), and phenol. The production of nylon-66 fibers and engineering resins accounted for approximately 57% of the total amount of adipic acid consumed in 2016. Recently, microbial production of adipic acid has attracted attention as a solution to the exhaustion of finite fossil resources along with the increasing global demand of adipic acid. The production of bioadipic acid, wherein carbons are derived from a renewable feedstock, may solve these problems. Yu et al. prepared 639 ± 34 μg/L of adipic acid from 10 g/L glucose via an artificial pathway with acetyl-CoA and succinyl-CoA as starting units (ACSC1 pathway) in Escherichia coli. Cheong et al. created an E. coli strain producing adipic acid at higher amount by int

In [86]:
path2entity[corpus[9]]

{') 3',
 '. DHS',
 '2-aminobenzoate',
 '3-dehydroquinate dehydratase',
 '3-dehydroshikimate',
 '3-deoxy-d-heptulosonate-7-phosphate',
 '3-hydroxybenzoate',
 '4-hydroxybenzoate',
 '5-O-(1-Carboxyvinyl)-3-phosphoshikimate',
 'AA',
 'ACSC1',
 'ACSC2',
 'ADP',
 'ATP',
 'Acetyl-CoA',
 'Adipyl-CoA',
 'AroC',
 'AroD',
 'AroY',
 'AroZ',
 'Bacillus thuringiensis',
 'CFT5',
 'CFT5 strain',
 'CFT5-Derived Strains',
 'CFT5-derived',
 'CFT5-derived strains',
 'CFT51a',
 'CFT51b',
 'CFT51c',
 'CFT52a',
 'CFT52a cultures',
 'CFT53a',
 'CFT53b',
 'CFT53c',
 'CaCO',
 'CaCO 3',
 'CaCl 2',
 'CatA',
 'Chorismate',
 'DAHP',
 'DAHP synthase',
 'DHS',
 'DHS dehydratase',
 'E.',
 'E. coli',
 'E4P',
 'Escherichia coli',
 'FeSO 4',
 'G6P',
 'GGS',
 'GalP',
 'Glk',
 'Glucose',
 'HCl',
 'K. pneumoniae',
 'KH 2 PO 4',
 'KT2440',
 'L-tryptophan',
 'M9',
 'MA',
 'MenF',
 'MgSO 4',
 'NH 4 Cl',
 'NH 4 OH',
 'Na 2 HPO 4',
 'NaCl',
 'NovaBlue competent cells',
 'P. putida',
 'PBr',
 'PCA',
 'PCA decarboxylase',
 'PEP',


In [157]:
from src.data.load_data import *
from src.data import clean
import pickle

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

def load_json_ner(json_input_path, ner_input_path, contain_abstract):
    json_files = load_json_files(json_input_path, contain_abstract)
    path2entity = load_ner_entities(ner_input_path, json_files.keys())

    return json_files, path2entity

def create_data_words(json_files, path2entity, q):
    article_sq = []
    for k in json_files:
        if k not in path2entity:
            path2entity[k] = set()

    data_words = []
    for k in json_files.keys():
        article_sq.append(k)
        data_words.append(clean.NERCleanText(k, json_files[k], path2entity))

    if q > 0:
        corpus = [' '.join(d) for d in data_words]
        tfidf = TfidfTransformer()
        vocabulary = list(set(' '.join(corpus).split()))
        pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),('tfid', TfidfTransformer())]).fit(corpus)
        ser = pd.Series(index = vocabulary, data = pipe['tfid'].idf_)

        # create stop words list 
        stops = ser[ser<ser.quantile(q)].sort_values().index.tolist()

        #update data_words 
        data_words_cleaned = []
        for d in data_words:
            data_words_cleaned.append([w for w in d if w not in stops])
        data_words = data_words_cleaned

    return data_words, article_sq
    
def save_data_words(json_input_path, ner_input_path, contain_abstract, q, output_path_data_words, output_path_a_sq):
    json_files, path2entity = load_json_ner(json_input_path, ner_input_path, contain_abstract)
    data_words, a_sq = create_data_words(json_files, path2entity, q)
    return data_words

In [158]:
def load_params(fp):
    """
    Load params from json file 
    """
    with open(fp) as fh:
        param = json.load(fh)

    return param

In [159]:
params = load_params('config/data_words.json')

In [160]:
jsonfiles, path2entity = load_json_ner(params['json_input_path'], params['ner_input_path'], False)

In [73]:
params = load_params('config/data_words.json')
data_words = save_data_words(**params)

In [70]:
data_words2 = save_data_words(**params)

In [74]:
data_words==data_words2

True

### Topic Terms & Dominant topic counts by documents

In [109]:
topics_df(lda_5_v2)

Unnamed: 0,Terms per Topic
Topic1,"circuit, input, output, network, gate, behavior, device, strand, simulation, module, algorithm, noise, distribution, population, degradation"
Topic2,"IPTG, GFP, rb, biosensor, mrna, repression, sensor, inducer, LB, strains, supplementary, strength, lac, riboswitch, regulator, RBS, cultures, inducible, constitutive, arabinose"
Topic3,"cassette, bp, cluster, integration, deletion, terminator, recombination, CRISPR, chromosome, marker, clone, cerevisiae, locus, transformation, DNA, kb, assembled, ligation, homologous, sgrna, editing"
Topic4,"light, ion, membrane, ligand, peptide, fusion, surface, activation, affinity, intensity, switch, receptor, ed, image, antibody, residue, scaffold"
Topic5,"compound, biosynthesis, tRNA, flux, titer, carbon, metabolite, fermentation, fatty, residue, precursor, mass, synthase"


In [8]:
import numpy as np

In [1]:
tm_results = lda_5_v2[corpus]

NameError: name 'lda_5_v2' is not defined

In [6]:
corpus_topics = [sorted(topics, key=lambda record: -record[1])[0] for topics in tm_results]

In [11]:
corpus_topic_df = pd.DataFrame({'Dominant Topic': [item[0]+1 for item in corpus_topics]})

In [12]:
corpus_topic_df

Unnamed: 0,Dominant Topic
0,2
1,1
2,2
3,5
4,4
...,...
902,2
903,4
904,5
905,5


In [24]:
dominant_topic_df = corpus_topic_df.groupby('Dominant Topic').agg(
                                  Doc_Count = ('Dominant Topic', np.size),
                                  Total_Docs_Perc = ('Dominant Topic', np.size)).reset_index()

In [25]:
dominant_topic_df['Total_Docs_Perc'] = dominant_topic_df['Total_Docs_Perc'].apply(lambda row: round((row*100) / len(corpus), 2))

dominant_topic_df

Unnamed: 0,Dominant Topic,Doc_Count,Total_Docs_Perc
0,1,201,22.16
1,2,168,18.52
2,3,185,20.4
3,4,170,18.74
4,5,183,20.18


In [3]:
import numpy as np
import pandas as pd
import nltk
import re
import string
import scipy
import six
import collections
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens  


### Clean our Text and return a list of tokens ###
class PlainTokenizer(object):
    """
        Runs basic tokenization (punctuation splitting, lower casing, etc.).
        Ner terms aren't used in this tokenizer. 
    """

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.
        Args:
            do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def _is_whitespace(self, char):
        """Checks whether `chars` is a whitespace character."""
        # \t, \n, and \r are technically contorl characters but we treat them
        # as whitespace since they are generally considered as such.
        if char == " " or char == "\t" or char == "\n" or char == "\r":
            return True
        cat = unicodedata.category(char)
        if cat == "Zs":
            return True
        return False

    def _is_control(self, char):
        """Checks whether `chars` is a control character."""
        # These are technically control characters but we count them as whitespace
        # characters.
        if char == "\t" or char == "\n" or char == "\r":
            return False
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            return True
        return False

    def _is_punctuation(self, char):
        """Checks whether `chars` is a punctuation character."""
        cp = ord(char)
        # We treat all non-letter/number ASCII as punctuation.
        # Characters such as "^", "$", and "`" are not in the Unicode
        # Punctuation class but we treat them as punctuation anyways, for
        # consistency.

        # '-' not break 
        if cp == 45:
            return False
        if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
            return True
        cat = unicodedata.category(char)
        if cat.startswith("P"):
            return True
        return False
    
    def tokenize(self, text):
        """Tokenizes a piece of text."""

        # convert 'text' to unicode 
        text = convert_to_unicode(text)

        # remove invalide char and white space 
        text = self._clean_text(text)

        # spliting on a piece of text 
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case:
                token = token.lower()

                # remove accent (Nonspacing Mark from the tokens)
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if self._is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or self._is_control(char):
                continue
            if self._is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class NERTokenizer(PlainTokenizer):

    """
        Subclass of PlainTokenizer and override 'tokenize' function 
        to keep the ner terms intact
    """

    def __init__(self, do_lower_case=True):
        super().__init__(do_lower_case=True)

    def tokenize(self, text, terms):
        """Tokenizes a piece of text."""
        text = convert_to_unicode(text)
        text = self._clean_text(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []

        for token in orig_tokens:

            # remain terms untouched
            if token in terms:
                split_tokens.extend([token])
                continue

            # if the token contains multiple (>=1) terms
            elif sum([term in token for term in terms]) > 0:
                for term in terms:
                    if term in token:
                        splited = token.split(term)

                        # result: token_0 + term + token_1
                        token = splited[0] #token_0
                        if self.do_lower_case:
                            token = token.lower()
                            token = self._run_strip_accents(token)
                        split_tokens.extend(self._run_split_on_punc(token))
                        split_tokens.append(term) # term
                        token = splited[1] # token_1
                        if self.do_lower_case:
                            token = token.lower()
                            token = self._run_strip_accents(token)
                        split_tokens.extend(self._run_split_on_punc(token))
                continue

            # if text is not in terms 
            else:
                if self.do_lower_case:
                    token = token.lower()
                    token = self._run_strip_accents(token)
                split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens


In [66]:


from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string
import re

def NonNerCleanText(data):

    # basic tokenizer 
    B_tokenizer = tokenizer.PlainTokenizer()

    # set stop words
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union({'et', 'al', 'use', 'using', 'used'})
    # punctuations = string.punctuation.replace('-','')
    lemmatizer = WordNetLemmatizer()

    data = re.sub(r'\([^()]*\)', '', data)
    for tag in ['REFEND', 'REF', 'EQL', 'FIG']:
        data = data.replace(tag, '')
    words = [s for s in B_tokenizer.tokenize(data) if re.match("^[A-Za-z0-9\-]+$", s)]
    words = [w for w in words if not w in stop_words]
    words = [lemmatizer.lemmatize(s) for s in words]
    words = [s for s in words if not re.match("^[0-9]+$", s)]
    words = [s for s in words if not len(s) == 1]

    # Only keeping the nouns 
    is_noun = lambda pos: pos[:2] == 'NN'
    words = [word for (word, pos) in pos_tag(words) if is_noun(pos) or '-' in word] 

    return words

def NERCleanText(key, data, path2entity):
    """
        This method cleans the text and creates tokens 
        @param filepath: path to articles 
        @oaram path2entity: ner dictionary 

        @return words: nested list; 
                       each inner list contains the tokens of each article 
    """
    # using ner tokenizer which keep ner terms intact 
    B_tokenizer = NERTokenizer()
    
    # set stop words 
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union({'et', 'al', 'use', 'using', 'used'})
    punctuations = string.punctuation
    lemmatizer = WordNetLemmatizer()

    # load NER entities
    entities = list(path2entity[key])
    entities.sort(key=len, reverse=True)    

    # open file read data 
    

    # non-words tag
    for tag in ['REFEND', 'REF', 'EQL', 'FIG']:
        data = data.replace(tag, '')
    
    # check NER terms
    terms = []
    for t in entities:

        # replace " " with "_" for NER terms
        if ' ' in t:
            tnew = t.replace(' ', '_')
            data = data.replace(t, tnew)
            terms.append(tnew)
        else:
            terms.append(t)
    terms.sort(key=len, reverse=True)
    
    # using NERTokenizer
    words = [s for s in B_tokenizer.tokenize(data, terms) if not (len(s) == 1 and (s in punctuations))]
  
    # remove stop words 
    words = [w for w in words if not w in stop_words]
    words_new = []
    for w in words:

        # if ner terms, keep intact 
        if w in terms:
            words_new.append(w.replace('_', ' '))

        # else lemmatize
        else:
            words_new.append(lemmatizer.lemmatize(w))
    
    words = words_new
    words = [s for s in words if not re.match("^[0-9]+$", s)]
    words = [s for s in words if not len(s) == 1]
   
    return words


In [67]:
from src.data.load_data import *
import pickle

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

def load_json_ner(json_input_path, ner_input_path, contain_abstract):
    json_files = load_json_files(json_input_path, contain_abstract)
    path2entity = load_ner_entities(ner_input_path, json_files.keys())

    return json_files, path2entity

def create_data_words(json_files, path2entity, q):
    article_sq = []
    for k in json_files:
        if k not in path2entity:
            path2entity[k] = set()

    data_words = []
    for k in json_files.keys():
        article_sq.append(k)
        data_words.append(NERCleanText(k, json_files[k], path2entity))
        

#     if q > 0:
#         corpus = [' '.join(d) for d in data_words]
#         tfidf = TfidfTransformer()
#         vocabulary = list(set(' '.join(corpus).split()))
#         pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),('tfid', TfidfTransformer())]).fit(corpus)
#         ser = pd.Series(index = vocabulary, data = pipe['tfid'].idf_)

#         # create stop words list 
#         stops = ser[ser<ser.quantile(q)].sort_values().index.tolist()

#         #update data_words 
#         data_words_cleaned = []
#         for d in data_words:
#             data_words_cleaned.append([w for w in d if w not in stops])
#         data_words = data_words_cleaned


    return data_words, article_sq
    
def save_data_words(json_input_path, ner_input_path, contain_abstract, q, output_path_data_words, output_path_a_sq):
    json_files, path2entity = load_json_ner(json_input_path, ner_input_path, contain_abstract)
    data_words, a_sq = create_data_words(json_files, path2entity, q)
    return data_words

In [68]:
data_words = save_data_words(**params)

In [69]:
data_words[9][667:]

['another',
 'mde',
 'reaction',
 'intermediate',
 'produced',
 'fusion',
 'enzyme',
 'diffuse',
 'increasing',
 'efficiency',
 'subsequent',
 'reaction',
 'hence',
 'developing',
 'artificial',
 'metabolic',
 'channeling',
 'fusion',
 'protein',
 'powerful',
 'tool',
 'increasing',
 'carbon',
 'flux',
 'specific',
 'pathway',
 'study',
 'investigates',
 'effect',
 'overexpressing',
 'fusion',
 'protein',
 'shikimate',
 'pathway',
 'gene-level',
 'fusion',
 'method',
 'MA',
 'yield',
 'hence',
 'MA',
 'synthesis',
 'pathway',
 'introduced',
 'strain',
 'increased',
 'shikimate',
 'pathway',
 'flux',
 'described',
 'selecting',
 'optimal',
 'MA',
 'synthesis',
 'pathway',
 'three',
 'candidate',
 'production',
 'MA',
 'selected',
 'pathway',
 'increased',
 'overexpressing',
 'chorismate synthase',
 'AroC',
 'finally',
 'effect',
 'gene-level',
 'fusion',
 'method',
 'investigated',
 'production',
 'MA',
 'increased',
 'overexpressing',
 'gene-level',
 'fusion',
 'protein',
 'AroC',
 'Me

In [72]:


from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string
import re

def NonNerCleanText(data):

    # basic tokenizer 
    B_tokenizer = tokenizer.PlainTokenizer()

    # set stop words
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union({'et', 'al', 'use', 'using', 'used'})
    # punctuations = string.punctuation.replace('-','')
    lemmatizer = WordNetLemmatizer()

    data = re.sub(r'\([^()]*\)', '', data)
    for tag in ['REFEND', 'REF', 'EQL', 'FIG']:
        data = data.replace(tag, '')
    words = [s for s in B_tokenizer.tokenize(data) if re.match("^[A-Za-z0-9\-]+$", s)]
    words = [w for w in words if not w in stop_words]
    words = [lemmatizer.lemmatize(s) for s in words]
    words = [s for s in words if not re.match("^[0-9]+$", s)]
    words = [s for s in words if not len(s) == 1]

    # Only keeping the nouns 
    is_noun = lambda pos: pos[:2] == 'NN'
    words = [word for (word, pos) in pos_tag(words) if is_noun(pos) or '-' in word] 

    return words

def NERCleanText(key, data, path2entity):
    """
        This method cleans the text and creates tokens 
        @param filepath: path to articles 
        @oaram path2entity: ner dictionary 

        @return words: nested list; 
                       each inner list contains the tokens of each article 
    """
    # using ner tokenizer which keep ner terms intact 
    B_tokenizer = NERTokenizer()
    
    # set stop words 
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union({'et', 'al', 'use', 'using', 'used'})
    punctuations = string.punctuation
    lemmatizer = WordNetLemmatizer()

    # load NER entities
    entities = list(path2entity[key])
    entities.sort(key=len, reverse=True)    

    # open file read data 
    

    # non-words tag
    for tag in ['REFEND', 'REF', 'EQL', 'FIG']:
        data = data.replace(tag, '')
    
    # check NER terms
    terms = []
    for t in entities:

        # replace " " with "_" for NER terms
        if ' ' in t:
            tnew = t.replace(' ', '_')
            data = data.replace(t, tnew)
            terms.append(tnew)
        else:
            terms.append(t)
    terms.sort(key=len, reverse=True)
    
    # using NERTokenizer
    words = [s for s in B_tokenizer.tokenize(data, terms) if not (len(s) == 1 and (s in punctuations))]
  
    # remove stop words 
    words = [w for w in words if not w in stop_words]
    words_new = []
    for w in words:

        # if ner terms, keep intact 
        if w in terms:
            words_new.append(w.replace('_', ' '))

        # else lemmatize
        else:
            words_new.append(lemmatizer.lemmatize(w))
    
    words = words_new
    words = [s for s in words if not re.match("^[0-9]+$", s)]
    words = [s for s in words if not len(s) == 1]
   
    return words


In [70]:
texts = 'AroC and MenF (AroC-MenF) in Pathway 1 and AroD and AroZ (AroD-AroZ) in Pathway 3 (Figure A). First we examined the effect of linker type on MA production. In AroC-MenF and AroD-AroZ, MA production was compared in the case of using without linker, flexible linker, and rigid linker (Supplementary Figure S4), respectively, and flexible linker was most suitable for AroC-MenF and AroD-AroZ (Supplementary Table S4). Figure displays the amount of MA produced by each transformant after 72 h cultivation. CFT51c (AroC-MenF) and CFT53c (AroD-AroZ) produced 3.45 ± 0.04 and 1.20 ± 0.10 g/L of MA, respectively. The production of MA in CFT51c was 2.16-fold higher than that produced by CFT51b, which expresses the unfused AroC and MenF proteins. We confirmed that CFT51b and CFT51c expressed MenF and fused AroC/MenF fusion protein respectively by Western blotting analysis (Supplementary Figure S5). In the AroC-MenF fusion protein, the two enzymes are placed in close proximity, thereby increasing the apparent concentration of the intermediate (chorismate) around MenF relative to that of the nonfusion enzyme, resulting in higher carbon flux to the MA biosynthesis pathway (Figure).'

In [83]:
from src.data.load_data import *
import pickle

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

def load_json_ner(json_input_path, ner_input_path, contain_abstract):
    json_files = load_json_files(json_input_path, contain_abstract)
    path2entity = load_ner_entities(ner_input_path, json_files.keys())

    return json_files, path2entity

def create_data_words(json_files, path2entity, q):
    article_sq = []
    for k in json_files:
        if k not in path2entity:
            path2entity[k] = set()

    data_words = []
    for k in json_files.keys():
        k_ = 'sb8b00380'
        #article_sq.append(k)
        data_words.append(NERCleanText(k_, texts, path2entity))
        break

    if q > 0:
        corpus = [' '.join(d) for d in data_words]
        tfidf = TfidfTransformer()
        vocabulary = list(set(' '.join(corpus).split()))
        pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),('tfid', TfidfTransformer())]).fit(corpus)
        ser = pd.Series(index = vocabulary, data = pipe['tfid'].idf_)

        # create stop words list 
        stops = ser[ser<ser.quantile(q)].sort_values().index.tolist()

        #update data_words 
        data_words_cleaned = []
        for d in data_words:
            data_words_cleaned.append([w for w in d if w not in stops])
        data_words = data_words_cleaned
      

    return data_words, article_sq
    
def save_data_words(json_input_path, ner_input_path, contain_abstract, q, output_path_data_words, output_path_a_sq):
    json_files, path2entity = load_json_ner(json_input_path, ner_input_path, contain_abstract)
    data_words, a_sq = create_data_words(json_files, path2entity, q)
    return data_words

In [84]:
data_word_9 = save_data_words(**params)

In [81]:
articles[9]

'sb8b00380'

In [82]:
data_word_9[0]

['AroC',
 'MenF',
 'aroc-',
 'MenF',
 'pathway',
 'AroD',
 'AroZ',
 'AroD',
 '-aroz',
 'pathway',
 'figure',
 'first',
 'examined',
 'effect',
 'linker',
 'type',
 'MA',
 'production',
 'aroc-',
 'MenF',
 'AroD',
 '-aroz',
 'MA',
 'production',
 'compared',
 'case',
 'without',
 'linker',
 'flexible',
 'linker',
 'rigid',
 'linker',
 'supplementary',
 'figure',
 's4',
 'respectively',
 'flexible',
 'linker',
 'suitable',
 'aroc-',
 'MenF',
 'AroD',
 '-aroz',
 'supplementary',
 'table',
 's4',
 'figure',
 'display',
 'amount',
 'MA',
 'produced',
 'transfor',
 'man',
 'cultivation',
 'CFT51c',
 'aroc-',
 'MenF',
 'CFT53c',
 'AroD',
 '-aroz',
 'produced',
 'MA',
 'respectively',
 'production',
 'MA',
 'CFT51c',
 '16-fold',
 'higher',
 'produced',
 'CFT51b',
 'express',
 'unfused',
 'AroC',
 'MenF',
 'protein',
 'confirmed',
 'CFT51b',
 'CFT51c',
 'expressed',
 'MenF',
 'fused',
 'aroc',
 'MenF',
 'fusion',
 'protein',
 'respectively',
 'western',
 'blotting',
 'analysis',
 'supplementary

In [85]:
data_word_9[0]

['AroC',
 'MenF',
 'aroc-',
 'MenF',
 'pathway',
 'AroD',
 'AroZ',
 'AroD',
 '-aroz',
 'pathway',
 'figure',
 'first',
 'examined',
 'effect',
 'linker',
 'type',
 'MA',
 'production',
 'aroc-',
 'MenF',
 'AroD',
 '-aroz',
 'MA',
 'production',
 'compared',
 'case',
 'without',
 'linker',
 'flexible',
 'linker',
 'rigid',
 'linker',
 'supplementary',
 'figure',
 's4',
 'respectively',
 'flexible',
 'linker',
 'suitable',
 'aroc-',
 'MenF',
 'AroD',
 '-aroz',
 'supplementary',
 'table',
 's4',
 'figure',
 'display',
 'amount',
 'MA',
 'produced',
 'transfor',
 'man',
 'cultivation',
 'CFT51c',
 'aroc-',
 'MenF',
 'CFT53c',
 'AroD',
 '-aroz',
 'produced',
 'MA',
 'respectively',
 'production',
 'MA',
 'CFT51c',
 '16-fold',
 'higher',
 'produced',
 'CFT51b',
 'express',
 'unfused',
 'AroC',
 'MenF',
 'protein',
 'confirmed',
 'CFT51b',
 'CFT51c',
 'expressed',
 'MenF',
 'fused',
 'aroc',
 'MenF',
 'fusion',
 'protein',
 'respectively',
 'western',
 'blotting',
 'analysis',
 'supplementary

### NER Comparison

In [12]:

with open("./data/saved_results/model/noner_abstract_coherence.pickle", "rb") as fp:
    noNer_abs = pickle.load(fp)

    
with open("./data/saved_results/model/noner_coherence.pickle", "rb") as fp:
    noNer = pickle.load(fp)

with open("./data/saved_results/model/lda_k=5_abstract_coherence.pickle", "rb") as fp:
    ner = pickle.load(fp)

In [14]:
noNer.get_coherence()

0.5773225598096631

In [13]:
noNer_abs.get_coherence()

0.517957605486875

In [7]:
ner.get_coherence()

0.4248681962241365

In [23]:
with open("./data/saved_results/model/lda_k=5_noner.pickle", "rb") as fp:
    noNerModel = pickle.load(fp)

In [19]:
topics_df(noNerModel)

Unnamed: 0,Terms per Topic
Topic1,"yeast, codon, cassette, clone, trna, integration, bp, recombination, transformation, genomic, deletion, chromosome, sgrna, marker, locus, screening, crispr, replication, editing, template, round, assembled"
Topic2,"circuit, input, network, output, specie, strand, simulation, device, gate, behavior, algorithm, module, noise, sbol, distribution, feature, population, domain, logic, term, feedback"
Topic3,"gfp, circuit, induction, mrna, repression, translation, rb, sensor, iptg, regulatory, terminator, inducer, switch, biosensor, strength, activation, regulator, output, operator, repressor, upstream, ribosome, riboswitch, inducible, constitutive, degradation, atc, intensity"
Topic4,"domain, peptide, light, membrane, fusion, residue, surface, receptor, affinity, secretion, ligand, antibody, blue, activation, scaffold, image, supernatant"
Topic5,"glucose, mg, biosynthesis, titer, metabolite, carbon, compound, flux, fermentation, heterologous, biosynthetic, fatty, cluster, module, biomass, extract, operon, ethanol"


In [24]:
topics_df(noNerModel)

Unnamed: 0,Terms per Topic
Topic1,"yeast, codon, cassette, clone, trna, integration, bp, recombination, transformation, genomic, deletion, chromosome, sgrna, marker, locus, screening, crispr, replication, editing, template, round, assembled"
Topic2,"circuit, input, network, output, specie, strand, simulation, device, gate, behavior, algorithm, module, noise, sbol, distribution, feature, population, domain, logic, term, feedback"
Topic3,"gfp, circuit, induction, mrna, repression, translation, rb, sensor, iptg, regulatory, terminator, inducer, switch, biosensor, strength, activation, regulator, output, operator, repressor, upstream, ribosome, riboswitch, inducible, constitutive, degradation, atc, intensity"
Topic4,"domain, peptide, light, membrane, fusion, residue, surface, receptor, affinity, secretion, ligand, antibody, blue, activation, scaffold, image, supernatant"
Topic5,"glucose, mg, biosynthesis, titer, metabolite, carbon, compound, flux, fermentation, heterologous, biosynthetic, fatty, cluster, module, biomass, extract, operon, ethanol"


### Terminal Testing

In [187]:
# model no abstract
with open("../SBKS/data/saved_results/model/lda_k=5.pickle", "rb") as fp:
    v1 = pickle.load(fp)
    
# model 2 no abstract (for debugging)
with open("../SBKS/data/saved_results/model/lda_k=5_v2.pickle", "rb") as fp:
    v2 = pickle.load(fp)

In [188]:
# model with abstract
with open("../SBKS/data/saved_results/model/lda_k=5_abstract.pickle", "rb") as fp:
    lda_abstract = pickle.load(fp)

In [163]:
# datawords
with open("../SBKS/data/saved_results/data_words/ner=20210205_q=0.005.pickle", "rb") as fp:
    d1 = pickle.load(fp)
    
# datawords
with open("../SBKS/data/saved_results/data_words/ner=20210205_q=0.005_v2.pickle", "rb") as fp:
    d2 = pickle.load(fp)

In [164]:
# articles
with open("../SBKS/data/saved_results/data_words/article_sq.pickle", "rb") as fp:
    articles = pickle.load(fp)

In [167]:
topics_df(v1)

Unnamed: 0,Terms per Topic
Topic1,"circuit, input, network, output, behavior, gate, strand, simulation, device, module, algorithm, noise, distribution, population, degradation"
Topic2,"ion, membrane, residue, peptide, surface, affinity, fusion, secretion, light"
Topic3,"biosynthesis, titer, carbon, cluster, metabolite, fermentation, flux, deletion, biosynthetic, module, heterologous, compound, fatty, strains, cerevisiae, biomass, synthase, overexpression"
Topic4,"terminator, mrna, bp, GFP, repression, tRNA, cassette, rb, ribosome, strength, transformation, strains"
Topic5,"sensor, light, activation, ligand, GFP, switch, biosensor, riboswitch, receptor, mCherry, circuit, ed, intensity, output, supplementary, mammalian"


In [168]:
topics_df(v2)

Unnamed: 0,Terms per Topic
Topic1,"circuit, input, network, output, behavior, gate, strand, simulation, device, module, algorithm, noise, distribution, population, degradation"
Topic2,"ion, membrane, residue, peptide, surface, affinity, fusion, secretion, light"
Topic3,"biosynthesis, titer, carbon, cluster, metabolite, fermentation, flux, deletion, biosynthetic, module, heterologous, compound, fatty, strains, cerevisiae, biomass, synthase, overexpression"
Topic4,"terminator, mrna, bp, GFP, repression, tRNA, cassette, rb, ribosome, strength, transformation, strains"
Topic5,"sensor, light, activation, ligand, GFP, switch, biosensor, riboswitch, receptor, mCherry, circuit, ed, intensity, output, supplementary, mammalian"


In [170]:
topics_df(lda_abstract)

Unnamed: 0,Terms per Topic
Topic1,"mrna, terminator, repression, GFP, strength, ribosome, rb, membrane, IPTG, strains, RBS, supplementary"
Topic2,"light, domain, activation, ligand, ion, fusion, switch, sensor, receptor, ed, GFP, mCherry, intensity, mammalian, membrane"
Topic3,"biosynthesis, titer, metabolite, carbon, fermentation, flux, compound, deletion, biosynthetic, biosensor, strains, fatty, heterologous, intracellular, recombinant, overexpression, supernatant, precursor"
Topic4,"circuit, network, output, input, behavior, gate, strand, device, simulation, module, algorithm, noise, distribution, degradation, population, logic, switch, domain"
Topic5,"yeast, cassette, tRNA, residue, bp, CRISPR, recombination, cluster, DNA, incorporation, transformation, ligation"


In [189]:
topics_df(lda_abstract)

Unnamed: 0,Terms per Topic
Topic1,"mrna, terminator, repression, GFP, strength, ribosome, rb, membrane, IPTG, strains, RBS, supplementary"
Topic2,"light, domain, activation, ligand, ion, fusion, switch, sensor, receptor, ed, GFP, mCherry, intensity, mammalian, membrane"
Topic3,"biosynthesis, titer, metabolite, carbon, fermentation, flux, compound, deletion, biosynthetic, biosensor, strains, fatty, heterologous, intracellular, recombinant, overexpression, supernatant, precursor"
Topic4,"circuit, network, output, input, behavior, gate, strand, device, simulation, module, algorithm, noise, distribution, degradation, population, logic, switch, domain"
Topic5,"yeast, cassette, tRNA, residue, bp, CRISPR, recombination, cluster, DNA, incorporation, transformation, ligation"


In [149]:
d1 == d2

False

In [150]:
for i in range(len(d1)):
    if d1[i]!=d2[i]:
        print(i)

491
856


In [151]:
for i in range(len(d1[491])):
    if d1[491][i]!=d2[491][i]:
        print(i)

188
189


In [153]:
d1[491][188:190]

['tetra-', 'aspartic acid motif']

In [154]:
d2[491][188:190]

['tetra-aspartic acid', 'motif']

In [156]:
articles[491]

'sb8b00330'

In [161]:
jsonfiles[articles[491]]

'Numerous clinical trials have shown that adeno-associated virus (AAV) is a promising gene therapy vector that has low immunogenicity and is nonpathogenic. Additionally, AAV can be engineered as a biocomputing nanoplatform capable of delivering genetic material in response to specific stimuli. Controllable and targeted gene delivery vectors are necessary to decrease negative off-target side effects and limit toxicity to healthy tissue. Such vectors also allow for systemically delivered therapeutics to act preferentially at specific locations in the body. To achieve more controlled and targeted gene delivery, we previously developed a protease-activatable AAV platform, called provector. The provector has switchable behavior and transduces cells only in the presence of matrix metalloproteinases (MMPs), which are overexpressed in many diseased states, such as various cancers, stroke, congestive heart failure, and atherosclerosis.The first generation provector was based on AAV serotype 2 (

In [183]:
# coherence model with abstract
with open("../SBKS/data/saved_results/model/lda_k=5_abstract_coherence.pickle", "rb") as fp:
    coh_model = pickle.load(fp)
    
# coherence model w/o abstract
with open("../SBKS/data/saved_results/model/lda_k=5_coherence.pickle", "rb") as fp:
    coh_model_noabs = pickle.load(fp)

In [180]:
coh_model.get_coherence()

0.4248681962241365

In [184]:
coh_model_noabs.get_coherence()

0.45293548917502635

In [30]:
# Load Model
with open("./data/saved_results/model/lda_k=5_abstract.pickle", "rb") as fp:
    lda_5 = pickle.load(fp)
    
# coherence model with abstract
with open("./data/saved_results/model/lda_k=5_abstract_coherence.pickle", "rb") as fp:
    coh_model = pickle.load(fp)
    
# Load Model
with open("./data/saved_results/model/lda_k=5_a=5_abstract.pickle", "rb") as fp:
    lda_5_a_5 = pickle.load(fp)
    
# coherence model with abstract
with open("./data/saved_results/model/lda_k=5_a=5_abstract_coherence.pickle", "rb") as fp:
    coh_model_a_5 = pickle.load(fp)
    
# Load Model
with open("./data/saved_results/model/lda_k=5_a=30_abstract.pickle", "rb") as fp:
    lda_5_a_30 = pickle.load(fp)
    
# coherence model with abstract
with open("./data/saved_results/model/lda_k=5_a=30_abstract_coherence.pickle", "rb") as fp:
    coh_model_a_30 = pickle.load(fp)
    
# Load Model
with open("./data/saved_results/model/lda_k=5_a=100_abstract.pickle", "rb") as fp:
    lda_5_a_100 = pickle.load(fp)

# coherence model with abstract
with open("./data/saved_results/model/lda_k=5_a=100_abstract_coherence.pickle", "rb") as fp:
    coh_model_a_100 = pickle.load(fp)

In [21]:
topics_df(lda_5)

Unnamed: 0,Terms per Topic
Topic1,"mrna, terminator, repression, GFP, strength, ribosome, rb, membrane, IPTG, strains, RBS, supplementary"
Topic2,"light, domain, activation, ligand, ion, fusion, switch, sensor, receptor, ed, GFP, mCherry, intensity, mammalian, membrane"
Topic3,"biosynthesis, titer, metabolite, carbon, fermentation, flux, compound, deletion, biosynthetic, biosensor, strains, fatty, heterologous, intracellular, recombinant, overexpression, supernatant, precursor"
Topic4,"circuit, network, output, input, behavior, gate, strand, device, simulation, module, algorithm, noise, distribution, degradation, population, logic, switch, domain"
Topic5,"yeast, cassette, tRNA, residue, bp, CRISPR, recombination, cluster, DNA, incorporation, transformation, ligation"


In [31]:
coh_model.get_coherence()

0.4248681962241365

In [22]:
topics_df(lda_5_a_5)

Unnamed: 0,Terms per Topic
Topic1,"module, algorithm, base, SBOL, RBS, class, predicted, nucleotide, DNA, user"
Topic2,"yeast, cassette, bp, integration, CRISPR, recombination, GFP, ed, marker, mCherry, chromosome, locus, light, targeting, activation"
Topic3,"domain, ion, membrane, peptide, light, fusion, tRNA, residue, surface, affinity, ligand, secretion, incorporation"
Topic4,"circuit, output, network, input, behavior, GFP, mrna, gate, device, simulation, degradation, noise, repression, switch, strand, domain, population, inducer, IPTG, regulator, repressor, activation, operator, distribution, feedback"
Topic5,"biosynthesis, carbon, titer, metabolite, flux, fermentation, strains, biosynthetic, heterologous, cluster, yeast, fatty, compound"


In [32]:
coh_model_a_5.get_coherence()

0.40679661251908356

In [27]:
topics_df(lda_5_a_30)

Unnamed: 0,Terms per Topic
Topic1,"algorithm, module, SBOL, base, RBS, class, predicted, core, input, compound, file, user"
Topic2,"light, domain, ion, membrane, fusion, peptide, surface, affinity, activation, ed, antibody, GFP"
Topic3,"carbon, titer, biosynthesis, flux, fermentation, metabolite, strains, compound, fatty, intracellular, heterologous, synthase"
Topic4,"circuit, output, network, input, GFP, mrna, device, behavior, gate, repression, noise, degradation, strand, population, switch, domain, simulation, repressor, IPTG, inducer, regulator, activation, operator, ligand, feedback"
Topic5,"yeast, cassette, bp, terminator, tRNA, cluster, integration, cerevisiae, CRISPR, recombination, deletion, chromosome, transformation, marker, clone, strains, locus, GFP"


In [33]:
coh_model_a_30.get_coherence()

0.44419961366656563

In [28]:
topics_df(lda_5_a_100)

Unnamed: 0,Terms per Topic
Topic1,"module, algorithm, base, SBOL, predicted, RBS, strength, class, nucleotide, DNA, droplet"
Topic2,"yeast, cassette, bp, GFP, terminator, integration, CRISPR, sensor, recombination, ed, riboswitch, chromosome, marker, mCherry, day, targeting, activation, locus, clone"
Topic3,"domain, light, membrane, ion, fusion, peptide, tRNA, surface, affinity, residue, secretion, ligand, antibody, intensity"
Topic4,"circuit, output, network, input, behavior, strand, gate, simulation, mrna, device, noise, degradation, GFP, domain, switch, population, repression, distribution, regulator, repressor, feedback, operator, logic, inducer"
Topic5,"biosynthesis, titer, carbon, metabolite, strains, flux, fermentation, biosynthetic, cluster, heterologous, yeast, compound, fatty, cerevisiae, biomass, precursor"


In [34]:
coh_model_a_100.get_coherence()

0.4214844247835856

In [1]:
from Bio import Entrez

In [3]:
Entrez.email = 'dxiang@ucsd.edu'

In [77]:
search_query = 'ethics[Abstract] AND open access[filter] AND (("synthetic biology"[MeSH Terms] OR ("synthetic"[All Fields] AND "biology"[All Fields]) OR "synthetic biology"[All Fields]) AND ("ethics"[Subheading] OR "ethics"[All Fields] OR "ethics"[MeSH Terms]))'
handle = Entrez.esearch(db='pmc',
                        sort='relevance',
                        retmax='100000',
                        term=search_query,
                        usehistory='y')
results = Entrez.read(handle)
ids = results['IdList']

In [5]:
results

{'Count': '33', 'RetMax': '33', 'RetStart': '0', 'QueryKey': '1', 'WebEnv': 'MCID_6152917d8e61fe7a8d705dca', 'IdList': ['7123342', '7089176', '7226902', '7149545', '6630032', '7880687', '7014608', '5801888', '7863570', '5680775', '8316367', '8268180', '7052494', '6267627', '4681174', '6950820', '4071700', '3901125', '5147464', '6824826', '7805885', '6889464', '8435764', '7953679', '5561630', '8278188', '7683513', '4030158', '8011122', '5402672', '7853504', '8300107', '7695393'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'dilemma[Abstract]', 'Field': 'Abstract', 'Count': '5243', 'Explode': 'N'}, {'Term': 'open access[filter]', 'Field': 'filter', 'Count': '3852273', 'Explode': 'N'}, 'AND', {'Term': '"synthetic biology"[MeSH Terms]', 'Field': 'MeSH Terms', 'Count': '1294', 'Explode': 'Y'}, {'Term': '"synthetic"[All Fields]', 'Field': 'All Fields', 'Count': '1000963', 'Explode': 'N'}, {'Term': '"biology"[All Fields]', 'Field': 'All Fields', 'Count': '1746653', 'Explode': 'N'}, 'A

In [215]:
handle = Entrez.efetch(db='pmc', rettype='full', retstart=0, retmax=1, retmode='xml', webenv=results['WebEnv'], query_key=results['QueryKey'])

In [28]:
handle = Entrez.efetch(db='pmc', id='1470055', rettype='full',retmode='xml')

In [29]:
parsed = Entrez.read(handle, validate=False)

In [43]:
parsed[0].keys()

dict_keys(['sub-article', 'response', 'front', 'body'])

In [50]:
parsed[0]['front'].keys()

dict_keys(['list', 'def-list', 'notes', 'journal-meta', 'article-meta'])

In [48]:
parsed[0]['body'].keys()

dict_keys(['supplementary-material', 'chem-struct', 'table-wrap', 'preformat', 'related-article', 'disp-quote', 'graphic', 'sec', 'ack', 'fig-group', 'array', 'list', 'boxed-text', 'statement', 'verse-group', 'chem-struct-wrapper', 'p', 'speech', 'fig', 'mml:math', 'media', 'disp-formula', 'tex-math', 'def-list', 'table-wrap-group'])

In [73]:
import pprint

In [76]:
pprint.pprint(parsed[0]['front']['article-meta'])

{'abstract': [{'ack': [],
               'array': [],
               'boxed-text': [],
               'chem-struct': [],
               'chem-struct-wrapper': [],
               'def-list': [],
               'disp-formula': [],
               'disp-quote': [],
               'fig': [],
               'fig-group': [],
               'fn-group': [],
               'glossary': [],
               'graphic': [],
               'list': [],
               'media': [],
               'mml:math': [],
               'notes': [],
               'object-id': [],
               'p': ['The 12th meeting of the Scientific Group on Methodologies for the Safety Evaluation of Chemicals (SGOMSEC) considered the topic of methodologies for determining human and ecosystem susceptibility to environmental hazards. The report prepared at the meeting describes measurement of susceptibility through the use of biological markers of exposure, biological markers of effect, and biomarkers directly indicative of susc

In [82]:
for ids in results['IdList']:
    handle = Entrez.efetch(db='pmc', id=ids, rettype='full',retmode='xml')
    with open('../PMC data/ethics/pmc{}.xml'.format(ids), 'w') as f:
        f.write(handle.read())

AttributeError: 'bytes' object has no attribute 'to_string'

In [216]:
with open('../PMC data/ethics/6692427.xml', 'w') as f:
    f.write(str(handle.read()))

In [197]:
article

<generator object DataHandler.parse at 0x7fc69b07a250>

In [28]:
# def search(query):
#     Entrez.email = 'dxiang@ucsd.edu'
#     handle = Entrez.esearch(db='pmc', 
#                             sort='relevance', 
#                             retmax='20',
#                             retmode='txt', 
#                             term=query)
#     results = Entrez.read(handle)
#     return results

# def fetch_details(id_list):
#     ids = ','.join(id_list)
#     Entrez.email = 'dxiang@ucsd.edu'
#     handle = Entrez.efetch(db='pmc',
#                            retmode='txt',
#                            id=ids)
#     results = Entrez.read(handle)
#     return results

# if __name__ == '__main__':
#     results = search('synthetic biology ethics')
#     id_list = results['IdList']
#     papers = fetch_details(id_list)
#     for i, paper in enumerate(papers['PubmedArticle']):
#         print("{}) {}".format(i+1, paper['MedlineCitation']['Article']['ArticleTitle']))
