# LDA Tunning

In [99]:
import re
import nltk
import spacy
import gensim
import nlp
import pandas as pd
import numpy as np
import pickle

In [100]:
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
%matplotlib inline

## Loading data 

In [101]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/externo/joseas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [102]:
df = pd.read_csv("data/dimensions/publications-ecology-uniq.csv")
np.shape(df)

(13808, 55)

In [103]:
df

Unnamed: 0.1,Unnamed: 0,Rank,Publication ID,DOI,PMID,PMCID,ISBN,Title,Abstract,Acknowledgements,...,Dimensions URL,Fields of Research (ANZSRC 2020),RCDC Categories,HRCS HC Categories,HRCS RAC Categories,Cancer Types,CSO Categories,Units of Assessment,Sustainable Development Goals,GROUP
0,0,3366,pub.1148151862,10.1093/icb/icac055,35612972.0,,,An Integrative Perspective On the Mechanistic ...,It has long been known that the outcome of spe...,,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology,,,,,,A05 Biological Sciences,13 Climate Action,species interaction
1,1,3161,pub.1140217916,10.1086/716724,34762574.0,,,Species Interactions Limit the Predictability ...,Predicting how ecological communities will res...,,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology,,,,,,A05 Biological Sciences,14 Life Below Water,species interaction
2,2,3133,pub.1152523368,10.1111/ele.14139,36335559.0,PMC10099232,,Resetting our expectations for parasites and t...,"Despite the ubiquitous nature of parasitism, h...",We thank the authors for supplying data. We al...,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology; 41 Envir...,Infectious Diseases,,2.1 Biological and endogenous factors,,,A05 Biological Sciences,14 Life Below Water,species interaction
3,3,3130,pub.1145194743,10.1111/ele.13977,35106910.0,PMC9543015,,Disentangling key species interactions in dive...,Modelling species interactions in diverse comm...,This paper is a joint effort of the working gr...,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology,,,,,,A05 Biological Sciences,,species interaction
4,4,3099,pub.1149534290,10.1111/nph.18384,35842790.0,PMC9804646,,Impact of warmer and drier conditions on tree ...,Increased temperature and prolonged soil moist...,MD‐G and CG were supported by the Swiss Nation...,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology; 3108 Pla...,,,,,,A05 Biological Sciences,13 Climate Action,species interaction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13803,2619,297,pub.1005057415,10.1046/j.1523-1739.2000.98081.x,,,,A Survey and Overview of Habitat Fragmentation...,Abstract: Habitat destruction and fragmentatio...,We thank all those investigators who provided ...,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology; 3109 Zoo...,,,,,,,,interspecific interaction
13804,2620,284,pub.1034372806,10.2478/v10208-011-0015-3,,,,A gradient analytic perspective on distributio...,Abstract After massive proliferation over the ...,,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology,,,,,,A05 Biological Sciences,,interspecific interaction
13805,2621,261,pub.1029922063,10.3727/015613880791573853,,,,A Comparison of Two Populations of the Grey-cr...,Group-breeding is defined as breeding behaviou...,,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology; 3109 Zoo...,,,,,,"A06 Agriculture, Veterinary and Food Science",,interspecific interaction
13806,2622,252,pub.1147285073,10.1111/1365-2745.13895,,,,Temporal stabilizing effects of species richne...,The extent to which individuals experience int...,ACKNOWLEDGEMENTS We are grateful to all who as...,...,https://app.dimensions.ai/details/publication/...,31 Biological Sciences; 3103 Ecology,,,,,,A05 Biological Sciences,14 Life Below Water,interspecific interaction


In [104]:
df["text"] = df["Title"] + " " + df["Abstract"]
data = df.text.values.tolist()
data[:2]

["An Integrative Perspective On the Mechanistic Basis of Context- Dependent Species Interactions. It has long been known that the outcome of species interactions depends on the environmental context in which they occur. Climate change research has sparked a renewed interest in context-dependent species interactions because rapidly changing abiotic environments will cause species interactions to occur in novel contexts and researchers must incorporate this in their predictions of species' responses to climate change. Here, we argue that predicting how the environment will alter the outcome of species interactions requires an integrative biology approach that focuses on the traits, mechanisms, and processes that bridge disciplines such as physiology, biomechanics, ecology, and evolutionary biology. Specifically, we advocate for quantifying how species differ in their tolerance and performance to both environmental challenges independent of species interactions, and in interactions with o

## Preprocessing and cleaning

In [105]:
len(data)

13808

In [106]:
data[:2]

["An Integrative Perspective On the Mechanistic Basis of Context- Dependent Species Interactions. It has long been known that the outcome of species interactions depends on the environmental context in which they occur. Climate change research has sparked a renewed interest in context-dependent species interactions because rapidly changing abiotic environments will cause species interactions to occur in novel contexts and researchers must incorporate this in their predictions of species' responses to climate change. Here, we argue that predicting how the environment will alter the outcome of species interactions requires an integrative biology approach that focuses on the traits, mechanisms, and processes that bridge disciplines such as physiology, biomechanics, ecology, and evolutionary biology. Specifically, we advocate for quantifying how species differ in their tolerance and performance to both environmental challenges independent of species interactions, and in interactions with o

In [107]:
# Remove new line characters
data = [re.sub(r'\s+', ' ', str(i)) for i in data]

In [108]:
data = [re.sub("'", "", i) for i in data]

In [109]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True, min_len=4))

In [110]:
data_words = list(sent_to_words(data))

In [111]:
print(data_words[1])

['species', 'interactions', 'limit', 'predictability', 'community', 'responses', 'environmental', 'change', 'predicting', 'ecological', 'communities', 'will', 'respond', 'environmental', 'change', 'challenging', 'highly', 'relevant', 'this', 'global', 'change', 'ecologists', 'commonly', 'current', 'spatial', 'relationships', 'between', 'species', 'environmental', 'conditions', 'make', 'predictions', 'about', 'future', 'this', 'assumes', 'that', 'species', 'will', 'track', 'conditions', 'shifting', 'their', 'distributions', 'however', 'theory', 'experimental', 'evidence', 'suggest', 'that', 'species', 'interactions', 'prevent', 'communities', 'from', 'predictably', 'tracking', 'temporal', 'changes', 'environmental', 'conditions', 'basis', 'current', 'spatial', 'relationships', 'between', 'species', 'environmental', 'gradients', 'tested', 'this', 'hypothesis', 'assessing', 'dynamics', 'protist', 'species', 'replicated', 'patch', 'microcosm', 'landscapes', 'that', 'experienced', 'differen

In [112]:
with open('stopwords.txt', 'r') as f:
    custom_stop_words = f.read().split("\n")
custom_stop_words[:15]

['also',
 'thus',
 'depends',
 'plant',
 'effect',
 'population',
 'study',
 'habitat',
 'biotic',
 'species',
 'ecological',
 'community',
 'interspecific',
 'inter-specific',
 'biological']

In [113]:
stop_words = nltk.corpus.stopwords.words("english")
def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [114]:
def remove_common_words(texts, common_words):
    return [[word for word in doc if word not in common_words] for doc in texts]

In [115]:
import spacy_fastlang
# Use Fastlang to remove non-english publications
nlp = spacy.load("en_core_web_lg", disable=['ner', 'parser', 'senter'])
nlp.add_pipe('language_detector', last=True)



<spacy_fastlang.LanguageDetector at 0x7f24f62d8880>

In [116]:
def lemmatization(texts):
    texts_out = []
    rm_index = []
    for i, sent in enumerate(texts):
        doc = nlp(" ".join(sent))
        if doc._.language != 'en':                
            rm_index.append(i)
            continue
        texts_out.append([token.lemma_ for token in doc])
    return (texts_out,rm_index)

In [117]:
def pos_tagger_filter(texts, allowed_postags=['NOUN']):
    texts_out = []
    for i, sent in enumerate(texts):
        doc = nlp(" ".join(sent), disable=['lemmatizer', 'language_detector'])        
        texts_out.append([token.text for token in doc if allowed_postags is None or token.pos_ in allowed_postags])
    return texts_out

In [118]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words, stop_words)
data_words_nostops = remove_common_words(data_words_nostops, custom_stop_words)
data_words_nostops[1]

['predictability',
 'responses',
 'environmental',
 'predicting',
 'communities',
 'respond',
 'environmental',
 'challenging',
 'highly',
 'relevant',
 'global',
 'ecologists',
 'commonly',
 'current',
 'spatial',
 'relationships',
 'environmental',
 'conditions',
 'make',
 'predictions',
 'assumes',
 'conditions',
 'shifting',
 'distributions',
 'however',
 'experimental',
 'suggest',
 'prevent',
 'communities',
 'predictably',
 'tracking',
 'temporal',
 'changes',
 'environmental',
 'conditions',
 'basis',
 'current',
 'spatial',
 'relationships',
 'environmental',
 'gradients',
 'tested',
 'assessing',
 'dynamics',
 'protist',
 'replicated',
 'patch',
 'microcosm',
 'landscapes',
 'experienced',
 'different',
 'regimes',
 'spatial',
 'temporal',
 'environmental',
 'heterogeneity',
 'light',
 'dark',
 'populations',
 'kept',
 'monocultures',
 'polycultures',
 'assess',
 'monocultures',
 'abundances',
 'predictable',
 'basis',
 'current',
 'environmental',
 'conditions',
 'regardless

In [119]:
# Lemmatization before generate n-grams
data_lemmatized, rm_index = lemmatization(data_words_nostops)
len(data_lemmatized)

13701

In [120]:
df = df.drop(rm_index)
df.shape

(13701, 56)

In [121]:
df.to_csv("data/dimensions/publications-ecology-filtered-13701.csv")

In [122]:
# Generate bi-grams from lemmas
bigram = gensim.models.Phrases(data_lemmatized, min_count=5,threshold=1) #-1, scoring="npmi")
print(bigram)

Phrases<903340 vocab, min_count=5, threshold=1, max_vocab_size=40000000>


In [123]:
bigram_mod = gensim.models.phrases.Phraser(bigram)
print(bigram_mod[data_lemmatized[1]])

['predictability', 'response_environmental', 'predict', 'community_respond', 'environmental_challenge', 'highly_relevant', 'global', 'ecologist', 'commonly', 'current', 'spatial', 'relationship_environmental', 'condition', 'make_prediction', 'assume', 'condition', 'shift_distribution', 'however_experimental', 'suggest', 'prevent', 'community', 'predictably', 'track', 'temporal_change', 'environmental_condition', 'basis', 'current', 'spatial', 'relationship_environmental', 'gradient_test', 'assess', 'dynamic', 'protist', 'replicate', 'patch', 'microcosm', 'landscape', 'experience_different', 'regime', 'spatial_temporal', 'environmental_heterogeneity', 'light_dark', 'population', 'keep', 'monoculture', 'polyculture', 'assess', 'monoculture', 'abundance', 'predictable', 'basis', 'current', 'environmental_condition', 'regardless_whether', 'population_experience', 'temporal', 'environmental', 'polyculture', 'abundance', 'depend_environmental', 'condition_experience', 'suggest', 'community_r

In [124]:
trigram = gensim.models.Phrases(bigram[data_lemmatized], threshold=1, min_count=5)
print(trigram)

Phrases<1184569 vocab, min_count=5, threshold=1, max_vocab_size=40000000>


In [125]:
trigram_mod = gensim.models.phrases.Phraser(trigram)
print(trigram_mod[bigram_mod[data_lemmatized[1]]])

['predictability', 'response_environmental', 'predict', 'community_respond', 'environmental_challenge', 'highly_relevant', 'global', 'ecologist', 'commonly', 'current', 'spatial', 'relationship_environmental', 'condition', 'make_prediction', 'assume', 'condition', 'shift_distribution', 'however_experimental', 'suggest', 'prevent', 'community', 'predictably', 'track', 'temporal_change', 'environmental_condition', 'basis', 'current', 'spatial', 'relationship_environmental', 'gradient_test', 'assess', 'dynamic', 'protist', 'replicate', 'patch', 'microcosm', 'landscape', 'experience_different', 'regime', 'spatial_temporal', 'environmental_heterogeneity', 'light_dark', 'population', 'keep', 'monoculture', 'polyculture', 'assess', 'monoculture', 'abundance', 'predictable', 'basis', 'current', 'environmental_condition', 'regardless_whether', 'population_experience', 'temporal', 'environmental', 'polyculture', 'abundance', 'depend_environmental', 'condition_experience', 'suggest', 'community_r

In [126]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [127]:
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [128]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_lemmatized)
data_words_bigrams[:1][:10]

[['integrative',
  'mechanistic_basis',
  'context_dependent',
  'long_know',
  'outcome',
  'environmental_context',
  'occur',
  'climate',
  'spark',
  'renew',
  'context_dependent',
  'rapidly_change',
  'abiotic_environment',
  'cause',
  'occur',
  'novel',
  'context',
  'researcher',
  'must_incorporate',
  'prediction_response',
  'climate',
  'argue',
  'predict',
  'environment',
  'outcome',
  'require',
  'integrative',
  'biology',
  'focus',
  'trait',
  'mechanism',
  'process',
  'bridge',
  'discipline',
  'physiology',
  'biomechanic',
  'ecology_evolutionary',
  'biology',
  'specifically',
  'advocate',
  'quantifying',
  'differ',
  'tolerance_environmental',
  'challenge',
  'independent',
  'function',
  'environment',
  'increase',
  'mechanism_underlie',
  'outcome',
  'across_different',
  'environmental_context',
  'help_determine',
  'outcome',
  'affect',
  'relative_abundance',
  'distribution',
  'interact',
  'nature',
  'general',
  'emerge',
  'unabl

In [129]:
# Form Trigrams
data_words_trigrams = make_trigrams(data_words_bigrams)
data_words_trigrams[:1][:10]

[['integrative',
  'mechanistic_basis',
  'context_dependent',
  'long_know',
  'outcome',
  'environmental_context',
  'occur',
  'climate',
  'spark',
  'renew',
  'context_dependent',
  'rapidly_change',
  'abiotic_environment',
  'cause',
  'occur',
  'novel',
  'context',
  'researcher',
  'must_incorporate',
  'prediction_response_climate',
  'argue',
  'predict',
  'environment',
  'outcome',
  'require',
  'integrative',
  'biology',
  'focus',
  'trait',
  'mechanism',
  'process',
  'bridge',
  'discipline',
  'physiology',
  'biomechanic',
  'ecology_evolutionary_biology',
  'specifically',
  'advocate',
  'quantifying',
  'differ',
  'tolerance_environmental',
  'challenge',
  'independent',
  'function',
  'environment',
  'increase',
  'mechanism_underlie',
  'outcome',
  'across_different',
  'environmental_context',
  'help_determine',
  'outcome',
  'affect',
  'relative_abundance',
  'distribution',
  'interact',
  'nature',
  'general',
  'emerge',
  'unable',
  'mai

In [130]:
data_postag_filtered = pos_tagger_filter(data_words_trigrams, allowed_postags=["NOUN","ADJ"])

In [131]:
data_postag_filtered[1][:10]

['predictability',
 'response_environmental',
 'global',
 'ecologist',
 'current',
 'spatial',
 'condition',
 'shift_distribution',
 'community',
 'temporal_change']

In [132]:
data_final = remove_common_words(data_postag_filtered, custom_stop_words)
#data_lemmatized_nocommon=data_words_trigrams

In [133]:
print(data_final[1])

['predictability', 'response_environmental', 'global', 'ecologist', 'current', 'spatial', 'shift_distribution', 'temporal_change', 'environmental_condition', 'basis', 'current', 'spatial', 'relationship_environmental', 'dynamic', 'protist', 'patch', 'microcosm', 'landscape', 'regime', 'spatial_temporal', 'light_dark', 'monoculture', 'polyculture', 'monoculture', 'abundance', 'predictable', 'basis', 'current', 'environmental_condition', 'population_experience', 'temporal', 'environmental', 'polyculture', 'abundance', 'depend_environmental', 'community_respond', 'spatial', 'temporal', 'environmental_change', 'likely', 'prediction', 'base_current', 'spatial_relationship', 'environment']


In [142]:
# LDA
id2word = corpora.Dictionary(data_final)
print(id2word)

Dictionary<42894 unique tokens: ['abiotic_environment', 'across_environment', 'biology', 'bridge', 'cause']...>


In [135]:
texts  = data_final

## Save dictionary and texts

In [136]:
# Save dictionary
id2word.save('./data/m-uniq/id2word')
with open("./data/m-uniq/texts.pkl","wb") as f:
    pickle.dump(texts,f)

## Create BOW and Corpus

In [137]:
min_freq = 0.01*len(texts)
print(f"Len: {len(texts)} Min freq: {min_freq}")

Len: 13701 Min freq: 137.01


In [138]:
id2word.filter_extremes(no_below=min_freq,no_above=0.9, keep_n=None)
print(id2word)

Dictionary<690 unique tokens: ['biology', 'cause', 'climate', 'context', 'distribution']...>


In [139]:
id2word.save('./data/m-uniq/id2word-extremes')
print(id2word)

Dictionary<690 unique tokens: ['biology', 'cause', 'climate', 'context', 'distribution']...>


In [46]:
corpus = [id2word.doc2bow(text) for text in texts]

In [141]:
print(f'Number of unique tokens: {len(id2word)}')
print(f'Number of documents: {len(corpus)}')

Number of unique tokens: 690


NameError: name 'corpus' is not defined

In [None]:
def fit_lda(corpus, texts, id2word, k, a, b, corpus_test):
    lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=k,
                                                random_state=1234,
                                                iterations=50,
                                                chunksize=10000,
                                                passes=1,
                                                alpha=a,
                                                eta=b,                                                        
                                                per_word_topics=True)
    cv = CoherenceModel(model=lda_model, texts=texts,corpus=corpus, dictionary=id2word, coherence='c_v')
    umass = CoherenceModel(model=lda_model, texts=texts,corpus=corpus, dictionary=id2word, coherence='u_mass')
    perp = lda_model.log_perplexity(corpus_test)
    return (cv.get_coherence(),umass.get_coherence, perp)

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
!pip install --upgrade tqdm

In [None]:
import tqdm

In [None]:
# Topics range
topics_range = range(2,50,2)

In [None]:
# Alpha parameter
alpha = [0.05,0.1,0.5,1.5,10]
alpha.append("symmetric")

In [None]:
# Beta parameter
beta = [0.05,0.1,0.5,1.5,10]
beta.append("symmetric")

In [None]:
# Validation set
models_results = {
    'Topics': [],
    'Alpha': [],
    'Beta': [],
    'CV': [],
    'UMASS': [],
    'PERP': []
}

In [None]:
# Train documents
train_docs=df.groupby("GROUP").sample(frac=0.75,random_state=200)
idx = train_docs.index.to_numpy()

In [None]:
train_texts = [texts[i] for i in idx]
len(train_texts)

In [None]:
train_corpus = [id2word.doc2bow(text) for text in train_texts]

In [None]:
# Test documents
test_docs=df.drop(train_docs.index)
len(test_docs)

In [None]:
test_texts = [texts[i] for i in range(0,len(texts)) if i not in idx]
len(test_texts)

In [None]:
text_corpus = [id2word.doc2bow(text) for text in test_texts]
len(text_corpus)

In [None]:
pbar = tqdm.tqdm(total=len(topics_range)*len(alpha)*len(beta))

In [None]:
for k in topics_range:
    for a in alpha:
        for b in beta:                        
            (cv,umass,perp) = fit_lda(corpus=corpus,texts=texts, id2word=id2word,k=k,a=a,b=b)            
            models_results['Topics'].append(k)
            models_results['Alpha'].append(a)
            models_results['Beta'].append(b)
            models_results['CV'].append(cv)
            models_results['UMASS'].append(umass)
            models_results['PERP'].append(perp)

            pbar.update(1)

In [None]:
pd.DataFrame(models_results).to_csv('./data/lda_tuning_test.csv', index=False)
pbar.close()