In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Load the faculty publication table, using the saved 'Unnamed: 0' column as
# the row index.
data = pd.read_csv(
    './textmining/New_Data/final_hdsi_faculty_updated.csv',
    index_col='Unnamed: 0',
)

In [4]:
# Show the last ten rows of the table
data.tail(10)

Unnamed: 0,year,authors,title,abstract,times_cited,concepts,journal.title,HDSI_author
2184,2016,[{'raw_affiliation': ['Scripps Institution of ...,Information leverage in interconnected ecosyst...,"In ecological analysis, complexity has been re...",59,"['multiview embedding', 'curse of dimensionali...",Science,George Sugihara
2185,2016,[{'raw_affiliation': ['Department of Mathemati...,DYNAMICAL EVIDENCE FOR CAUSALITY BETWEEN GALAC...,,0,"['evidence', 'causality', 'RAYS', 'temperature...",,George Sugihara
2186,2016,[{'raw_affiliation': ['Institute of Integrativ...,Elevated nonlinearity as indicator of transiti...,Abstract\n Ecosystems may exper...,0,"['risk of extinction', 'fish stocks', 'stable ...",bioRxiv,George Sugihara
2187,2016,[{'raw_affiliation': ['Department of Geriatric...,Slowing Down of Recovery as Generic Risk Marke...,OBJECTIVE: We propose a novel paradigm to pred...,57,"['critical care medicine', 'chronic diseases',...",Critical Care Medicine,George Sugihara
2188,2016,[{'raw_affiliation': ['Scripps Institution of ...,Tracking and forecasting ecosystem interaction...,Evidence shows that species interactions are n...,111,"['marine mesocosm experiment', 'species intera...",Proceedings of the Royal Society B,George Sugihara
2189,2018,"['Justin Eldridge', 'Mikhail Belkin', 'Yusu Wa...",Unperturbed: spectral analysis beyond Davis-Kahan,"['weyl', 'algorithm', 'typical', 'davis', 'unp...",50,"['Perturbation theory', 'Eigenvalues and eigen...",,Justin Eldridge
2190,2015,"['Justin Eldridge', 'Mikhail Belkin', 'Yusu Wa...",Beyond Hartigan Consistency: Merge Distortion ...,"['correct', 'imply', 'single', 'nesting', 'two...",26,"['Hierarchical clustering', 'Cluster analysis'...",,Justin Eldridge
2191,2016,"['Justin Eldridge', 'Mikhail Belkin', 'Yusu Wa...","Graphons, mergeons, and so on!","['correct', 'assumption', 'algorithm', 'produc...",13,"['Clustering coefficient', 'Cluster analysis',...",,Justin Eldridge
2192,2014,"['Justin Eldridge', 'Alison E Lane', 'Mikhail ...",Robust features for the automatic identificati...,"['hyper', 'recordings', 'noise', 'means', 'tak...",26,"['Electroencephalography', 'Autism spectrum di...",journal of neurodevelopmental disorders,Justin Eldridge
2193,2013,['Aaron McMillan Fraenkel'],Extensions of Poisson Structures on Singular H...,"['characterize', 'extensions', 'pi', 'singular...",1,"['Isolated singularity', 'Koszul complex', 'Id...",arxiv symplectic geometry,Aaron Fraenkel


In [5]:
# Strip the square brackets left over from list-formatted abstracts
# (e.g. "['weyl', 'algorithm', ...]"); non-string values such as NaN pass
# through unchanged.
# The pattern is a raw string so "\[" and "\]" reach the regex engine instead
# of being treated as (invalid) string escape sequences.
data['abstract'] = data['abstract'].apply(
    lambda x: re.sub(r'[\[\]]', '', x) if isinstance(x, str) else x
)

In [6]:
# Build one document per (author, year) by concatenating that group's abstracts.
# The abstracts are joined with a single space so the last word of one paper
# does not fuse with the first word of the next (summing strings concatenates
# them with no separator). Rows without an abstract are dropped first, so an
# author/year group that has no text at all simply does not appear.
agg_by_year = (
    data.dropna(subset=['abstract'])
        .groupby(['HDSI_author', 'year'])['abstract']
        .agg(' '.join)
        .to_frame()
)

In [7]:
# Number of distinct values in the HDSI_author column
len(pd.unique(data['HDSI_author']))

51

In [8]:
# Keep only the author/year rows whose aggregated abstract is not the number 0.
# NOTE(review): presumably this targets author/year groups that ended up with
# no abstract text (a numeric 0 instead of a string) — confirm.
agg_by_year = agg_by_year[agg_by_year['abstract']!=0]

### Gensim

In [9]:
# Normalise the aggregated abstracts: remove the characters , . ! ? and
# lower-case the text. The character class is written as a raw string; inside
# [...] the dot is literal, so no backslash escape is needed.
agg_by_year['abstract'] = (
    agg_by_year['abstract']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)
# Display the aggregated author/year documents
agg_by_year

Unnamed: 0_level_0,Unnamed: 1_level_0,abstract
HDSI_author,year,Unnamed: 2_level_1
Aaron Fraenkel,2013,'characterize' 'extensions' 'pi' 'singular' 'a...
Albert Hsiao,2016,while early and intermediate results of fontan...
Albert Hsiao,2017,objective: we report here an initial experienc...
Albert Hsiao,2018,purposewith the hypothesis that 4d flow can be...
Albert Hsiao,2019,we present a case of a 20-year-old man with te...
...,...,...
Zhiting Hu,2016,deep kernel learning combines the non-parametr...
Zhiting Hu,2017,the recently developed variational autoencoder...
Zhiting Hu,2018,sequence prediction models can be learned from...
Zhiting Hu,2019,manipulating data such as weighting data examp...


In [10]:
import tokenizer as tokenizer

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string
import re

def CleanText(data):
    """Tokenise one document and return its cleaned, lemmatised words.

    Processing steps, in order:
      1. remove parenthesised spans that contain no nested parentheses;
      2. delete the literal markers 'REFEND', 'REF', 'EQL' and 'FIG';
      3. tokenise with ``tokenizer.PlainTokenizer`` and keep only tokens made
         entirely of ASCII letters, digits and hyphens;
      4. drop English stop words (the NLTK list plus 'et', 'al', 'use',
         'using', 'used');
      5. lemmatise each remaining token with ``WordNetLemmatizer`` (no part of
         speech is passed);
      6. drop lemmas that are purely digits or a single character long.

    Args:
        data: text of one document (a ``str``).

    Returns:
        list of the surviving lemmas, in their original order.
    """
    # basic tokenizer
    B_tokenizer = tokenizer.PlainTokenizer()

    # set stop words
    stop_words = set(stopwords.words('english')) | {'et', 'al', 'use', 'using', 'used'}
    lemmatizer = WordNetLemmatizer()

    # Raw-string patterns: "\-" is a regex escape, not a Python string escape.
    word_pattern = re.compile(r"^[A-Za-z0-9\-]+$")
    number_pattern = re.compile(r"^[0-9]+$")

    data = re.sub(r'\([^()]*\)', '', data)
    for tag in ['REFEND', 'REF', 'EQL', 'FIG']:
        data = data.replace(tag, '')

    words = []
    for token in B_tokenizer.tokenize(data):
        # Token-level filters are applied before lemmatisation ...
        if not word_pattern.match(token) or token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # ... and the numeric / length filters after it.
        if number_pattern.match(lemma) or len(lemma) == 1:
            continue
        words.append(lemma)

    return words

In [35]:
# Turn every aggregated author/year document into its list of cleaned tokens.
data = agg_by_year['abstract'].tolist()
data_words = [CleanText(document) for document in data]

In [36]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Terms whose idf falls below the q-quantile of all idf values are treated as
# corpus-specific stop words and removed; q <= 0 disables the step.
q = 0.005
if q > 0:
    corpus = [' '.join(d) for d in data_words]
    # Sorted so the vocabulary order does not depend on set iteration order.
    vocabulary = sorted(set(' '.join(corpus).split()))
    pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),('tfid', TfidfTransformer())]).fit(corpus)
    ser = pd.Series(index = vocabulary, data = pipe['tfid'].idf_)

    # create stop words list (lowest idf first)
    stops = ser[ser<ser.quantile(q)].sort_values().index.tolist()
    # A set gives constant-time membership tests in the filter below.
    stop_set = set(stops)

    # update data_words
    data_words_cleaned = [[w for w in d if w not in stop_set] for d in data_words]
    data_words = data_words_cleaned

In [37]:
import gensim.corpora as corpora
# Build the gensim dictionary from the tokenised documents
id2word = corpora.Dictionary(data_words)
# The tokenised documents are the texts to convert
texts = data_words
# Convert each document with doc2bow
corpus = list(map(id2word.doc2bow, texts))
# Peek at the first 30 entries of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]


In [64]:
import gensim
from pprint import pprint
# number of topics
num_topics = 10
# Build LDA model; the fixed seed makes the fitted topics repeatable between runs
lda_model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=id2word,
                                   num_topics=num_topics,
                                   random_state=123)
# Print the top keywords of every topic (as many topics as were fitted)
pprint(lda_model.print_topics(num_topics=num_topics))
doc_lda = lda_model[corpus]

[(0,
  '0.004*"microbiome" + 0.004*"gut" + 0.004*"patient" + 0.003*"disease" + '
  '0.003*"microbial" + 0.003*"associated" + 0.002*"diversity" + 0.002*"cell" + '
  '0.002*"community" + 0.002*"individual"'),
 (1,
  '0.004*"microbiome" + 0.004*"gene" + 0.004*"microbial" + 0.003*"gut" + '
  '0.003*"cell" + 0.003*"sequencing" + 0.003*"patient" + 0.002*"disease" + '
  '0.002*"individual" + 0.002*"genome"'),
 (2,
  '0.003*"microbiome" + 0.003*"associated" + 0.003*"cell" + 0.003*"neural" + '
  '0.003*"patient" + 0.003*"gut" + 0.002*"gene" + 0.002*"effect" + '
  '0.002*"disease" + 0.002*"microbial"'),
 (3,
  '0.004*"microbiome" + 0.003*"patient" + 0.003*"microbial" + 0.003*"cell" + '
  '0.002*"associated" + 0.002*"gene" + 0.002*"community" + 0.002*"test" + '
  '0.002*"brain" + 0.002*"disease"'),
 (4,
  '0.005*"microbiome" + 0.004*"gut" + 0.003*"microbial" + 0.003*"community" + '
  '0.003*"neural" + 0.002*"sequencing" + 0.002*"associated" + '
  '0.002*"individual" + 0.002*"brain" + 0.002*"activ

### Sklearn

In [73]:
import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(123)
import pickle
import nltk
nltk.download('wordnet')

# Reload the publication table and replace missing values with empty strings.
data = pd.read_csv('./textmining/New_Data/final_hdsi_faculty_updated.csv', index_col='Unnamed: 0')
data = data.fillna('')

# Tokens that preprocess_abstract drops in addition to the gensim stop words.
redundant = ['abstract', 'purpose', 'paper', 'goal']
# NOTE(review): `stemmer` is not referenced by any cell visible here — confirm it is still needed.
stemmer = PorterStemmer()

def lemmatize_stemming(text):
    """Return the WordNet lemma of ``text`` looked up as a verb (``pos='v'``).

    Only lemmatisation is performed here; no stemmer is called.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(text, pos='v')
def preprocess_abstract(text):
    """Return the cleaned abstract as one space-separated string.

    Tokens come from ``gensim.utils.simple_preprocess``; a token is kept when it
    is not a gensim stop word, is longer than three characters and is not in
    ``redundant``. Every kept token is passed through ``lemmatize_stemming``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    kept = [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stop_words and len(token) > 3 and token not in redundant
    ]
    return " ".join(kept)


# Store the cleaned version of every abstract next to the raw text
data['abstract_processed'] = data['abstract'].map(preprocess_abstract)

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package wordnet to /Users/duxiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [74]:
# NOTE(review): 'abstract_processed' is already computed with this same
# expression at the end of the previous cell, so this line repeats that work.
data['abstract_processed'] = data['abstract'].apply(preprocess_abstract)
# Count matrix over the individual processed abstracts (one row per table row).
# NOTE(review): `counts` is reassigned from all_docs in a later cell before any
# visible use — confirm this assignment is still needed.
counts = CountVectorizer().fit_transform(data['abstract_processed'])

In [75]:
# Cast the year to int, then keep only publications from 2016 onwards
data['year'] = data['year'].astype(int)
recent_mask = data['year'] > 2015
data = data[recent_mask]

In [76]:
# organize each author's processed abstracts by publication year:
# authors[name][year] -> list of processed abstracts
authors = {
    author: {year: list() for year in (2016, 2017, 2018, 2019, 2020, 2021)}
    for author in data['HDSI_author'].unique()
}
for author, year, abstract in zip(data['HDSI_author'], data['year'], data['abstract_processed']):
    authors[author][year].append(abstract)

In [16]:
# One document per (author, year), in the iteration order of `authors`:
# all of that author's processed abstracts for the year joined by spaces.
all_docs = [
    " ".join(documents)
    for author_dict in authors.values()
    for documents in author_dict.values()
]

In [17]:
# Build the count matrix for the author/year documents
countVec = CountVectorizer()
counts = countVec.fit_transform(all_docs)
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() (available since 1.0) and fall back on older releases.
if hasattr(countVec, 'get_feature_names_out'):
    names = list(countVec.get_feature_names_out())
else:
    names = countVec.get_feature_names()

In [18]:
# 30-topic LDA model (n_components=30), seeded with random_state=123
modeller = LatentDirichletAllocation(n_components=30, n_jobs=-1, random_state=123)
# Fit on the count matrix and keep the transformed output for the later cells
result = modeller.fit_transform(counts)

# display top words for each topic in the model
def display_topics(model, feature_names, no_top_words):
    """Print the strongest features of every topic.

    For each row of ``model.components_`` a "Topic <n>:" line is printed,
    followed by a line with the ``no_top_words`` highest-weighted entries of
    ``feature_names``, strongest first, separated by spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort, reversed, then truncated = indices of the top weights
        strongest = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in strongest]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_words))
display_topics(modeller, names, 10)

Topic 0:
ϕλϕμ fatten fibroid fibrogenic fibrogenesis fibroblasts fibroblast fibro fibrinolytic fibrillation
Topic 1:
signal relapse black hole observe science theory individuals gravitational ligo
Topic 2:
network mechanisms disease muscle cycle variability biological study chromatin cell
Topic 3:
model generate dialog music task generation persona dataset datasets language
Topic 4:
parameters random number stochastic algorithm circuit small design sample methods
Topic 5:
seed document text network entities train model semantic module facet
Topic 6:
cells data cell sequence response analysis single study forecast genes
Topic 7:
model neural network brain data dynamics activity learn time neurons
Topic 8:
model causal time prediction recruitment face human problem information fairness
Topic 9:
data patients model clinical study result risk time methods cancer
Topic 10:
cyclopeptides human fandom variants sequence cyclonovo covid consult antibiotics substitutions
Topic 11:
microbiome mic

In [63]:
# time-author-topic dataframe
# one row per document in all_docs (one document per author and year)
# one column per topic, plus the dominant topic of each document


# Topic Matrix => result

# column names: derived from the width of `result` so they always match the
# number of topics the model was fitted with
topicnames = ["Topic" + str(i) for i in range(result.shape[1])]

# index names
docnames = ["Doc" + str(i) for i in range(len(all_docs))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(result, columns=topicnames, index=docnames)

# Get dominant topic for each document (column position of the largest weight;
# ties resolve to the first such column)
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic


In [20]:
df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,dominant_topic
Doc0,0.000126,0.000126,0.000126,0.000126,0.000126,0.000126,0.000126,0.593200,0.000126,0.000126,...,0.000126,0.000126,0.000126,0.000126,0.000126,0.000126,0.000126,0.132488,0.000126,7
Doc1,0.000029,0.000029,0.000029,0.000029,0.000029,0.000029,0.000029,0.400168,0.000029,0.000029,...,0.000029,0.000029,0.000029,0.000029,0.000029,0.000029,0.000029,0.098097,0.061526,7
Doc2,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,...,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,0.000036,20
Doc3,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.322880,0.000038,0.000038,...,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.252439,0.000038,7
Doc4,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051,0.388859,0.000051,0.000051,...,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051,0.118619,0.000051,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc295,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0
Doc296,0.000694,0.118566,0.000694,0.000694,0.000694,0.000694,0.000694,0.259326,0.000694,0.000694,...,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,7
Doc297,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0
Doc298,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0


In [64]:
# add author and year
# all_docs was built by walking `authors` (author -> {year: abstracts}) in
# insertion order, so walking the same mapping again yields the (author, year)
# of every document row, in row order. This avoids hard-coded column positions
# and a hard-coded number of years per author.
doc_keys = [(author, year) for author, by_year in authors.items() for year in by_year]
if len(doc_keys) != len(df_document_topic):
    raise ValueError(
        "expected %d (author, year) pairs but the topic matrix has %d rows"
        % (len(doc_keys), len(df_document_topic))
    )

author_list = list(authors.keys())
year = [doc_year for _author, doc_year in doc_keys]

# Assign by column name; the columns are created directly with their final
# values instead of being pre-filled with NaN and overwritten by position.
df_document_topic['author'] = [doc_author for doc_author, _year in doc_keys]
df_document_topic['year'] = year

time_author_topic = df_document_topic
# Make sure the output directory exists before writing
os.makedirs('./Data', exist_ok=True)
time_author_topic.to_csv('./Data/time_author_topic.csv')

In [60]:
time_author_topic[-10:]

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,dominant_topic,author,year
Doc290,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,0.997725,7.8e-05,...,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,7.8e-05,8,George Sugihara,2018
Doc291,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,0.594422,0.221129,0.130271,4.5e-05,...,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,6,George Sugihara,2019
Doc292,3.1e-05,3.1e-05,3.1e-05,3.1e-05,3.1e-05,3.1e-05,3.1e-05,0.529278,0.143412,3.1e-05,...,0.221771,3.1e-05,3.1e-05,3.1e-05,3.1e-05,0.104725,3.1e-05,7,George Sugihara,2020
Doc293,3.4e-05,3.4e-05,3.4e-05,3.4e-05,3.4e-05,3.4e-05,3.4e-05,0.631455,0.097619,3.4e-05,...,0.047275,3.4e-05,3.4e-05,3.4e-05,3.4e-05,0.205482,3.4e-05,7,George Sugihara,2021
Doc294,0.000794,0.000794,0.000794,0.000794,0.000794,0.000794,0.000794,0.000794,0.000794,0.000794,...,0.000794,0.000794,0.000794,0.000794,0.000794,0.000794,0.000794,13,Justin Eldridge,2016
Doc295,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0,Justin Eldridge,2017
Doc296,0.000694,0.118566,0.000694,0.000694,0.000694,0.000694,0.000694,0.259326,0.000694,0.000694,...,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,7,Justin Eldridge,2018
Doc297,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0,Justin Eldridge,2019
Doc298,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0,Justin Eldridge,2020
Doc299,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,...,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0,Justin Eldridge,2021


## Sankey

In [77]:
import pandas as pd

data2 = pd.read_csv('Data/time_author_topic.csv', index_col=0)


In [78]:
# Mean of every numeric column per author, keeping only the topic columns:
# how each author is related to each topic overall
author_means = data2.groupby('author').mean()
averaged = author_means.drop(columns=['dominant_topic', 'year'])

In [102]:
averaged

Unnamed: 0_level_0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Albert Hsiao,6.3e-05,6.3e-05,6.3e-05,6.3e-05,6.3e-05,6.3e-05,0.024312,6.3e-05,6.3e-05,0.041868,...,6.3e-05,6.3e-05,0.071758,0.096217,6.3e-05,6.3e-05,6.3e-05,6.3e-05,6.3e-05,6.3e-05
Alex Cloninger,0.005637,0.005637,0.005637,0.005637,0.005637,0.005637,0.005637,0.053563,0.005637,0.02867,...,0.005637,0.398486,0.005637,0.005637,0.005637,0.005637,0.005637,0.005637,0.023793,0.008946
Angela Yu,0.000117,0.165695,0.000117,0.000117,0.000117,0.000117,0.000117,0.116022,0.489219,0.000117,...,0.000117,0.000117,0.000117,0.000117,0.000117,0.166223,0.000117,0.000117,0.018337,0.000117
Armin Schwartzman,7.4e-05,7.4e-05,0.128389,7.4e-05,7.4e-05,7.4e-05,7.4e-05,7.4e-05,7.4e-05,0.006928,...,7.4e-05,7.4e-05,0.295368,0.022455,7.4e-05,7.4e-05,7.4e-05,7.4e-05,0.025801,7.4e-05
Arun Kumar,0.027794,0.027794,0.027794,0.027794,0.027794,0.027794,0.027794,0.027794,0.027794,0.027794,...,0.055168,0.027794,0.027794,0.027794,0.027794,0.027794,0.027794,0.027794,0.166605,0.027794
Arya Mazumdar,4.3e-05,4.3e-05,4.3e-05,4.3e-05,4.3e-05,4.3e-05,4.3e-05,0.005213,4.3e-05,4.3e-05,...,4.3e-05,4.3e-05,0.147942,4.3e-05,0.006398,4.3e-05,4.3e-05,4.3e-05,0.185101,4.3e-05
Babak Salimi,0.000191,0.000191,0.000191,0.000191,0.000191,0.000191,0.102365,0.000191,0.265581,0.000191,...,0.000191,0.008843,0.083188,0.005338,0.005407,0.000191,0.000191,0.000191,0.512765,0.009151
Barna Saha,0.000103,0.000103,0.000103,0.000103,0.000103,0.000103,0.000103,0.000103,0.174135,0.000103,...,0.028418,0.000103,0.000103,0.000103,0.357826,0.000103,0.000103,0.000103,0.069655,0.008258
Benjamin Smarr,0.000109,0.000109,0.235818,0.000109,0.000109,0.000109,0.000109,0.142976,0.000109,0.000109,...,0.000109,0.000109,0.000109,0.372221,0.067305,0.000109,0.000109,0.000109,0.086337,0.000109
Berk Ustun,0.011186,0.011186,0.011186,0.011186,0.011186,0.011186,0.011186,0.012015,0.0138,0.082743,...,0.011186,0.011186,0.294175,0.084745,0.176763,0.011186,0.011186,0.011186,0.077238,0.011186


In [138]:
# For every topic column keep only the author weights at or above that
# column's 95th percentile; everything else becomes 0 (= no link).
filtered = averaged.mask(averaged < averaged.quantile(.95), other=0)

# Topic nodes are numbered after the author nodes, so this offset must equal
# the number of author rows actually present rather than a hard-coded count.
NUM_AUTHORS = len(filtered.index)

# get labels, sources, targets, values prepared for developing sankey diagram
# (node labels: faculty names first, then topic column names)
labels = filtered.index.to_list() + filtered.columns.to_list()

sources = []
targets = []
values = [] # proportions
for author_pos, (author_name, row) in enumerate(filtered.iterrows()):
    for topic_pos, value in enumerate(row):
        if value != 0:
            sources.append(author_pos)                # author node
            targets.append(NUM_AUTHORS + topic_pos)   # topic node
            values.append(value)
# split those average value that representing the relationships into ranks [1, 10]
def split_into_ranks(array):
    """Map every value to its decile rank (1-10) within ``array``.

    A value gets rank ``r`` when ``r`` is the smallest integer for which the
    value is less than or equal to the ``r * 10``-th percentile of ``array``.

    The ten percentile thresholds are computed once up front instead of being
    recomputed for every (value, percentile) pair, and exactly one rank is
    returned per input value.

    Args:
        array: one-dimensional sequence or array of numbers.

    Returns:
        list of ints, one rank per input value, in input order; an empty list
        for empty input.
    """
    values = np.asarray(array, dtype=float)
    if values.size == 0:
        return []
    # Decile thresholds: 10th, 20th, ..., 100th percentile
    thresholds = np.quantile(values, np.linspace(0.1, 1.0, 10))
    # side='left' returns the first threshold position with value <= threshold
    ranks = np.searchsorted(thresholds, values, side='left') + 1
    # Guard against a value landing past the last threshold through rounding
    return np.minimum(ranks, 10).tolist()

import numpy as np
# Convert the link weights into ranks with split_into_ranks
values_array = np.asarray(values)
values_final = split_into_ranks(values_array)

In [123]:
# Refit the count vectoriser on the author/year documents
countVec = CountVectorizer()
counts = countVec.fit_transform(all_docs)
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() (available since 1.0) and fall back on older releases.
if hasattr(countVec, 'get_feature_names_out'):
    names = list(countVec.get_feature_names_out())
else:
    names = countVec.get_feature_names()

In [139]:
# get top words for topics
model = modeller
vectorizer = countVec

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() (available since 1.0) and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    names = list(vectorizer.get_feature_names_out())
else:
    names = vectorizer.get_feature_names()
def display_topics_list(model, feature_names, no_top_words):
    """Return one string per topic holding its strongest features.

    For each row of ``model.components_`` the ``no_top_words`` highest-weighted
    entries of ``feature_names`` are joined with spaces, strongest first.
    """
    def top_words(weights):
        # Ascending argsort, reversed, then truncated = indices of the top weights
        strongest = weights.argsort()[::-1][:no_top_words]
        return " ".join(feature_names[i] for i in strongest)

    return [top_words(weights) for weights in model.components_]
# Hover text per node: start from a copy of the node labels, then replace every
# entry from position NUM_AUTHORS onward with the top-10 words of each topic.
link_labels = labels.copy()
link_labels[NUM_AUTHORS:] = display_topics_list(model, names, 10)

In [140]:
# Author-topic connections Sankey diagram: nodes are taken from `labels`,
# links from `sources` / `targets`, link sizes from `values_final`.
import plotly.graph_objects as go

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels, # node names
      color = 'purple',
      customdata = link_labels, # text substituted for %{customdata} in the hover template
      hovertemplate='%{customdata} Total Flow: %{value}<extra></extra>'
    ),
    link = dict(
      source = sources, # positions in `labels` of each link's source node
      target = targets, # positions in `labels` of each link's target node
      value = values_final # flow volume
  ))])

fig.update_layout(title_text="Author Topic Connections", font_size=10, height=2000, paper_bgcolor="LightSteelBlue")
fig.show()
# Optional export of the figure to HTML (currently disabled):
# fig.write_html("visualization/Author_Topic_Connections.html")