In [None]:
from nltk.corpus import stopwords
import nltk
import matplotlib.cm as cm
import pandas as pd
import numpy as np
import spacy
import re
from matplotlib import pyplot as plt
import time
import json
import os
import glob
from tqdm import tqdm
from langdetect import detect
from nltk.corpus import stopwords
from langdetect import DetectorFactory
import string
import en_core_sci_lg
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
import faiss
import seaborn as sns
from txtai.pipeline.data import tokenizer

# Silence every Python warning for the rest of the session; note that this
# also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'ggplot' style sheet to all figures.
plt.style.use('ggplot')

In [None]:
# Load the raw corpus; later cells read its 'body_text' and 'abstract_summary' columns.
df = pd.read_csv('../Data/raw.csv')

In [None]:
# Fix the langdetect seed so repeated runs label each document identically.
DetectorFactory.seed = 0


def _detect_language(body_text, abstract, n_words=50):
    """Return a best-effort language code for one document.

    Three samples are tried in order; the first one langdetect can classify wins:
      1. the first ``n_words`` space-separated words of the body text,
      2. every distinct word of the body text (first-seen order, so the
         sample is identical from run to run),
      3. the abstract.

    Parameters
    ----------
    body_text : str or any
        Full text of the document. Non-string values (e.g. NaN for a missing
        cell) are treated as an empty body instead of raising.
    abstract : str or any
        Fallback text used when the body cannot be classified.
    n_words : int
        Number of leading words used for the first attempt.

    Returns
    -------
    str
        The code returned by ``detect``, or ``"unknown"`` when every attempt fails.
    """
    words = body_text.split(" ") if isinstance(body_text, str) else []
    # Samples are built lazily so the later joins only happen when needed.
    attempts = (
        lambda: " ".join(words[:n_words]),
        lambda: " ".join(dict.fromkeys(words)),
        lambda: abstract,
    )
    for build_sample in attempts:
        try:
            return detect(build_sample())
        except Exception:
            # Deliberately best-effort: any detection failure falls through
            # to the next, broader sample.
            continue
    return "unknown"


# One language label per row of df, in row order.
languages = [
    _detect_language(body_text, abstract)
    for body_text, abstract in tqdm(
        zip(df['body_text'], df['abstract_summary']), total=len(df)
    )
]

In [None]:
from collections import Counter

# Documents per detected language, counted in a single pass. most_common()
# orders the entries by descending count, so the printed dict and the bar
# chart built from it come out in the same order on every run.
languages_dict = dict(Counter(languages).most_common())

print("Total: {}\n".format(len(languages)))
print(languages_dict)

In [None]:
# Attach the detected language code to each row of df.
df['language'] = languages
# Bar chart: one bar per language, bar height = number of documents.
plt.bar(range(len(languages_dict)), list(languages_dict.values()), align='center')
# Label each bar position with its language code.
plt.xticks(range(len(languages_dict)), list(languages_dict.keys()))
plt.title("Distribution of Languages in Dataset")
plt.show()

In [None]:
# Keep only the rows labelled English, then discard the 'Unnamed: 0' column.
is_english = df['language'] == 'en'
df_en = df[is_english].drop(['Unnamed: 0'], axis=1)
df_en.info()

In [None]:
# df_en.to_csv('../Data/eng_only.csv')
# Reload the English-only subset from disk; this rebinds df_en, replacing the
# frame built in the previous cell.
df_en = pd.read_csv('../Data/eng_only.csv')

In [None]:
# Keep rows that have a real summary (not the "Not provided." placeholder)
# and a non-null abstract.
has_summary = df_en['abstract_summary'] != "Not provided."
has_abstract = df_en['abstract'].notnull()
# .copy(): later cells add columns to this frame, so make it independent of
# df_en rather than a slice of it.
df_en_abstract_only = df_en[has_summary & has_abstract].copy()
df_en_abstract_only.info()

In [None]:
# Publishing boilerplate that carries no topical signal.
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
    'al.', 'Elsevier', 'PMC', 'CZI'
]

# `nltk.corpus.stopwords` is a corpus reader, not a collection: it supports
# neither `in` nor `.add()`. Import it under an alias and materialise the
# English word list as a set, so this cell can be re-run safely and the
# tokenizer functions get O(1) membership tests on `stopwords`.
# NOTE(review): requires the NLTK 'stopwords' corpus to be installed locally
# (nltk.download('stopwords')) — confirm for the target environment.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))
stopwords.update(custom_stop_words)
# lemmatize_pipe and spacy_tokenizer lower-case tokens before the lookup, so
# the capitalised entries ('Elsevier', 'PMC', 'CZI') also need lower-case forms.
stopwords.update(word.lower() for word in custom_stop_words)

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline and append a 'sentencizer' component to it.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('sentencizer')
# Characters stripped from both ends of each lemma in lemmatize_pipe.
punctuation = string.punctuation

In [None]:
def lemmatize_pipe(doc):
    """Return the cleaned lemmas of the tokens in ``doc``.

    A token is kept only when it is purely alphabetic and its lower-cased
    text is not in the module-level ``stopwords``. Each kept token contributes
    its lemma, lower-cased and with leading/trailing ``punctuation`` removed.
    """
    lemmas = []
    for token in doc:
        if not token.is_alpha:
            continue
        if token.text.lower() in stopwords:
            continue
        lemmas.append(str(token.lemma_).lower().strip(punctuation))
    return lemmas

def preprocess_pipe(texts, batch_size=2):
    """Lemmatise every text with the shared ``nlp`` pipeline.

    Parameters
    ----------
    texts : iterable of str
        Documents to process, streamed through ``nlp.pipe``.
    batch_size : int
        Number of texts spaCy buffers per batch (default 2, as before).

    Returns
    -------
    list of list of str
        One ``lemmatize_pipe`` result per input text, in input order.
    """
    # Give tqdm a total when the input is sized so the bar shows real progress;
    # this replaces the former one-line-per-document print.
    try:
        total = len(texts)
    except TypeError:
        total = None  # unsized iterables (e.g. generators) get a bare counter
    return [
        lemmatize_pipe(doc)
        for doc in tqdm(nlp.pipe(texts, batch_size=batch_size), total=total)
    ]

In [None]:
# Register tqdm's pandas integration.
tqdm.pandas()
# Store the token lists returned by preprocess_pipe for each abstract in a new column.
df_en_abstract_only['self_processor'] = preprocess_pipe(df_en_abstract_only['abstract'])

In [None]:
# Parser
# Punctuation characters that spacy_tokenizer filters out of its token list.
punctuations = string.punctuation
# Load the en_core_sci_lg model with its "tagger" and "ner" components disabled.
# NOTE(review): whether lemmas are still reliable with the tagger disabled
# depends on the installed spaCy version — confirm.
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
# Raise the pipeline's text-length limit so very long inputs are accepted.
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    """Return ``sentence`` as one space-joined string of normalised tokens.

    Each token parsed by ``parser`` becomes its lower-cased, whitespace-stripped
    lemma, except when the lemma is the "-PRON-" placeholder, where the
    lower-cased surface form is used instead. Tokens found in ``stopwords`` or
    in the ``punctuations`` string are dropped.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ != "-PRON-":
            normalized = token.lemma_.lower().strip()
        else:
            normalized = token.lower_
        if normalized in stopwords or normalized in punctuations:
            continue
        kept.append(normalized)
    return " ".join(kept)

In [None]:
# Register tqdm's pandas integration so Series.progress_apply is available.
tqdm.pandas()
# Run spacy_tokenizer over every abstract and keep the cleaned string in a new column.
df_en_abstract_only["article_processor"] = df_en_abstract_only["abstract"].progress_apply(spacy_tokenizer)

In [None]:
# Preview the first rows, including the two processed-text columns added above.
df_en_abstract_only.head()

In [None]:
# df_en_abstract_only = df_en_abstract_only.drop(['Unnamed: 0'],axis=1)
# df_en_abstract_only.to_csv('../Data/article_proc.csv')

# Do not run anything above this point; resume from here by loading the saved CSV in the next cell.

In [None]:
# Load the preprocessed articles from disk; this rebinds df_en_abstract_only.
df_en_abstract_only = pd.read_csv('../Data/article_proc.csv')

In [None]:
def vectorize(text, maxx_features):
    """Fit a TF-IDF model on ``text`` and return the transformed matrix.

    ``maxx_features`` is passed to ``TfidfVectorizer`` as ``max_features``.
    The fitted vectorizer itself is discarded; only the matrix is returned.
    """
    tfidf = TfidfVectorizer(max_features=maxx_features)
    return tfidf.fit_transform(text)

In [None]:
# The raw abstract strings are the TF-IDF input.
# NOTE(review): the 'article_processor' / 'self_processor' columns built earlier
# are not used here — confirm that vectorising the unprocessed text is intended.
abstract = df_en_abstract_only['abstract'].values
# Feature cap handed to vectorize (2**12 = 4096).
max_features = 2**12

X = vectorize(abstract, max_features)

In [None]:
# Sanity check: shape and contents of the first row of the TF-IDF matrix.
print(X[0].shape)
print(X[0])

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans

In [None]:
# Cast to float32 while the matrix is still sparse, then densify. Densifying
# first would allocate a full-size dense float64 intermediate before the cast.
X_32 = X.astype('float32').todense()

In [None]:
# Convert the dense result to a plain C-contiguous float32 ndarray.
# `.toarray()` exists only on sparse matrices, not on np.matrix or ndarray,
# so it cannot be used here. This form is also safe to re-run on an ndarray.
X_32 = np.ascontiguousarray(X_32, dtype='float32')

In [None]:
from sklearn import metrics
from scipy.spatial.distance import cdist

# Run k-means for every k in K and record the distortion: the mean Euclidean
# distance from each sample to its nearest cluster centre.
distortions = []
K = range(2, 30)
for k in K:
    print('fitting clusters with {} clusters'.format(k))
    # Fit once per k. With a fixed random_state a second fit on the same data
    # only repeats the same work.
    k_means = KMeans(n_clusters=k, random_state=0,verbose=1,init='k-means++',max_iter=100).fit(X_32)
    distortions.append(sum(np.min(cdist(X_32, k_means.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
    print('Found distortion for {} clusters'.format(k))

In [None]:
# Elbow search with faiss k-means; n_init and max_iter are reused by later cells.
n_init = 5
max_iter = 100
distortions = []
K = range(10, 50)
for k in K:
    print('fitting clusters with {} clusters'.format(k))
    k_means = faiss.Kmeans(
        d=X_32.shape[1],
        k=k,
        niter=max_iter,
        nredo=n_init,
        gpu=True,
        verbose=True,
        seed=553602,
    )
    k_means.train(X_32)
    # Distance from every sample to its closest centroid, averaged over samples.
    nearest_centroid_dist = np.min(cdist(X_32, k_means.centroids, 'euclidean'), axis=1)
    distortions.append(sum(nearest_centroid_dist) / X.shape[0])
    print('Found distortion for {} clusters'.format(k))

In [None]:
# End points of a straight reference line from the first to the last (k, distortion) pair.
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]

# Plot the elbow
# Blue: distortion per k; red: the straight reference line defined above.
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Final clustering on the full TF-IDF features with k = 40.
# max_iter and n_init are the values set in the faiss elbow-search cell.
k = 40
k_means = faiss.Kmeans(d=X_32.shape[1], k=k, niter=max_iter, nredo=n_init,gpu=True,verbose=True,seed=553602)
k_means.train(X_32)
# The whole assign() result is kept; later cells read label[1] as the per-row cluster id.
label = k_means.assign(X_32)

In [None]:
# Store each row's cluster id (second element of the assign() result) on the frame.
df_en_abstract_only['cluster'] = label[1]

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: pin random_state (same seed as the k-means cells) so the
# embedding, and every figure drawn from it, is reproducible.
tsne = TSNE(verbose=1, perplexity=50, random_state=553602)  # Changed perplexity from 100 to 50 per FAQ
X_embedded = tsne.fit_transform(X_32)

In [None]:
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors
palette = sns.color_palette("bright", 1)

# plot
# x / y must be passed by keyword: current seaborn releases no longer accept
# them positionally.
# NOTE(review): without a `hue`, `palette` likely has no visible effect — confirm.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], palette=palette)
plt.title('t-SNE with no Labels')
plt.savefig("t-sne_covid19.png")
plt.show()

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize':(20,20)})

# colors
palette = sns.hls_palette(40, l=.4, s=.9)

# plot
sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=label[1], legend='full', palette=palette)
plt.title('t-SNE with Kmeans Labels')
plt.savefig("improved_cluster_tsne.png")
plt.show()

In [None]:
# Shapes of the two t-SNE coordinate columns (x, then y).
print(X_embedded[:,0].shape)
print(X_embedded[:,1].shape)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF features onto 50 SVD components with a fixed seed.
X_reduced = TruncatedSVD(n_components=50, random_state=553602).fit_transform(X_32)

In [None]:
# Repeat the k = 40 faiss clustering, this time on the SVD-reduced features.
k = 40
n_init = 5
max_iter = 100
# Resets the elbow-search list; nothing in this cell appends to it.
distortions = []
# NOTE(review): assumes X_reduced is in a dtype faiss accepts (float32) — confirm.
k_means = faiss.Kmeans(d=X_reduced.shape[1], k=k, niter=max_iter, nredo=n_init,gpu=True,verbose=True,seed=553602)
k_means.train(X_reduced)
# The whole assign() result is kept; later cells read label_reduced[1] as the cluster ids.
label_reduced = k_means.assign(X_reduced)

In [None]:
# Display the raw assign() result for inspection.
label_reduced

In [None]:
from sklearn.manifold import TSNE
# t-SNE is stochastic: pin random_state (same seed as the SVD and k-means
# cells) so the embedding of the reduced features is reproducible.
tsne_reduced = TSNE(verbose=1, perplexity=50, random_state=553602)  # Changed perplexity from 100 to 50 per FAQ
X_embedded_reduced = tsne_reduced.fit_transform(X_reduced)

In [None]:
# sns settings
sns.set(rc={'figure.figsize':(20,20)})

# colors: one hue per cluster (40 clusters)
palette = sns.hls_palette(40, l=.4, s=.9)

# plot
# x / y must be passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.scatterplot(x=X_embedded_reduced[:,0], y=X_embedded_reduced[:,1], hue=label_reduced[1], legend='full', palette=palette)
plt.title('t-SNE with Kmeans Labels')
plt.savefig("improved_cluster_tsne.png")
plt.show()

In [None]:
# Overwrite the cluster column with the ids from the SVD-reduced clustering.
df_en_abstract_only['cluster'] = label_reduced[1]

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Rebind `abstract` to the abstract column (a Series here, not the .values array used earlier).
abstract = df_en_abstract_only['abstract']

In [None]:
# Number of abstracts available for topic modelling.
len(abstract)

In [None]:
# One independent CountVectorizer per cluster (40 clusters), so each cluster
# gets its own vocabulary.
# token_pattern is a raw string: `\-` is not a valid Python string escape, and
# the non-raw form triggers an invalid-escape warning on newer interpreters.
# The regex itself is unchanged: tokens of three or more letters/hyphens.
vectorizers = [
    CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    for _ in range(40)
]

In [None]:
# Term-count matrix per cluster; None marks a cluster whose vectorizer could not be fitted.
vectorized_data = []

for current_cluster, cvec in tqdm(enumerate(vectorizers)):
    try:
        in_cluster = df_en_abstract_only['cluster'] == current_cluster
        cluster_matrix = cvec.fit_transform(df_en_abstract_only.loc[in_cluster, 'abstract'])
    except Exception:
        print("Not enough instances in cluster: " + str(current_cluster))
        cluster_matrix = None
    vectorized_data.append(cluster_matrix)

In [None]:
# number of topics per cluster
NUM_TOPICS_PER_CLUSTER = 10

# One unfitted Latent Dirichlet Allocation model per cluster (40 clusters),
# all configured identically.
lda_models = [
    LatentDirichletAllocation(
        n_components=NUM_TOPICS_PER_CLUSTER,
        max_iter=10,
        learning_method='online',
        verbose=True,
        random_state=42,
    )
    for _ in range(40)
]

lda_models[0]

In [None]:
# Document-topic matrices for the clusters that have count data. Clusters whose
# vectorized_data entry is None are skipped, so positions in this list do not
# line up with cluster ids once any cluster has been skipped.
clusters_lda_data = []

for current_cluster, lda in enumerate(lda_models):
    print("Current Cluster: " + str(current_cluster))

    cluster_counts = vectorized_data[current_cluster]
    if cluster_counts is None:
        continue
    clusters_lda_data.append(lda.fit_transform(cluster_counts))

In [None]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=5):
    """Return the distinct top keywords of a fitted topic model, strongest first.

    Parameters
    ----------
    model : object with ``components_``
        Fitted topic model; ``components_`` holds one row of per-term weights
        for each topic.
    vectorizer : object
        Fitted vectorizer whose feature names index the columns of
        ``components_``. It must provide ``get_feature_names_out()`` or the
        older ``get_feature_names()``.
    top_n : int
        Number of highest-weighted terms taken from each topic.

    Returns
    -------
    list of str
        Each keyword once, scored with the weight of the first topic that
        listed it, ordered by descending weight.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer the replacement and fall back for older installs.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    # Fetched once: building the feature list per keyword is needlessly expensive.
    feature_names = get_names()

    seen = set()
    keywords = []
    for topic in model.components_:
        # Indices of the top_n largest weights, largest first.
        for i in topic.argsort()[:-top_n - 1:-1]:
            word = feature_names[i]
            if word not in seen:
                seen.add(word)
                keywords.append((word, topic[i]))

    # Ascending stable sort followed by reverse (rather than reverse=True)
    # keeps the established ordering of equal-weight keywords.
    keywords.sort(key=lambda pair: pair[1])
    keywords.reverse()
    return [word for word, _ in keywords]

In [None]:
# Keyword list for every cluster that has count data; clusters whose
# vectorized_data entry is None contribute nothing to this list.
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
    print("Current Cluster: " + str(current_vectorizer))

    if vectorized_data[current_vectorizer] is None:
        continue
    all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))

In [None]:
# Write one line per cluster, in cluster-id order, so line i of topics.txt
# always describes cluster i.
# all_keywords only has entries for clusters whose count matrix exists, so it
# is consumed in step with the non-None entries of vectorized_data instead of
# being indexed by position.
keywords_iter = iter(all_keywords)

with open('../Data/topics.txt','w') as f:
    for cluster_counts in vectorized_data:
        # `is None`, not `!= None`: the entries are sparse matrices, for which
        # comparison operators are not a reliable None test.
        if cluster_counts is None:
            f.write("Not enough instances to be determined. \n")
        else:
            f.write(', '.join(next(keywords_iter)) + "\n")

In [None]:
# Re-apply the reduced-feature cluster ids (same assignment as made earlier in the notebook).
df_en_abstract_only['cluster'] = label_reduced[1]

In [None]:
import pickle

# Each dump uses a context manager so the file is flushed and closed even if
# pickling fails.

# save the COVID-19 DataFrame
with open("../Data/df_en_abstract_only.p", "wb") as fh:
    pickle.dump(df_en_abstract_only, fh)

# save the final t-SNE: the embedding of the SVD-reduced features, which is
# what the file name promises and what the interactive plot uses
with open("../Data/X_embedded_reduced.p", "wb") as fh:
    pickle.dump(X_embedded_reduced, fh)

# save the k-means (k=40) assignment that populates df_en_abstract_only['cluster']
with open("../Data/y_pred.p", "wb") as fh:
    pickle.dump(label_reduced, fh)

In [None]:
# required libraries for plot
from plot_text import header
from call_backs import input_callback, selected_code
import bokeh
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import viridis, plasma, Category20
from bokeh.transform import linear_cmap, transform
from bokeh.io import output_file, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import RadioButtonGroup, TextInput, Div, Paragraph
from bokeh.layouts import column, widgetbox, row, layout
from bokeh.layouts import column
from bokeh.io import curdoc

In [None]:
import os

# Read the per-cluster keyword lines written earlier; path is relative to the
# current working directory.
topic_path = os.path.join(os.getcwd(), '../Data/topics.txt')
with open(topic_path) as f:
    # One list entry per line, trailing newlines included.
    topics = f.readlines()

In [None]:
# Attach the 2-D t-SNE coordinates of the reduced features as plot columns.
df_en_abstract_only['x'] = X_embedded_reduced[:,0]
df_en_abstract_only['y'] = X_embedded_reduced[:,1]

In [None]:
# One frame per cluster id (0-39), each without the 'Unnamed: 0' column.
df_bokeh_plot = []
for i in range(40):
    cluster_rows = df_en_abstract_only.loc[df_en_abstract_only['cluster']==i]
    df_bokeh_plot.append(cluster_rows.drop(['Unnamed: 0'],axis=1))
    print(i)
df_bokeh_plot[0]

In [None]:
# show on notebook
output_notebook()
# target labels
y_labels = label_reduced[1]

# hover over information: each entry maps a tooltip caption to a data-source column
tooltip_fields = [
    ("Title", "@titles{safe}"),
    ("Author(s)", "@authors{safe}"),
    ("Journal", "@journal"),
    ("Abstract", "@abstract{safe}"),
    ("Link", "@links")
]
hover = HoverTool(tooltips=tooltip_fields, point_policy="follow_mouse")

# map colors: 10 + 10 + 20 entries, one per cluster
colors1 = plasma(10)
colors2 = viridis(10)
colors3 = Category20[20]
colors = colors1+colors2+colors3

# prepare the figure
plot = figure(
    plot_width=1200,
    plot_height=850,
    tools=[hover],
    title="Clustering of the COVID-19 Literature with t-SNE and K-Means",
)

curdoc().theme = 'caliber'

# One glyph renderer per cluster, so each cluster gets its own legend entry.
for cluster_data, name, color in zip(df_bokeh_plot, range(40), colors):

    columns = {
        'x': cluster_data['x'],
        'y': cluster_data['y'],
        'desc': cluster_data['cluster'],
        'titles': cluster_data['title'],
        'authors': cluster_data['authors'],
        'journal': cluster_data['journal'],
        'abstract': cluster_data['abstract_summary'],
        'labels': ["C-" + str(x) for x in cluster_data['cluster']],
        'links': cluster_data['doi'],
    }
    source = ColumnDataSource(data=columns)

    plot.scatter(
        'x',
        'y',
        size=5,
        fill_color=color,
        fill_alpha=0.6,
        line_alpha=0.3,
        legend_label=str(name),
        source=source,
    )


# Clicking a legend entry hides that cluster's points.
plot.legend.click_policy="hide"
plot.legend.background_fill_alpha = 0.6

In [None]:
# STYLE
# `header` comes from plot_text; stretch it horizontally and set its text
# colour, font and margin.
header.sizing_mode = "stretch_width"
header.style={'color': '#2e484c', 'font-family': 'Julius Sans One, sans-serif;'}
header.margin=5

# Let the scatter plot scale in both directions.
plot.sizing_mode = "scale_both"
plot.margin = 5

In [None]:
# LAYOUT OF THE PAGE
# Two rows: the header on top, the scatter plot below.
l = layout([
    [header],
    [plot]
])
l.sizing_mode = "scale_both"


# show
# Direct output to a standalone HTML file, then render the layout.
output_file('t-sne_covid-19_kmeans.html')
show(l)