In [None]:
!pip install -q -U transformers
!pip install -q -U sentence-transformers
!pip install -q -U scikit-learn
!pip install -q -U umap-learn
!pip install -q -U seaborn
!pip install -q -U bokeh
!pip install -q -U numpy
!pip install -q -U torch

In [None]:
# Imports
import numpy as np
from sentence_transformers import SentenceTransformer
import umap
from sklearn.cluster import KMeans
import pandas as pd
import datetime as dt
import os

In [None]:
# Load data
# Optional working directory. Relative paths used in this notebook are resolved
# against it. Leave blank to stay in the current directory: os.chdir('') raises
# FileNotFoundError, so the call is skipped when nothing is entered.
working_dir = ''
if working_dir:
    os.chdir(working_dir)

folder = ''
csv_file = ''
file_path = os.path.join(folder, csv_file)

# Enter name of text column
text_column = ''
text_language = 'English' # English or Multi 

df = pd.read_csv(file_path)

# ---------------------------------------------------- #
# Clean data, e.g.
# df = df.dropna(subset=text_column)
# df = df[df['Abstract'] != '[No abstract available]']

# ---------------------------------------------------- #
# Renumber rows 0..n-1 after any filtering above; the later df.join() with the
# 2-D coordinates matches rows by index label, so the index must be contiguous.
df = df.reset_index(drop=True)

# Hand the texts over as a plain list of Python objects rather than a Series,
# so the encoder does not depend on the Series' index labels.
docs = df[text_column].tolist()

In [None]:
# Pick the embedding model for the configured language, plus the text prefix
# that model expects in front of every input.
if text_language.lower() == 'english':
    model = SentenceTransformer('sentence-transformers/allenai-specter')
    doc_prefix = ''
else:
    model = SentenceTransformer('intfloat/multilingual-e5-large') # https://huggingface.co/intfloat/multilingual-e5-large#
    # The E5 model card requires each input to start with "query: " or
    # "passage: "; "query: " is the prefix it recommends when the embeddings
    # are used as features, e.g. for clustering.
    doc_prefix = 'query: '

def convert_docs_to_embedding(docs):
    """Encode one text or a collection of texts with the selected model.

    Args:
        docs: A single string or an iterable of strings.

    Returns:
        Whatever ``model.encode`` returns for the (prefixed) input.

    Raises:
        TypeError: If a prefix is configured and an entry is not a string
            (e.g. a missing value), instead of silently embedding its repr.
    """
    if not doc_prefix:
        # No prefix needed: pass the input through untouched.
        return model.encode(docs, show_progress_bar=True)
    if isinstance(docs, str):
        return model.encode(doc_prefix + docs, show_progress_bar=True)
    prefixed_docs = [doc_prefix + doc for doc in docs]
    return model.encode(prefixed_docs, show_progress_bar=True)

docs_array = convert_docs_to_embedding(docs)

In [None]:
# Cluster the document embeddings with k-means
k = 5  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)

# Fit on the embeddings and read back one cluster label per row in a single call
clusters = kmeans.fit_predict(docs_array)

In [None]:
# Project the embeddings down to 2-D for plotting. UMAP is stochastic, so it is
# seeded (with the same seed KMeans uses) to give the same layout on every run.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(docs_array)
dimensions = pd.DataFrame(embedding, columns=['x', 'y'])

# join() aligns on the index; this relies on df carrying the default 0..n-1
# index so every document row meets its own coordinates.
merged = df.join(dimensions)

# Store cluster labels as strings so they can serve as categorical factors
# when the points are colour-mapped.
merged['clusters'] = clusters
merged['clusters'] = merged['clusters'].astype(str)


# BOKEH

In [None]:
from bokeh.plotting import figure, show, output_file, save, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, WheelZoomTool, BoxZoomTool, ResetTool, TapTool, CustomJS, Div, LassoSelectTool
from bokeh.models.widgets import RangeSlider
from bokeh.layouts import layout, column
from bokeh.transform import factor_cmap
from bokeh.palettes import Viridis256, Category20
from bokeh.io import curdoc

def configure_output(filename, title, save_file):
    """Send Bokeh output either to an HTML file or to the notebook.

    Args:
        filename: Output file name without extension; '.html' is appended.
        title: Title handed to ``output_file``.
        save_file: If truthy, output goes to ``<filename>.html``; otherwise
            output is rendered inline via ``output_notebook``.
    """
    # Plain truthiness test instead of `== True`, so any truthy flag works.
    if save_file:
        output_file(filename=f'{filename}.html', title=title)
    else:
        output_notebook()

def create_color_palette(num_categories, color1, color2):
    """Build a list with exactly one colour per category.

    Args:
        num_categories: Number of distinct categories to colour.
        color1: Colour used when there are one or two categories.
        color2: Second colour, used when there are exactly two categories.

    Returns:
        A list of ``num_categories`` colours: empty for zero (or fewer)
        categories, the caller's colours for 1-2, ``Category20`` for 3-20, and
        evenly spaced samples of ``Viridis256`` beyond that.
    """
    if num_categories <= 0:
        # Category20 has no entry for 0; nothing to colour.
        return []
    if num_categories == 1:
        return [color1]
    if num_categories == 2:
        return [color1, color2]
    if num_categories <= 20:
        return list(Category20[num_categories])

    # Sample across the whole Viridis range. Integer scaling keeps every index
    # below len(Viridis256), so this also works for more than 256 categories
    # (colours then repeat) instead of hitting a zero range() step.
    palette_size = len(Viridis256)
    return [Viridis256[(i * palette_size) // num_categories] for i in range(num_categories)]

def create_plot(source, title, category, categories, palette):
    """Create the scatter figure with points coloured by category.

    Args:
        source: Data source providing the 'x' and 'y' columns and the
            ``category`` column.
        title: Figure title.
        category: Name of the column whose values select each point's colour.
        categories: The factors (distinct values) of that column.
        palette: Colours assigned to the factors.

    Returns:
        The configured figure.
    """
    # A single mapping drives both the outline and the fill of each marker
    point_colors = factor_cmap(category, palette=palette, factors=categories)

    figure_options = dict(
        title=title,
        width=1000,
        height=1200,
        x_axis_label='X',
        y_axis_label='Y',
        tools="pan,wheel_zoom,box_zoom",
    )
    plot = figure(**figure_options)
    plot.scatter(
        'x',
        'y',
        source=source,
        size=10,
        line_color=point_colors,
        fill_color=point_colors,
        fill_alpha=0.5,
    )
    return plot

def add_tooltip(p, source):
    """Attach a hover tool showing title, authors, year and keywords.

    The tooltip fields refer to the data-source columns 'Title', 'Authors',
    'Year' and 'Author Keywords'; adjust them to match your data.

    Args:
        p: Figure that receives the hover tool.
        source: Not used by this function.
    """
    tooltip_fields = [
        ("Title", "@Title"),
        ("Authors", "@{Authors}"),  # braces are needed around names with spaces
        ("Year", "@Year"),
        ("Keywords", "@{Author Keywords}"),
    ]
    p.add_tools(HoverTool(tooltips=tooltip_fields, attachment='vertical'))

def add_reset_functionality(p, details_div):
    """Add a reset tool that also restores the details panel placeholder.

    Args:
        p: Figure that receives the reset tool and the 'reset' event handler.
        details_div: Div whose text is set back to the placeholder on reset.
    """
    placeholder_js = """
        details_div.text = '<div style="text-align: center; width: 100%;">Click on a point to see details.</div>';
    """
    p.add_tools(ResetTool())

    # Runs in the browser whenever the plot emits its 'reset' event
    p.js_on_event('reset', CustomJS(args=dict(details_div=details_div), code=placeholder_js))

def add_info_windows(p, source, details_div):
    """Show a detail card in ``details_div`` for every selected point.

    The JavaScript is attached to the data source's selection rather than to
    the tap tool, so it runs for tap selections, lasso selections and when the
    selection is cleared (which restores the placeholder text).

    The cards read the columns 'Title', 'Author full names', 'Abstract',
    'clusters', 'Year', 'Link', 'Author Keywords', 'Index Keywords', 'x' and
    'y' from ``source``.

    Args:
        p: Figure that receives the tap and lasso tools; its height sizes the
            scrollable card container.
        source: Data source whose selection drives the panel.
        details_div: Div whose text is replaced with the rendered cards.
    """
    selection_callback_code = """
    // Cell values come from the loaded file, so escape them before they are
    // injected into the panel's HTML.
    const escape_html = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');
    const indices = source.selected.indices;
    const data = source.data;
    let details_html = '<div style="text-align: center; width: 100%;">Click on a point to see details.</div>';
    if (indices.length > 0) {
        details_html = '<div style="display: flex; flex-direction: row; flex-wrap: wrap; gap: 10px; height:"""+str(p.height)+"""px; overflow-y: auto; padding: 5px;">';
        indices.forEach(index => {
            details_html += `
                <div style="flex: none; min-width: 700px; max-width: 750px; background: #f0f0f0; padding: 10px; border: 1px solid #ccc; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                    <strong>Title:</strong> ${escape_html(data.Title[index])}<br>
                    <strong>Authors:</strong> ${escape_html(data['Author full names'][index])}<br>
                    <strong>Abstract:</strong> ${escape_html(data.Abstract[index])}<br>
                    <strong>Cluster:</strong> ${escape_html(data.clusters[index])}<br>
                    <strong>Year:</strong> ${escape_html(data.Year[index])}<br>
                    <strong>Link:</strong> <a href="${escape_html(data.Link[index])}" target="_blank">Click to view</a><br>
                    <strong>Author Keywords:</strong> ${escape_html(data['Author Keywords'][index])}<br>
                    <strong>Index Keywords:</strong> ${escape_html(data['Index Keywords'][index])}<br>
                    <strong>Coordinates:</strong> (${data.x[index]}, ${data.y[index]})
                </div>`;
        });
        // Close the flex container opened above.
        details_html += '</div>';
    }
    details_div.text = details_html;
    """
    show_details = CustomJS(args={'source': source, 'details_div': details_div}, code=selection_callback_code)

    # React to every change of the selection, whichever tool caused it.
    source.selected.js_on_change('indices', show_details)

    p.add_tools(TapTool(), LassoSelectTool())

def add_slider(source, original_source, df, year_column):
    """Create a year range slider that filters the plotted rows in the browser.

    On every slider change the rows of ``original_source`` whose year lies
    inside the selected range (inclusive) are copied into ``source``. All
    columns are rebuilt together so they always have equal length. Rows whose
    year fails both comparisons (e.g. missing values) are dropped once the
    slider is moved.

    Args:
        source: Data source the plot draws from; its data is replaced.
        original_source: Unfiltered copy of the data; only read.
        df: DataFrame used to determine the slider's bounds.
        year_column: Name of the column holding the year.

    Returns:
        The configured RangeSlider.

    Raises:
        ValueError: If the year column's min/max cannot be converted to int
            (for instance when the column holds no valid values).
    """
    # Plain Python ints for the slider bounds rather than numpy scalars.
    min_year = int(df[year_column].min())
    max_year = int(df[year_column].max())
    range_slider = RangeSlider(start=min_year, end=max_year, value=(min_year, max_year), step=1, title="Year Filter")
    callback = CustomJS(args=dict(source=source, original_source=original_source, range_slider=range_slider, year_column=year_column), code="""
        const original_data = original_source.data;
        const [first_year, last_year] = range_slider.value;
        const years = original_data[year_column];

        // Row positions whose year lies inside the selected range
        const keep = [];
        for (let i = 0; i < years.length; i++) {
            if (years[i] >= first_year && years[i] <= last_year) {
                keep.push(i);
            }
        }

        // Rebuild EVERY column so all columns keep the same length
        const filtered = {};
        for (const name of Object.keys(original_data)) {
            const column = original_data[name];
            filtered[name] = keep.map((i) => column[i]);
        }

        // Previously selected positions would point at different rows now
        source.selected.indices = [];
        // Assigning a new data object notifies the plot of the change
        source.data = filtered;
    """)
    range_slider.js_on_change('value', callback)

    return range_slider

def main(df, category, color1, color2, output_filename, browser_title, title, save_file, year):
    """Build and show the interactive scatter plot with its details panel.

    Args:
        df: DataFrame to plot; must contain 'x', 'y', the ``category`` column,
            the ``year`` column and the columns used by the tooltip and the
            details panel.
        category: Column whose values determine the point colours.
        color1: Colour used when ``category`` has one or two distinct values.
        color2: Second colour, used when ``category`` has two distinct values.
        output_filename: HTML file name (without extension) used when saving.
        browser_title: Title of the saved HTML document.
        title: Title shown on the figure.
        save_file: Truthy to write an HTML file, falsy to render inline.
        year: Column used for the year range slider.
    """
    configure_output(output_filename, browser_title, save_file)

    source = ColumnDataSource(df)
    # Untouched copy of the data that the year slider filters from
    original_source = ColumnDataSource(df.copy())

    # Take the factors from the frame that was passed in, not from a notebook
    # global, so the function works for any DataFrame.
    categories = list(df[category].unique())
    palette = create_color_palette(len(categories), color1, color2)

    p = create_plot(source, title, category, categories, palette)

    # Match the panel to the plot's actual height; max_height is never set on
    # the figure, so using it would leave the panel without a height.
    details_div = Div(width=800, height=p.height, styles={'overflow-y': 'auto', 'background': 'white'})
    details_div.text = '<div style="text-align: center; width: 100%;">Click on a point to see details.</div>'

    add_info_windows(p, source, details_div)
    add_reset_functionality(p, details_div)
    add_tooltip(p, source)

    # Year range slider; the filtering itself runs client-side in add_slider's callback
    year_slider = add_slider(source, original_source, df, year)

    # Plot and details panel side by side, slider underneath
    layout_config = layout([[p, details_div], [year_slider]])
    show(layout_config)


In [None]:
# Name a column in the dataframe that you wish to use for colouring the dots
category = 'clusters'

# If you have a column with years you can create a slider for it
year = 'Year'

# If given a column containing only one or two unique values, set colours below
color1 = 'blue'
color2 = 'red'

# Name your output
filename = ''
browser_title = ''
title = ''
date = str(dt.date.today())
output_filename = f"{date}_{filename}"

# Save file as HTML? If you save the file it also opens in a separate browser window
save_file = False

# Pass the colours configured above instead of hard-coded literals, so that
# editing color1/color2 actually changes the plot.
main(df=merged,
     category=category,
     color1=color1,
     color2=color2,
     output_filename=output_filename,
     browser_title=browser_title,
     title=title,
     save_file=save_file,
     year=year
)