In [None]:
## INSTALLS

!pip install bertopic

In [None]:
## IMPORTS

from os import path
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bertopic import BERTopic
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

In [None]:
## LOAD IN YOUR DATA & ESTABLISH FOLDER FOR SAVING RESULTS

# Location of the CSV export to analyse.
# NOTE(review): a /content/drive/... path presumably needs Google Drive to be mounted first;
# no mount step is visible in this notebook - confirm before a fresh run.
DATA = "/content/drive/MyDrive/Data/modelling_paper_1/search_1/scopus.csv"
# Folder used as the default `save_path` argument of the plotting functions defined below.
# The default is bound when those functions are defined, so after changing this value
# the function-definition cells must be re-run for it to take effect.
SAVE_FILEPATH = "/content/drive/MyDrive/Data/modelling_paper_1/search_1/temp"

## READ YOUR DATA

# Load the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(DATA)
df.head()

In [None]:
## SET UP FUNCTIONS

In [None]:
## GENERATE PLOT TO SHOW PUBLICATIONS OVER TIME

def plot_freq_over_time(df,
                        year_col,
                        title_col,
                        save_name="publications_over_time.eps",
                        save_format="eps",
                        save_path=SAVE_FILEPATH,
                        x_label="Year",
                        y_label="Number of Publications"):
  """
  This function generates (and saves locally) a line plot of the number of papers within the corpus published per year.

  :param df: (DataFrame) pandas DataFrame containing data/metadata derived from paper database download.
  :param year_col: (str) the name of the column containing the date information. Values must be convertible to int.
  :param title_col: (str) the name of the column containing the paper titles. Non-null titles are counted per year.
  :param save_name: (str) the name you wish to save the plot under, including file extension. Defaults to "publications_over_time.eps"
  :param save_format: (str) the format in which you wish to save the file. Default is "eps"
  :param save_path: (str) the folder you wish to save the file in. It is created if it does not exist yet.
  :param x_label: (str) the x-axis label. Defaults to "Year"
  :param y_label: (str) the y-axis label. Defaults to "Number of Publications"

  Returns
  Nothing. Saves image file to specified location.
  """
  # count the titles per year; groupby leaves out rows whose year is missing
  year_df = df[[year_col, title_col]].groupby(year_col).count().reset_index()
  # ensure that the year column is formatted as integers (e.g. 2019 rather than 2019.0)
  year_df[year_col] = year_df[year_col].astype(int)

  # plot the figure; the legend is suppressed because its only entry would be the
  # name of the title column, while the y-axis actually shows a count
  timeseries = year_df.plot(x = year_col, y = title_col, legend = False)
  timeseries.set_xlabel(x_label)
  timeseries.set_ylabel(y_label)

  # save the figure: honour the save_path argument (not the module-level default) and
  # join folder and file name with a proper separator instead of string concatenation
  out_dir = Path(save_path)
  out_dir.mkdir(parents=True, exist_ok=True)
  timeseries.figure.savefig(out_dir / save_name, format=save_format)

In [None]:
## GENERATE PLOT TO SHOW WHICH JOURNALS ARE PUBLISHED IN MOST FREQUENTLY WITHIN THIS RESEARCH AREA

def journal_popularity_plot(df,
                            journal_col,
                            title_col,
                            save_name="top_20_journals.eps",
                            save_format="eps",
                            save_path=SAVE_FILEPATH,
                            x_label='Journal',
                            y_label='Number of Publications',
                            top_n=20):
  """
  This function plots (and saves locally) a bar chart showing the most-published-in journals within the corpus
  and their number of papers. It also prints the total number of distinct journals.

  :param df: (DataFrame) pandas DataFrame containing data/metadata derived from paper database download.
  :param journal_col: (str) the name of the column containing the journal name information
  :param title_col: (str) the name of the column containing the paper titles. Non-null titles are counted per journal.
  :param save_name: (str) the name you wish to save the plot under, including file extension. Defaults to "top_20_journals.eps"
  :param save_format: (str) the format in which you wish to save the file. Default is "eps"
  :param save_path: (str) the folder you wish to save the file in. It is created if it does not exist yet.
  :param x_label: (str) the x-axis label. Defaults to "Journal"
  :param y_label: (str) the y-axis label. Defaults to "Number of Publications"
  :param top_n: (int) how many of the most frequent journals to plot. Defaults to 20

  Returns
  Nothing. Saves image file to specified location.
  """
  # count papers per journal (only the title column is needed) and order by frequency
  journals = (
      df.groupby(journal_col)[title_col]
        .count()
        .reset_index()
        .rename(columns = {title_col: "Freq"})
        .sort_values(by = "Freq", ascending = False)
  )
  # print how many journals there are within the corpus
  print(f"There are {len(journals)} journals within this corpus.")

  # plot figure to show the top_n most-published-in journals
  journals_plot = journals.head(top_n).plot.bar(x = journal_col, y = "Freq", legend = False)
  journals_plot.set_xlabel(x_label)
  journals_plot.set_ylabel(y_label)

  # save the figure: join folder and file name with a proper separator instead of
  # string concatenation; bbox_inches="tight" so long journal names on the x-axis
  # are not cut off in the saved file
  out_dir = Path(save_path)
  out_dir.mkdir(parents=True, exist_ok=True)
  journals_plot.figure.savefig(out_dir / save_name, bbox_inches = "tight", format=save_format)

In [None]:
## DETERMINE THE LANGUAGE PROFILE OF THE CORPUS & DROP PAPERS THAT AREN'T IN ENGLISH

def get_language_profile(df,
                         language_col,
                         title_col,
                         save_name="top_languages.eps",
                         save_format="eps",
                         save_path=SAVE_FILEPATH,
                         x_label="Language",
                         y_label="Number of Publications"):
  """
  This is a function to plot the distribution of different languages present within the corpus, then drop all non-English papers.
  It saves the figure locally, prints how many papers were kept/dropped, and returns a dataframe containing only English-language papers.

  :param df: (DataFrame) pandas DataFrame containing data/metadata derived from paper database download
  :param language_col: (str) the name of the column containing the language of each paper
  :param title_col: (str) the name of the column containing the paper titles. Non-null titles are counted per language.
  :param save_name: (str) the name you wish to save the plot under, including file extension. Defaults to "top_languages.eps"
  :param save_format: (str) the format in which you wish to save the file. Default is "eps"
  :param save_path: (str) the folder you wish to save the file in. It is created if it does not exist yet.
  :param x_label: (str) the x-axis label. Defaults to "Language"
  :param y_label: (str) the y-axis label. Defaults to "Number of Publications"

  Returns
  :df: (DataFrame) independent copy of the rows of df whose language is exactly "English". The input df is not modified.
  Saves image file to specified location.
  """
  # count papers per language and order by frequency
  lang = (
      df.groupby(language_col)[title_col]
        .count()
        .reset_index()
        .sort_values(by = title_col, ascending = False)
  )

  # plot figure
  top_langs = lang.plot.bar(x = language_col, y = title_col, legend = False)
  top_langs.set_xlabel(x_label)
  top_langs.set_ylabel(y_label)

  # save the figure: join folder and file name with a proper separator instead of
  # string concatenation; bbox_inches="tight" so rotated tick labels are not cut off
  out_dir = Path(save_path)
  out_dir.mkdir(parents=True, exist_ok=True)
  top_langs.figure.savefig(out_dir / save_name, bbox_inches = "tight", format=save_format)

  # only keep rows where the language is specified to be English; every other value
  # (including a missing language) counts as non-English. The copy makes the result
  # independent of df so that columns can be added to it later without pandas warnings.
  english_df = df[df[language_col] == "English"].copy()

  # report how many papers were not in English
  n_total = len(df)
  n_english = len(english_df)
  print(f"There were {n_total} papers in the original corpus. {n_total - n_english} were non-English. {n_english} were in English.")

  return english_df

In [None]:
## UNDERTAKE TOPIC MODELLING

def topic_modelling(df, col_to_cluster_on, additional_stopwords=None):
  """
  This function clusters the content in a given column into unsupervised topics and returns a dataframe with topic information.

  :param df: (DataFrame) pandas DataFrame containing data/metadata derived from paper database download. It is not modified.
  :param col_to_cluster_on: (str) the name of the column containing the information you wish to cluster (paper abstracts are recommended).
                            Every value must be a string, because each one is split on whitespace.
  :param additional_stopwords: (list) list of any additional stopwords on top of the classic NLTK provided ones (as strings).
                               Matching is case-sensitive. Defaults to None (no additional stopwords).

  Returns
  :topic_df: (DataFrame) one row per input row, with the original columns plus 'no_stopwords' (the cleaned text),
             'Topic' (the assigned topic id) and the columns of BERTopic's get_topic_info() for that topic.
  """
  # build the stopword collection once as a set, so each membership check is a hash lookup
  stop = set(stopwords.words('english')).union(additional_stopwords or [])

  # work on a copy so the caller's dataframe does not silently gain extra columns
  docs_df = df.copy()
  # remove stopwords
  docs_df['no_stopwords'] = docs_df[col_to_cluster_on].apply(
      lambda text: ' '.join(word for word in text.split() if word not in stop)
  )

  # initialise the BERTopic model
  # NOTE(review): the model is built with default settings; topic assignments may differ
  # between runs - confirm whether a fixed seed is needed for reproducibility.
  topic_model = BERTopic()
  # apply the topic model to your data; a plain list of strings is passed so the result
  # does not depend on the dataframe's index
  topics, _ = topic_model.fit_transform(docs_df['no_stopwords'].tolist())

  # topics are returned in document order, so they are attached positionally. Joining
  # them back on the document text instead would multiply rows whenever two papers
  # share an identical text.
  docs_df['Topic'] = topics

  # add the per-topic information (one row per topic) to every document
  topic_info = topic_model.get_topic_info()
  topic_df = pd.merge(docs_df, topic_info, on = "Topic", how = "inner")

  return topic_df

In [None]:
def topics_over_time(df,
                     year_col,
                     cluster_name_col,
                     count_col,
                     save_name="clusters_over_time.eps",
                     save_format="eps",
                     save_path=SAVE_FILEPATH,
                     x_label="Year",
                     y_label="Number of Publications"):
  """
  Function to render visualisation of the number of papers per cluster over time (one line per cluster).
  It saves the figure locally.

  :param df: (DataFrame) pandas DataFrame containing data/metadata derived from paper database download.
  :param year_col: (str) name of the column containing date information
  :param cluster_name_col: (str) name of the column with cluster name information, derived from topic modelling
  :param count_col: (str) name of a column whose non-null values are counted per year and cluster
                    (e.g. the document-frequency column derived from topic modelling)
  :param save_name: (str) the name you wish to save the plot under, including file extension. Defaults to "clusters_over_time.eps"
  :param save_format: (str) the format in which you wish to save the file. Default is "eps"
  :param save_path: (str) the folder you wish to save the file in. It is created if it does not exist yet.
  :param x_label: (str) the x-axis label. Defaults to "Year"
  :param y_label: (str) the y-axis label. Defaults to "Number of Publications"

  Returns
  Nothing. Saves image file to specified location.
  """
  # number of papers per (year, cluster), reshaped to one row per year and one column
  # per cluster. A cluster with no papers in a given year gets 0 rather than a missing
  # value, so its line drops to zero instead of being interrupted.
  papers_per_cluster = (
      df.groupby([year_col, cluster_name_col])[count_col]
        .count()
        .unstack(cluster_name_col, fill_value = 0)
  )

  # plot one line per cluster, with a small-font legend
  cluster_freq_over_time = papers_per_cluster.plot.line(legend = False)
  cluster_freq_over_time.legend(fontsize = "xx-small")
  cluster_freq_over_time.set_xlabel(x_label)
  cluster_freq_over_time.set_ylabel(y_label)

  # save the figure: join folder and file name with a proper separator instead of
  # string concatenation
  out_dir = Path(save_path)
  out_dir.mkdir(parents=True, exist_ok=True)
  cluster_freq_over_time.figure.savefig(out_dir / save_name, bbox_inches = "tight", format=save_format)

In [None]:
## GENERATE YOUR OUTPUTS

In [None]:
plot_freq_over_time(df, "Year", "Title")

In [None]:
journal_popularity_plot(df, "Source title", "Title")

In [None]:
df = get_language_profile(df, "Language of Original Document", "Title")
df

In [None]:
df = topic_modelling(df, "Abstract", additional_stopwords = ["and", "the", "The", "of", "in", "to", "this"])
df.head()

In [None]:
topics_over_time(df, "Year", "Name", "Count")