This notebook is designed to run on Google Colab and expects the NYT snippets to be stored as a CSV file in your Google Drive.

Install the required packages. The BERTopic version used in the analysis was 0.16.


In [None]:
%%capture
!pip install bertopic stop-words

Get the English stop words and load the DataFrame that contains the NYT snippets.

In [None]:
from stop_words import get_stop_words

import pandas as pd
from google.colab import drive

# English stop-word list; handed to the CountVectorizer when the model is built.
eng_stopwords = get_stop_words('english')

# Mount Google Drive so files stored there are reachable from the Colab runtime.
drive.mount('/content/drive')
# Placeholder: point this at the Drive folder that holds the NYT CSV file.
gdrive = '/content/drive/YOUR/DRIVE/'

# Pass the path (not an open() handle) so pandas opens and closes the file
# itself; a bare open() here would leave the file handle unclosed.
blm_df = pd.read_csv(gdrive + "NYT_DATAFRAME", encoding="utf-8")

Mounted at /content/drive


We use a custom UMAP model with a fixed `random_state` so that the results can be reproduced.

In [None]:
# Build and fit the BERTopic model
from bertopic import BERTopic
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# UMAP with an explicit random_state so repeated runs give the same reduction.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=52,
)

# Vectorizer configured with the English stop-word list loaded earlier.
vectorizer_model = CountVectorizer(stop_words=eng_stopwords)

topic_model = BERTopic(
    language="english",
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    nr_topics="auto",
)

# The `text` column as a plain Python list of documents.
docs = blm_df["text"].to_list()

# Fit on the documents; returns the per-document topics and probabilities.
topics, probabilities = topic_model.fit_transform(docs)

In [None]:
# Per-topic word bar charts (top_n_topics=50, n_words=10); as the cell's last
# expression the returned figure is displayed inline.
topic_model.visualize_barchart(
    top_n_topics=50,
    n_words=10,
)

In [None]:
# Persist the fitted model to the Drive folder under the name "model".
model_path = gdrive + "model"
topic_model.save(model_path)



In [None]:
# Export the visualisations to the Drive folder as standalone HTML files.

# Per-topic word bar charts -> chart.html
bar = topic_model.visualize_barchart(
    top_n_topics=50,
    n_words=10,
)
bar.write_html(gdrive + "chart.html")

# Topic overview figure -> fig.html
fig = topic_model.visualize_topics()
fig.write_html(gdrive + "fig.html")

In [None]:
# Collect the words of every topic into a table with one column per topic id
# and write it to the Drive folder as topics.csv.
all_topics = topic_model.get_topics()

# Iterate over the topic ids that actually exist rather than assuming they are
# exactly -1, 0, 1, ..., n-2. BERTopic uses -1 for the outlier topic, which is
# only present when some documents were left unassigned; counting with
# range(len(all_topics) - 1) would drop the last real topic when there is no
# outlier topic and raise KeyError if the ids had gaps.
topic_dict = {
    topic_id: [word_tuple[0] for word_tuple in all_topics[topic_id]]
    for topic_id in sorted(all_topics)
    if topic_id != -1
}

df = pd.DataFrame(topic_dict)
df.to_csv(gdrive + "topics.csv")

In [None]:
# Reload the previously saved model from the Drive folder.
saved_model_path = gdrive + "model"
topic_model = BERTopic.load(saved_model_path)