In [17]:
from duckduckgo_search import ddg
import pandas as pd

clust_method = 'kmeans'
keywords = 'lesswrong'
md = ddg(keywords, region='wt-wt', safesearch='Moderate', time='y', max_results=500)
md = pd.DataFrame(md)

In [18]:
from sentence_transformers import SentenceTransformer

model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)

sentence_embeddings = model.encode(md['body'].tolist(), show_progress_bar = True)
sentence_embeddings = pd.DataFrame(sentence_embeddings)

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

In [19]:
import umap.umap_ as umap
import matplotlib.pyplot as plt
reducer = umap.UMAP(metric = 'cosine')
dimr = reducer.fit_transform(sentence_embeddings)
dimr.shape

dimr = pd.DataFrame(dimr, columns = ['umap1', 'umap2'])

In [20]:
import numpy as np

if clust_method == "hdbscan":
    import hdbscan
    labels = hdbscan.HDBSCAN(
        min_samples=5,
        min_cluster_size=20,
    ).fit_predict(dimr[['umap1', 'umap2']])
else:
    import sklearn.cluster as cluster
    labels = cluster.KMeans(n_clusters=10).fit_predict(dimr[['umap1', 'umap2']])
    
dimr['cluster'] = labels
display(np.unique(labels)) 

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)

In [21]:
dat = pd.concat([dimr.reset_index(), md.reset_index()], axis = 1)
dat

Unnamed: 0,index,umap1,umap2,cluster,index.1,title,href,body
0,0,6.126308,15.378495,6,0,ChatGPT: Optimizing Language Models for Dialogue,https://openai.com/blog/chatgpt/,We've trained a model called ChatGPT which int...
1,1,6.274906,15.835807,5,1,ChatGPT has the answers. How does ChatGPT get ...,https://www.usatoday.com/story/tech/2023/01/27...,ChatGPT (Generative Pre-trained Transformer) i...
2,2,5.871943,12.925730,4,2,ChatGPT passes exams from law and business sch...,https://www.cnn.com/2023/01/26/tech/chatgpt-pa...,ChatGPT is smart enough to pass prestigious gr...
3,3,5.820159,14.696119,6,3,AI ChatGPT is helping CEOs think. Will it also...,https://www.cbsnews.com/news/chatgpt-chatbot-a...,ChatGPT is a language model that can be used f...
4,4,8.835108,15.581505,8,4,What is ChatGPT? The AI text generator explained,https://www.trustedreviews.com/explainer/what-...,ChatGPT is a chatbot created by the artificial...
...,...,...,...,...,...,...,...,...
165,165,6.732293,15.461653,5,165,ChatGPT | Know Your Meme,https://knowyourmeme.com/memes/sites/chatgpt,"ChatGPT, short for Chat Generative Pre-trained..."
166,166,8.460801,12.549894,0,166,How ChatGPT can turn anyone into a ransomware ...,https://venturebeat.com/security/chatgpt-ranso...,"While ChatGPT hasn't been out long, security r..."
167,167,8.093411,13.974854,2,167,Elevate Your Property Marketing with ChatGPT: ...,https://www.rismedia.com/2023/01/24/elevate-yo...,Three Prompt Basics for ChatGPT That Every Age...
168,168,9.622962,14.183983,7,168,Old Video: ChatGPT Founder On How He Plans To ...,https://www.ndtv.com/feature/old-video-chatgpt...,An old video of ChatGPT CEO Sam Altman has sur...


In [22]:
# Testing
from keybert import KeyBERT
import nltk
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

# Create WordNetLemmatizer object
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
kw_model = KeyBERT()

keywords_df = []
for i in np.unique(dat['cluster']):
    curr = dat[dat['cluster'] == i]
    text =  ' '.join(curr['title'])
    
    # Lemmatization
    text = nltk.word_tokenize(text)
    text = [wnl.lemmatize(i) for i in text]
    text = ' '.join(text)
    
    # Keyword extraction
    TR_keywords = kw_model.extract_keywords(text)
    keywords_df.append(TR_keywords[0:10])
    
keywords_df = pd.DataFrame(keywords_df)
keywords_df['cluster'] = np.unique(dimr['cluster'])
keywords_df.columns = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5', 'cluster']
keywords_df

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tylerburns/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tylerburns/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tylerburns/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,keyword1,keyword2,keyword3,keyword4,keyword5,cluster
0,"(chatgpt, 0.5194)","(chatbot, 0.4259)","(microsoft, 0.3402)","(chatting, 0.3221)","(blogs, 0.3176)",0
1,"(chatgpt, 0.5966)","(openai, 0.4725)","(chatbot, 0.3983)","(chat, 0.375)","(gpt, 0.369)",1
2,"(chatgpt, 0.6387)","(chatbot, 0.4784)","(chat, 0.4373)","(conversations, 0.353)","(technology, 0.3337)",2
3,"(chatgpt, 0.6901)","(chatbots, 0.563)","(chatbot, 0.5296)","(chat, 0.4988)","(chatteo, 0.3485)",3
4,"(chatgpt, 0.4452)","(exams, 0.4082)","(exam, 0.395)","(study, 0.3399)","(law, 0.2934)",4
5,"(chatgpt, 0.5262)","(chatbot, 0.4435)","(microsoft, 0.3729)","(openai, 0.3712)","(ai, 0.3663)",5
6,"(chatgpt, 0.6156)","(openai, 0.3609)","(gpt3, 0.318)","(dialogue, 0.3147)","(microsoft, 0.3088)",6
7,"(chatgpt, 0.6082)","(seo, 0.3712)","(seos, 0.356)","(google, 0.2892)","(web, 0.2509)",7
8,"(chatgpt, 0.6383)","(imessage, 0.4338)","(ai, 0.3901)","(text, 0.3585)","(talking, 0.3206)",8
9,"(chatgpt, 0.4891)","(educators, 0.417)","(teaching, 0.406)","(teachers, 0.3767)","(teacher, 0.3755)",9


In [23]:
dat = dat.merge(keywords_df) # This messes up the index
dat

Unnamed: 0,index,umap1,umap2,cluster,index.1,title,href,body,keyword1,keyword2,keyword3,keyword4,keyword5
0,0,6.126308,15.378495,6,0,ChatGPT: Optimizing Language Models for Dialogue,https://openai.com/blog/chatgpt/,We've trained a model called ChatGPT which int...,"(chatgpt, 0.6156)","(openai, 0.3609)","(gpt3, 0.318)","(dialogue, 0.3147)","(microsoft, 0.3088)"
1,3,5.820159,14.696119,6,3,AI ChatGPT is helping CEOs think. Will it also...,https://www.cbsnews.com/news/chatgpt-chatbot-a...,ChatGPT is a language model that can be used f...,"(chatgpt, 0.6156)","(openai, 0.3609)","(gpt3, 0.318)","(dialogue, 0.3147)","(microsoft, 0.3088)"
2,5,5.947445,15.025608,6,5,How to use ChatGPT | ZDNET,https://www.zdnet.com/article/how-to-use-chatgpt/,ChatGPT is a large language model that uses ar...,"(chatgpt, 0.6156)","(openai, 0.3609)","(gpt3, 0.318)","(dialogue, 0.3147)","(microsoft, 0.3088)"
3,6,6.604967,14.831259,6,6,What is ChatGPT and why does it matter? Here's...,https://www.zdnet.com/article/what-is-chatgpt-...,ChatGPT is a natural language processing tool ...,"(chatgpt, 0.6156)","(openai, 0.3609)","(gpt3, 0.318)","(dialogue, 0.3147)","(microsoft, 0.3088)"
4,8,6.018540,15.351077,6,8,ChatGPT: What Is It & How Can You Use It?,https://www.searchenginejournal.com/what-is-ch...,ChatGPT is a large language model chatbot deve...,"(chatgpt, 0.6156)","(openai, 0.3609)","(gpt3, 0.318)","(dialogue, 0.3147)","(microsoft, 0.3088)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,154,8.112314,12.914286,0,154,Jawaharlal Nehru Meets PM Modi on Republic Day...,https://www.thequint.com/news/india/jawaharlal...,"Yet, the what-if scenarios that can be played ...","(chatgpt, 0.5194)","(chatbot, 0.4259)","(microsoft, 0.3402)","(chatting, 0.3221)","(blogs, 0.3176)"
166,157,8.623848,13.219074,0,157,The 3 Best AI Stocks to Buy as ChatGPT Ushers ...,https://investorplace.com/2022/12/the-3-best-a...,ChatGPT has created a revolutionary new experi...,"(chatgpt, 0.5194)","(chatbot, 0.4259)","(microsoft, 0.3402)","(chatting, 0.3221)","(blogs, 0.3176)"
167,163,8.440441,13.329154,0,163,Google won't make a competitor to Microsoft Az...,https://www.windowscentral.com/software-apps/g...,"With ChatGPT making the news, Google employees...","(chatgpt, 0.5194)","(chatbot, 0.4259)","(microsoft, 0.3402)","(chatting, 0.3221)","(blogs, 0.3176)"
168,164,9.004629,12.803530,0,164,ChatGPT as a Python Programming Assistant - KD...,https://www.kdnuggets.com/2023/01/chatgpt-pyth...,ChatGPT did a perfectly competent job of creat...,"(chatgpt, 0.5194)","(chatbot, 0.4259)","(microsoft, 0.3402)","(chatting, 0.3221)","(blogs, 0.3176)"


In [24]:
import plotly.express as px
import textwrap
from ipywidgets import interactive, HBox, VBox
import plotly.graph_objs as go

to_display = 'body'

# Test the visuals
dat[to_display] = dat[to_display].str.wrap(30)
dat[to_display] = dat[to_display].apply(lambda x: x.replace('\n', '<br>'))
labels_cat = dat['cluster'].astype("category")
columns = ['title', 'href', 'body'] + ['keyword' + str(i) for i in range(1, 6)] 
f = px.scatter(dat, x ='umap1', y ='umap2', hover_data=columns, width = 800, height = 800, title = 'news tweets', color = labels_cat)

display(f)