In [None]:
%env OPENAI_API_KEY=<PUT_YOUR_API_KEY_HERE>

In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd


def twenty_newsgroup_to_csv():
    newsgroups_train = fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'))

    df = pd.DataFrame(
        [newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']

    targets = pd.DataFrame(newsgroups_train.target_names, columns=['title'])

    out = pd.merge(df, targets, left_on='target', right_index=True)
    out.to_csv('./data/20_newsgroup.csv', index=False)


twenty_newsgroup_to_csv()


In [None]:
from openai import OpenAI, RateLimitError
import os
import tiktoken
import backoff
import pandas as pd


embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
batch_size = 10
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

df = pd.read_csv('./data/20_newsgroup.csv')
print("Number of rows before null filtering:", len(df))
df = df[df['text'].isnull() == False]
encoding = tiktoken.get_encoding(embedding_encoding)

df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x)))
print("Number of rows before token number filtering:", len(df))
df = df[df.n_tokens <= max_tokens]
print("Number of rows data used:", len(df))


In [None]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def get_embeddings(text, model):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding


@backoff.on_exception(backoff.expo, RateLimitError, max_time=60, max_tries=10)
def get_embeddings_with_backoff(prompts, engine):
    embeddings = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        embeddings += get_embeddings(list_of_text=batch, engine=engine)
    return embeddings


prompts = df.text.tolist()
prompt_batches = [prompts[i:i+batch_size]
                  for i in range(0, len(prompts), batch_size)]

embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(
        prompts=batch, engine=embedding_model)
    embeddings += batch_embeddings

df["embedding"] = embeddings
df.to_parquet("./data/20_newsgroup_with_embedding.parquet", index=False)

In [6]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

embedding_df = pd.read_parquet("./data/20_newsgroup_with_embedding.parquet")

matrix = np.vstack(embedding_df.embedding.values)
num_of_clusters = 20

kmeans = KMeans(n_clusters=num_of_clusters, init="k-means++", n_init=10, random_state=42)
kmeans.fit(matrix)
labels = kmeans.labels_
embedding_df["cluster"] = labels


In [7]:
# Count the number of each cluster
new_df = embedding_df.groupby('cluster')['cluster'].count().reset_index(name='count')

# Count the number of the most frequent category within this cluster.
title_count = embedding_df.groupby(['cluster', 'title']).size().reset_index(name='title_count')
first_titles = title_count.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))
first_titles = first_titles.reset_index(drop=True)
new_df = pd.merge(new_df, first_titles[['cluster', 'title', 'title_count']], on='cluster', how='left')
new_df = new_df.rename(columns={'title': 'rank1', 'title_count': 'rank1_count'})

# Count the number of the second most frequent category within this cluster
second_titles = title_count[~title_count['title'].isin(first_titles['title'])]
second_titles = second_titles.groupby('cluster').apply(lambda x: x.nlargest(1, columns=['title_count']))
second_titles = second_titles.reset_index(drop=True)
new_df = pd.merge(new_df, second_titles[['cluster', 'title', 'title_count']], on='cluster', how='left')
new_df = new_df.rename(columns={'title': 'rank2', 'title_count': 'rank2_count'})
new_df['first_percentage'] = (new_df['rank1_count'] / new_df['count']).map(lambda x: '{:.2%}'.format(x))
new_df['second_percentage'] = ((new_df['rank1_count'] + new_df['rank2_count'])/ new_df['count']).map(lambda x: '{:.2%}'.format(x))

# Replace missing values with 0
new_df.fillna(0, inplace=True)
# Output the results
from IPython.display import display
display(new_df)


Unnamed: 0,cluster,count,rank1,rank1_count,rank2,rank2_count,first_percentage,second_percentage
0,0,522,rec.autos,432,comp.sys.mac.hardware,6.0,82.76%,83.91%
1,1,391,comp.sys.ibm.pc.hardware,101,comp.sys.mac.hardware,85.0,25.83%,47.57%
2,2,1060,talk.politics.misc,129,talk.religion.misc,60.0,12.17%,17.83%
3,3,381,rec.motorcycles,364,comp.sys.mac.hardware,1.0,95.54%,95.80%
4,4,783,comp.sys.ibm.pc.hardware,323,comp.sys.mac.hardware,314.0,41.25%,81.35%
5,5,659,soc.religion.christian,409,talk.religion.misc,151.0,62.06%,84.98%
6,6,358,sci.crypt,345,comp.sys.mac.hardware,1.0,96.37%,96.65%
7,7,84,comp.os.ms-windows.misc,8,comp.sys.mac.hardware,8.0,9.52%,19.05%
8,8,477,rec.sport.hockey,461,0,0.0,96.65%,nan%
9,9,472,sci.space,403,comp.sys.mac.hardware,1.0,85.38%,85.59%


In [None]:
%env OPENAI_API_KEY=<PUT_YOUR_API_KEY_HERE>


In [12]:
from openai import OpenAI
import os

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

items_per_cluster = 10
COMPLETIONS_MODEL = "text-davinci-003"

for i in range(num_of_clusters):
    cluster_name = new_df[new_df.cluster == i].iloc[0].rank1
    print(f"Cluster {i}, Rank 1: {cluster_name}, Theme:", end=" ")

    content = "\n".join(
        embedding_df[embedding_df.cluster == i].text.sample(items_per_cluster, random_state=42).values
    )
    response = client.completions.create(
        model=COMPLETIONS_MODEL,
        prompt=f'''We would like to categorize the following content into meaningful groups so that we can summarize it. Please name a news group in less than 20 words based on the commonalities in the content below. For example, 'PC Hardware'\n\nContent:\n"""\n{content}\n"""News Group Name:''',
        temperature=0,
        max_tokens=100,
        top_p=1,
    )
    print(response.choices[0].text.replace("\n", ""))


Cluster 0, Rank 1: rec.autos, Theme:  Automotive Maintenance and Performance
Cluster 1, Rank 1: comp.sys.ibm.pc.hardware, Theme:  Computer Hardware and Software
Cluster 2, Rank 1: talk.politics.misc, Theme:  Legal and Political Challenges
Cluster 3, Rank 1: rec.motorcycles, Theme:  Motorcycling Safety and Advice
Cluster 4, Rank 1: comp.sys.ibm.pc.hardware, Theme:  PC Hardware and Software Upgrades
Cluster 5, Rank 1: soc.religion.christian, Theme:  Christian Theology and Practices
Cluster 6, Rank 1: sci.crypt, Theme:  Government Surveillance and Crypto Regulations
Cluster 7, Rank 1: comp.os.ms-windows.misc, Theme:  International Politics"""
Cluster 8, Rank 1: rec.sport.hockey, Theme:  NHL Team Uniforms & Performance
Cluster 9, Rank 1: sci.space, Theme:  Space Exploration and Technology
Cluster 10, Rank 1: comp.windows.x, Theme:  X Window System Troubleshooting
Cluster 11, Rank 1: talk.politics.mideast, Theme:  Middle East Conflict
Cluster 12, Rank 1: comp.os.ms-windows.misc, Theme:  Win

# Summarize

In [13]:
history = """User : Who are you?
Assistant : Hi there! I'm the coach of the Los Angeles Chargers. How can I help you today? #BoltUp

User : What is the Los Angeles Chargers' main issue at the moment?
Assistant : Injuries have been a challenge for us this season, but we're working hard to overcome them and stay competitive. #StayStrong #BoltUp

User : What about other issues?
Assistant : Every team faces different challenges, but we're constantly evaluating and adjusting our game plan to address any areas that need improvement. #TeamWork #BoltUp
"""

def summarize(text, max_tokens=200):
    response = client.completions.create(
        model=COMPLETIONS_MODEL,
        prompt=text + "\n\nPlease summarize what the User and Assistant discussed above:\n",
        max_tokens=max_tokens,
    )
    return response.choices[0].text

summarized = summarize(history)
print(summarized)



The User and Assistant discussed the Los Angeles Chargers, with the Assistant explaining that the team is dealing with injuries but is working hard to stay competitive and address any issues that need improvement.


In [None]:
from openai import OpenAI
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

class Conversation:
    def __init__(self, prompt, num_of_rounds):
        self.prompt = prompt
        self.num_of_rounds = num_of_rounds
        self.messages = []
        self.messages.append({"role": "system", "content": self.prompt})

    def ask(self, question):
        try:
            self.messages.append({"role": "user", "content": question})
            response = client.chat.completions.create(model="gpt-3.5-turbo",
            messages=self.messages,
            temperature=0.5,
            max_tokens=2048,
            top_p=1)
        except Exception as e:
            print(e)
            return e

        message = response.choices[0].message.content
        self.messages.append({"role": "assistant", "content": message})

        if len(self.messages) > self.num_of_rounds*2 + 1:
            del self.messages[1:3]  # Remove the first round conversation left.
        return message

In [17]:
prompt = summarized + "\n\nBased on the content of the conversation so far, please continue the dialogue:"
conversation = Conversation(prompt, 5)

question = "How about San Francisco 49ers?"
answer = conversation.ask(question)
print("User : %s" % question)
print("Assistant : %s\n" % answer)


User : How about San Francisco 49ers?
Assistant : The San Francisco 49ers have had an interesting season so far. They started off strong but have faced their fair share of challenges, including injuries to key players. Despite that, they have shown resilience and have managed to stay competitive. The team has been working hard to address any issues that need improvement and make adjustments as necessary. It will be interesting to see how they continue to perform throughout the season. Is there anything specific you would like to know about the 49ers?



In [18]:
prompt = "\n\nBased on the content of the conversation so far, please continue the dialogue:"
conversation = Conversation(prompt, 5)

question = "How about San Francisco 49ers?"
answer = conversation.ask(question)
print("User : %s" % question)
print("Assistant : %s\n" % answer)


User : How about San Francisco 49ers?
Assistant : Oh, the San Francisco 49ers! They are a historic team with a rich history in the NFL. They have had some incredible seasons and have won multiple Super Bowls. What do you think of their performance this year?

