# LLM Clustering and Structured Outputs

In this demo, we will use unstructured data from [The 2024 MAD (ML, AI & Data) Landscape](https://mad.firstmark.com/) and automatically cluster it into meaningful categories, so that we can get better insights from the data. In the process, we will use Instructor to facilitate getting structured outputs from LLMs.


# Load data

In [2]:
import pandas as pd
from clustering_utils import cluster_texts, visualize
import warnings

warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('aicompanies_clean.csv')

In [4]:
df.head()

Unnamed: 0,category,company_name,website,founded_year,raised_amount,description
0,Storage,Cohesity,https://www.cohesity.com/,2013.0,$660M,Cohesity is a late-stage technology firm that ...
1,Storage,Qumulo,https://qumulo.com/,2012.0,$345.5M,"Qumulo, headquartered in Seattle, has develope..."
2,Storage,NetApp,https://www.netapp.com/,1992.0,,NetApp (NASDAQ: NTAP) provides data storage an...
3,Storage,HPE Nimble Storage,https://www.hpe.com/us/en/services/nimble-stor...,2015.0,,Hewlett Packard Enterprise (NYSE: HPE) provide...
4,Storage,MinIO,https://min.io/,2014.0,$126.3M,Minio provides open source object storage for ...


# Embed and visualize

In [5]:
clusters, embeddings = cluster_texts(df['description'].values.tolist())

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [6]:
visualize(df['company_name'].values.tolist(), df['category'].values.tolist(), embeddings)

In [7]:
len(clusters), embeddings.shape

(1733, (1733, 5))

In [8]:
# TODO: save `embeddings` to disk. The final visualization further down still
# uses this in-memory array, while `df` is reloaded from CSV in between, so the
# later sections currently depend on this kernel's state.

In [9]:
visualize(df['company_name'].values.tolist(), clusters, embeddings)

In [10]:
df['cluster_id'] = clusters

In [11]:
df.to_csv('aicompanies_clusters3.csv', index=False)

# Naming clusters

In [12]:
import os
# Fail fast if the OpenAI key is missing or malformed. Only the "sk-" prefix is
# checked, and the key itself is never echoed into the notebook output.
assert os.environ.get("OPENAI_API_KEY", "").startswith("sk-"), (
    "This doesn't look like a valid OpenAI API key"
)
print("OpenAI API key configured")

OpenAI API key configured


In [13]:
import pandas as pd
from clustering_utils import name_clusters, dedup_cluster_names, assign_clusters

In [14]:
df = pd.read_csv('aicompanies_clusters3.csv')

In [15]:
cluster_names = await name_clusters(df)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
with open('cluster_names1c.txt', 'w') as file:
    file.write('\n'.join(cluster_names))

# Dedup cluster names

In [17]:
# Reload the clustered companies and the raw cluster names so this section can
# be run on its own.
df = pd.read_csv('aicompanies_clusters3.csv')
# Read the same file the "Naming clusters" step writes ('cluster_names1c.txt',
# one name per line). The previous '...1b.txt' path pointed at a file this
# notebook never produces, so a fresh run would fail or pick up stale names.
with open('cluster_names1c.txt', 'r') as file:
    cluster_names = file.read().split('\n')

In [18]:
ded_clus_names = dedup_cluster_names(cluster_names)

In [19]:
with open('cluster_names2c.txt', 'w') as file:
    file.write('\n'.join(ded_clus_names))

# Assign clusters

In [20]:
# Reload the clustered companies and the de-duplicated cluster names so this
# section can be run on its own.
df = pd.read_csv('aicompanies_clusters3.csv')
# Read the same file the "Dedup cluster names" step writes ('cluster_names2c.txt',
# one name per line). The previous '...2b.txt' path pointed at a file this
# notebook never produces, so a fresh run would fail or pick up stale names.
with open('cluster_names2c.txt', 'r') as file:
    cluster_names = file.read().split('\n')

In [21]:
df = await assign_clusters(df, cluster_names)

In [22]:
df.to_csv('temp_new_clusters_c.csv', index=False)

In [23]:
cluster_names = df.new_cluster.unique().tolist()
with open('cluster_names3c.txt', 'w') as file:
    file.write('\n'.join(cluster_names))

# Dedup more cluster names

In [24]:
df = pd.read_csv('temp_new_clusters_c.csv')
with open('cluster_names3c.txt', 'r') as file:
    cluster_names = file.read().split('\n')

In [25]:
ded_clus_names = dedup_cluster_names(cluster_names)

In [26]:
with open('cluster_names4c.txt', 'w') as file:
    file.write('\n'.join(ded_clus_names))

# Assign final clusters

In [27]:
df = pd.read_csv('aicompanies_clusters3.csv')
with open('cluster_names4c.txt', 'r') as file:
    cluster_names = file.read().split('\n')

In [28]:
df = await assign_clusters(df, cluster_names, create=False)

In [29]:
# Sanity check: number of candidate names vs. number of distinct labels that
# were actually assigned to companies.
# NOTE(review): the recorded output is (40, 41), i.e. one more distinct label
# than there are names, even though assign_clusters was called with
# create=False (presumably meant to prevent new names) — TODO confirm where the
# extra label comes from.
len(cluster_names), df.new_cluster.nunique()

(40, 41)

In [30]:
df.to_csv('aicompanies_clusters3_final.csv', index=False)

# Visualize the result

In [31]:
df = pd.read_csv('aicompanies_clusters3_final.csv')

In [32]:
visualize(df['company_name'].values.tolist(), df['new_cluster'].values.tolist(), embeddings)

In [33]:
df.category.nunique(), df.new_cluster.nunique()

(99, 41)

# See the trends

In [97]:
# Restrict to companies founded after 2012 and before 2024 (i.e. 2013 through
# 2023), then keep only the 10 clusters with the most companies in that window.
focus_df = df.loc[df['founded_year'].gt(2012) & df['founded_year'].lt(2024)]
focus_cats = focus_df['new_cluster'].value_counts().iloc[:10].index
focus_df = focus_df.loc[focus_df['new_cluster'].isin(focus_cats)]

In [96]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category10
from bokeh.transform import dodge

output_notebook()

# Count companies per (founding year, cluster) and pivot to one column per
# cluster. groupby().size() omits combinations with no rows, so after the pivot
# those cells are NaN; they really mean "0 companies", and leaving them as NaN
# would break the lines at those years.
grouped_df = focus_df.groupby(['founded_year', 'new_cluster']).size().reset_index(name='count')
pivot_df = (
    grouped_df
    .pivot(index='founded_year', columns='new_cluster', values='count')
    .fillna(0)
    .astype(int)
)

# founded_year is stored as float (e.g. 2013.0); render clean labels like "2013".
years = [str(int(year)) for year in pivot_df.index]

# Passing the list of year labels as x_range makes the x-axis categorical.
# String x-values on the default numeric range do not render.
p = figure(width=800, height=400, title='Interactive Line Chart of Rows by Cluster across Years',
           x_axis_label='Year', y_axis_label='Count', x_range=years)

# Category10 only has entries for 3..10 colors, so indexing it with the number
# of clusters fails for fewer than 3; slice the full 10-color palette instead.
colors = Category10[10][:len(pivot_df.columns)]

# '@name' in a tooltip refers to a column of the glyph's ColumnDataSource.
hover = HoverTool(tooltips=[
    ('Year', '@year'),
    ('Count', '@count'),
    ('Cluster', '@cluster'),
])
p.add_tools(hover)

for col, color in zip(pivot_df.columns, colors):
    # One data source per cluster; 'cluster' is repeated on every row so the
    # hover tooltip can show which line the cursor is on.
    source = ColumnDataSource(data={
        'year': years,
        'count': pivot_df[col],
        'cluster': [col] * len(pivot_df),
    })
    p.line('year', 'count', source=source, line_width=2, color=color, legend_label=str(col))
    # scatter() (default marker: circle) replaces circle(size=...), which is
    # deprecated in recent Bokeh releases.
    p.scatter('year', 'count', source=source, color=color, size=8, legend_label=str(col))

# Put the legend outside the plot area so it does not cover the data, and let
# the reader toggle individual clusters by clicking their legend entry.
if p.legend:
    p.add_layout(p.legend[0], 'right')
    p.legend.click_policy = 'hide'

show(p)
