In [1]:
import os

import altair as alt
import cohere
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Create the Cohere client. The API key is read from the CO_API_KEY environment
# variable so that no credential is stored in the notebook; a missing variable
# raises KeyError naming it.
# Get your free API key: https://dashboard.cohere.com/api-keys
# NOTE(review): a key was previously committed on this line and should be revoked.
co = cohere.ClientV2(os.environ["CO_API_KEY"])

In [3]:
# Introduction to Text Embeddings
# Step 1: Prepare the Dataset

# Read the ATIS intents training set; the column names are supplied explicitly
df_orig = pd.read_csv(
    'https://raw.githubusercontent.com/cohere-ai/cohere-developer-experience/main/notebooks/data/atis_intents_train.csv',
    names=['intent', 'query'],
)

# Draw a seeded 10% sample and keep only three intent classes for illustration
sample_classes = ['atis_airfare', 'atis_airline', 'atis_ground_service']
df = (
    df_orig
    .sample(frac=0.1, random_state=30)
    .loc[lambda frame: frame['intent'].isin(sample_classes)]
)

# Take the selected rows out of the full set, then renumber the sample from 0
df_orig = df_orig.drop(index=df.index)
df = df.reset_index(drop=True)

# Set the intent labels aside for later use and keep only the query text
intents = df['intent']
df = df.drop(columns=['intent'])
df.head()

Unnamed: 0,query
0,which airlines fly from boston to washington ...
1,show me the airlines that fly between toronto...
2,show me round trip first class tickets from n...
3,i'd like the lowest fare from denver to pitts...
4,show me a list of ground transportation at bo...


In [4]:
# Print the first ten queries in full, one per line
for query_text in df['query'].head(10):
    print(query_text)

 which airlines fly from boston to washington dc via other cities
 show me the airlines that fly between toronto and denver
 show me round trip first class tickets from new york to miami
 i'd like the lowest fare from denver to pittsburgh
 show me a list of ground transportation at boston airport
 show me boston ground transportation
 of all airlines which airline has the most arrivals in atlanta
 what ground transportation is available in boston
 i would like your rates between atlanta and boston on september third
 which airlines fly between boston and pittsburgh


In [5]:
# Step 2: Turn Text into Embeddings
# Get text embeddings
def get_embeddings(texts, model="embed-v4.0", input_type="search_document", batch_size=96):
    """Embed a collection of texts with the Cohere client `co`.

    Args:
        texts: Iterable of strings to embed. A single string is treated as a
            one-element collection.
        model: Name of the embedding model passed to `co.embed`.
        input_type: Input type passed to `co.embed` (this notebook uses
            "search_document", "search_query" and "clustering").
        batch_size: Maximum number of texts sent per request. The default of 96
            follows the per-call limit in Cohere's Embed API reference
            (NOTE(review): confirm against the current API documentation).

    Returns:
        A list with one float embedding per input text, in input order. An empty
        input returns an empty list without calling the API.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string would otherwise be split into characters by list()
    texts = [texts] if isinstance(texts, str) else list(texts)

    embeddings = []
    # Send the texts in slices so one call never exceeds the request limit
    for start in range(0, len(texts), batch_size):
        output = co.embed(
            texts=texts[start:start + batch_size],
            model=model,
            input_type=input_type,
            embedding_types=["float"]
        )
        embeddings.extend(output.embeddings.float)
    return embeddings

In [6]:
# Embed every query and store the vectors next to the text
df['query_embeds'] = get_embeddings(list(df['query']))
df.head()

Unnamed: 0,query,query_embeds
0,which airlines fly from boston to washington ...,"[0.05444336, -0.021484375, -0.002029419, -0.03..."
1,show me the airlines that fly between toronto...,"[0.022460938, 0.010925293, -0.015136719, -0.01..."
2,show me round trip first class tickets from n...,"[-0.053710938, 0.029418945, -0.005126953, 0.00..."
3,i'd like the lowest fare from denver to pitts...,"[0.048339844, 0.017211914, -0.020507812, -0.01..."
4,show me a list of ground transportation at bo...,"[0.04711914, -0.003829956, 0.008178711, -0.053..."


In [7]:
# Step 3: Visualize Embeddings on a Heatmap

# Function to return the principal components
def get_pc(arr, n, random_state=0):
    """Project `arr` onto its first `n` principal components.

    Args:
        arr: Array-like of samples passed to `PCA.fit_transform`.
        n: Number of principal components to keep.
        random_state: Seed forwarded to `PCA`. It is fixed by default so that
            repeated runs give the same projection when scikit-learn selects a
            randomized solver (NOTE(review): solver choice depends on the data
            shape and scikit-learn version).

    Returns:
        The transformed array returned by `PCA.fit_transform`.
    """
    pca = PCA(n_components=n, random_state=random_state)
    return pca.fit_transform(arr)

# Reduce embeddings to 10 principal components to aid visualization
embeds = np.array(df['query_embeds'].tolist())
embeds_pc = get_pc(embeds, 10)

# Set sample size to visualize
sample = 9

# Reshape the data for visualization purposes.
# Both the components and the queries are cut to the same first `sample` rows;
# concatenating the full query column would add a NaN-filled row for every
# query beyond the sample.
source = pd.DataFrame(embeds_pc)[:sample]
source = pd.concat([source, df['query'].iloc[:sample]], axis=1)
# Long format: one row per (query, component) pair, as the rect marks expect
source = source.melt(id_vars=['query'])

# Configure the plot
chart = alt.Chart(source).mark_rect().encode(
    x=alt.X('variable:N', title="Embedding"),
    y=alt.Y('query:N', title='',axis=alt.Axis(labelLimit=500)),
    color=alt.Color('value:Q', title="Value", scale=alt.Scale(
                range=["#917EF3", "#000000"]))
)

result = chart.configure(background='#ffffff'
        ).properties(
        width=700,
        height=400,
        title='Embeddings with 10 dimensions'
       ).configure_axis(
      labelFontSize=15,
      titleFontSize=12)

# Show the plot
result

In [8]:
# Step 4: Visualize Embeddings on a 2D Plot

# Function to generate the 2D plot
def generate_chart(df,xcol,ycol,lbl='on',color='basic',title=''):
    """Build a 2D scatter chart of `df` with a text layer on top of the points.

    Args:
        df: Dataframe to plot; it must contain a 'query' column, which is used
            for the tooltip and for the labels.
        xcol: Field used for the x position.
        ycol: Field used for the y position.
        lbl: 'on' draws each point's query as a black label; any other value
            adds a text layer with no text encoding.
        color: 'basic' gives every point the fixed colour '#333293'; any other
            value is passed to Altair as the colour encoding.
        title: Chart title.

    Returns:
        The layered, configured Altair chart.
    """
    # Resolve the colour encoding first so the chart definition below stays flat
    if color == 'basic':
        point_color = alt.value('#333293')
    else:
        point_color = color

    # Both axes are drawn without labels, ticks or domain line, and are not forced to include zero
    x_encoding = alt.X(
        xcol,
        scale=alt.Scale(zero=False),
        axis=alt.Axis(labels=False, ticks=False, domain=False),
    )
    y_encoding = alt.Y(
        ycol,
        scale=alt.Scale(zero=False),
        axis=alt.Axis(labels=False, ticks=False, domain=False),
    )

    points = alt.Chart(df).mark_circle(size=500).encode(
        x=x_encoding,
        y=y_encoding,
        color=point_color,
        tooltip=['query'],
    )

    # The text layer reuses the encodings of the point layer
    if lbl == 'on':
        labels = points.mark_text(
            align='left', baseline='middle', dx=15, size=13, color='black'
        ).encode(text='query', color=alt.value('black'))
    else:
        labels = points.mark_text(align='left', baseline='middle', dx=10).encode()

    layered = (points + labels).configure(background="#FDF7F0")
    layered = layered.properties(width=800, height=500, title=title)
    return layered.configure_legend(orient='bottom', titleFontSize=18, labelFontSize=18)


# Project the embeddings onto two principal components for plotting
embeds_pc2 = get_pc(embeds, 2)

# Attach the two components to the dataframe under the string column names '0' and '1'
df_pc2 = pd.concat([df, pd.DataFrame(embeds_pc2, columns=['0', '1'])], axis=1)

# Chart the first `sample` points
generate_chart(df_pc2.iloc[:sample], '0', '1', title='2D Embeddings')

### **Introduction to Semantic Search**

In [9]:
# Step 1: Embed the Search Query
# The query to search with
new_query = "How can I find a taxi or a bus when the plane lands?"

# Embed it with the "search_query" input type; one text goes in, so take the first embedding returned
new_query_embeds = get_embeddings(texts=[new_query], input_type="search_query")[0]

In [10]:
# Calculate cosine similarity between the search query and existing queries
def get_similarity(target, candidates):
    """Rank candidate vectors by cosine similarity to a target vector.

    Args:
        target: One vector (1D array-like).
        candidates: Array-like of candidate vectors, one per row. A single 1D
            vector is treated as one candidate.

    Returns:
        A list of `(index, score)` tuples sorted by descending similarity.
        `index` is the candidate's row position and `score` its cosine
        similarity to `target`. The list can be iterated any number of times.
        Empty `candidates` returns an empty list.
    """
    candidates = np.asarray(candidates)
    if candidates.size == 0:
        return []

    # Turn inputs into 2D arrays: one target row, one row per candidate
    candidates = np.atleast_2d(candidates)
    target = np.asarray(target).reshape(1, -1)

    # Calculate cosine similarity; row 0 holds the scores for the single target.
    # Indexing (rather than squeezing) keeps a 1D array even with one candidate.
    sim = cosine_similarity(target, candidates)[0]
    sort_index = np.argsort(sim)[::-1]

    # Return similarity scores
    return [(int(i), float(sim[i])) for i in sort_index]

# Get the similarity between the search query and existing queries.
# The result is stored as a list so that it can be iterated more than once
# (for example when the cell that prints it is re-run).
similarity = list(get_similarity(new_query_embeds, embeds[:sample]))

In [11]:
# Show the search query followed by the ranked existing queries
print('Query:')
print(new_query,'\n')

print('Most Similar Documents:')
for doc_pos, score in similarity:
    # doc_pos is a row position in df, so look the text up positionally
    matched_query = df['query'].iloc[doc_pos]
    print(f'Similarity: {score:.2f};', matched_query)

Query:
How can I find a taxi or a bus when the plane lands? 

Most Similar Documents:
Similarity: 0.34;  what ground transportation is available in boston
Similarity: 0.32;  show me boston ground transportation
Similarity: 0.32;  show me a list of ground transportation at boston airport
Similarity: 0.25;  i would like your rates between atlanta and boston on september third
Similarity: 0.21;  of all airlines which airline has the most arrivals in atlanta
Similarity: 0.15;  show me round trip first class tickets from new york to miami
Similarity: 0.15;  which airlines fly from boston to washington dc via other cities
Similarity: 0.14;  i'd like the lowest fare from denver to pittsburgh
Similarity: 0.11;  show me the airlines that fly between toronto and denver


In [12]:
# Step 3: Visualize the Results in a 2D Plot
# Create new dataframe and append new query.
# The two columns are selected explicitly: the appended row supplies exactly two
# values, so this keeps working if `df` has additional columns.
df_sem = df[['query', 'query_embeds']].copy()
df_sem.loc[len(df_sem.index)] = [new_query, new_query_embeds]

# Reduce embeddings dimension to 2
embeds_sem = np.array(df_sem['query_embeds'].tolist())
embeds_sem_pc2 = get_pc(embeds_sem, 2)

# Add the principal components to dataframe
df_sem_pc2 = pd.concat([df_sem, pd.DataFrame(embeds_sem_pc2)], axis=1)


# Create column for representing chart legend; the new query is the last row
df_sem_pc2['Source'] = 'Existing'
df_sem_pc2.at[len(df_sem_pc2)-1, 'Source'] = "New"

# Plot on a chart: the first `sample` existing queries plus the new query (last row)
df_sem_pc2.columns = df_sem_pc2.columns.astype(str)
selection = list(range(sample)) + [-1]
generate_chart(df_sem_pc2.iloc[selection],'0','1',color='Source',title='Semantic Search')


### **Clustering Using Embeddings**

In [13]:
# Step 1: Embed the Text for Clustering
# Request embeddings with the "clustering" input type and store them on the dataframe
df['clustering_embeds'] = get_embeddings(list(df['query']), input_type="clustering")
# Collect the per-row embeddings into a NumPy array for the clustering step
embeds = np.array(list(df['clustering_embeds']))

In [14]:
# Step 2: Cluster the Embeddings
# Number of clusters to form
n_clusters = 2

# Fit k-means with a fixed seed and get one cluster id per row
kmeans_model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=0)
classes = kmeans_model.fit_predict(embeds).tolist()

# Attach the assignments, as strings, to a copy of the 2D-projection dataframe
df_clust = df_pc2.copy()
df_clust['cluster'] = [str(label) for label in classes]

# Preview the cluster assignments
df_clust.head()

Unnamed: 0,query,query_embeds,0,1,cluster
0,which airlines fly from boston to washington ...,"[0.05444336, -0.021484375, -0.002029419, -0.03...",-0.081384,0.43332,0
1,show me the airlines that fly between toronto...,"[0.022460938, 0.010925293, -0.015136719, -0.01...",-0.084157,0.519704,0
2,show me round trip first class tickets from n...,"[-0.053710938, 0.029418945, -0.005126953, 0.00...",-0.231987,-0.033887,0
3,i'd like the lowest fare from denver to pitts...,"[0.048339844, 0.017211914, -0.020507812, -0.01...",-0.290163,-0.100033,0
4,show me a list of ground transportation at bo...,"[0.04711914, -0.003829956, 0.008178711, -0.053...",0.509154,0.131675,1


In [15]:
# Step 3: Visualize the Results in a 2D Plot
# Make every column name a string, then chart the first `sample` rows coloured by cluster
df_clust.columns = df_clust.columns.astype(str)
generate_chart(
    df_clust.iloc[:sample],
    xcol='0',
    ycol='1',
    lbl='on',
    color='cluster',
    title='Clustering with 2 Clusters',
)

### **Clustering Using Embeddings**

In [17]:
# NOTE(review): this cell repeats the embedding and clustering steps of the
# preceding "Clustering Using Embeddings" section.

# Step 1: Embed the Text for Clustering
# Fetch "clustering" embeddings for every query and keep them on the dataframe
query_texts = list(df['query'])
df['clustering_embeds'] = get_embeddings(query_texts, input_type="clustering")
embeds = np.array(list(df['clustering_embeds']))

# Step 2: Cluster the Embeddings
# How many clusters k-means should form
n_clusters = 2

# Run seeded k-means and collect the cluster id of each row
kmeans_model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=0)
classes = kmeans_model.fit_predict(embeds).tolist()

# Record the assignments as strings on a copy of the 2D-projection dataframe
df_clust = df_pc2.copy()
df_clust['cluster'] = [str(cluster_id) for cluster_id in classes]

# Preview the cluster assignments
df_clust.head()

Unnamed: 0,query,query_embeds,0,1,cluster
0,which airlines fly from boston to washington ...,"[0.05444336, -0.021484375, -0.002029419, -0.03...",-0.081384,0.43332,0
1,show me the airlines that fly between toronto...,"[0.022460938, 0.010925293, -0.015136719, -0.01...",-0.084157,0.519704,0
2,show me round trip first class tickets from n...,"[-0.053710938, 0.029418945, -0.005126953, 0.00...",-0.231987,-0.033887,0
3,i'd like the lowest fare from denver to pitts...,"[0.048339844, 0.017211914, -0.020507812, -0.01...",-0.290163,-0.100033,0
4,show me a list of ground transportation at bo...,"[0.04711914, -0.003829956, 0.008178711, -0.053...",0.509154,0.131675,1


In [18]:
# Step 3: Visualize the Results in a 2D Plot
# Column names are cast to strings before charting the first `sample` rows by cluster
df_clust.columns = df_clust.columns.astype(str)
clustered_sample = df_clust.iloc[:sample]
generate_chart(clustered_sample, '0', '1', lbl='on', color='cluster', title='Clustering with 2 Clusters')