In [1]:
import cohere
import pandas as pd
import numpy as np
import altair as alt
import os

# The key is read from the COHERE_API_KEY environment variable rather than pasted into the notebook.
# Remember to not share it publicly.
api_key = os.environ.get('COHERE_API_KEY')
co = cohere.Client(api_key)

In [2]:
# Read the dataset into a dataframe with columns named 'intent' and 'query'
df_orig = pd.read_csv(
    'https://raw.githubusercontent.com/cohere-ai/notebooks/main/notebooks/data/atis_intents_train.csv',
    names=['intent', 'query'],
)

# Draw a reproducible 12% sample and keep only three intent classes for illustration
sample_classes = ['atis_airfare', 'atis_airline', 'atis_ground_service']
df = df_orig.sample(frac=0.12, random_state=30)
df = df.loc[df['intent'].isin(sample_classes)]

# Remove the sampled rows from df_orig. This relies on the original index
# labels, so it has to happen before the index of df is reset.
df_orig = df_orig.drop(index=df.index)
df = df.reset_index(drop=True)

# Keep the labels aside for later, then leave only the query text in df
intents = df['intent']
df = df.drop(columns=['intent'])
df.head()

Unnamed: 0,query
0,which airlines fly from boston to washington ...
1,show me the airlines that fly between toronto...
2,show me round trip first class tickets from n...
3,i'd like the lowest fare from denver to pitts...
4,show me a list of ground transportation at bo...


In [3]:
# Get text embeddings
def get_embeddings(texts, model='embed-english-v3.0', input_type="search_document"):
  """Embed a list of texts with the notebook-level Cohere client `co`.

  Args:
    texts: list of strings to embed.
    model: name of the embedding model passed to `co.embed`.
    input_type: input type passed to `co.embed`; this notebook uses
      "search_document", "search_query", "clustering" and "classification".

  Returns:
    The `embeddings` attribute of the `co.embed` response.
  """
  response = co.embed(texts=texts, model=model, input_type=input_type)
  return response.embeddings

# Embed every query (default input_type, "search_document") and store the vectors next to the text
query_texts = df['query'].to_list()
df['query_embeds'] = get_embeddings(query_texts)
df.head()

Unnamed: 0,query,query_embeds
0,which airlines fly from boston to washington ...,"[0.026550293, 0.012084961, -0.00881958, 0.0113..."
1,show me the airlines that fly between toronto...,"[0.013084412, 0.01776123, -0.014343262, -0.003..."
2,show me round trip first class tickets from n...,"[0.02053833, -0.038482666, 0.061523438, 0.0099..."
3,i'd like the lowest fare from denver to pitts...,"[0.0016889572, 0.015411377, -0.029052734, 0.03..."
4,show me a list of ground transportation at bo...,"[0.03793335, -0.008010864, -0.002319336, -0.01..."


In [4]:
# Reduce dimensionality using PCA
from sklearn.decomposition import PCA

# Function to return the principal components
def get_pc(arr,n):
  """Fit a PCA with `n` components on `arr` and return the transformed data.

  Args:
    arr: array of embeddings to fit the PCA on.
    n: number of principal components to keep.

  Returns:
    The result of `PCA(n_components=n).fit_transform(arr)`.
  """
  return PCA(n_components=n).fit_transform(arr)

In [5]:
# Stack the per-row embedding lists into one array, then keep
# 10 principal components to aid visualization
embeds = np.asarray(df['query_embeds'].tolist())
embeds_pc = get_pc(embeds, n=10)

In [6]:
# Set sample size to visualize
sample = 9

# Reshape the data for visualization purposes.
# Both the principal components and the query text are truncated to the same
# `sample` rows before being joined. Concatenating against the full query
# column would align on the index and produce one row per query in df, with
# NaN components for everything past the sample.
source = pd.concat(
    [pd.DataFrame(embeds_pc).iloc[:sample], df['query'].iloc[:sample]],
    axis=1,
)
# Long format: one row per (query, component) pair, as the heatmap expects
source = source.melt(id_vars=['query'])

# Configure the plot: one rectangle per (component, query), coloured by value
chart = alt.Chart(source).mark_rect().encode(
    x=alt.X('variable:N', title="Embedding"),
    y=alt.Y('query:N', title='',axis=alt.Axis(labelLimit=500)),
    color=alt.Color('value:Q', title="Value", scale=alt.Scale(
                range=["#917EF3", "#000000"]))
)

# The title reports the actual number of components held in embeds_pc
result = chart.configure(background='#ffffff'
        ).properties(
        width=700,
        height=400,
        title=f'Embeddings with {embeds_pc.shape[1]} dimensions'
       ).configure_axis(
      labelFontSize=15,
      titleFontSize=12)

# Show the plot
result

In [7]:
# Function to generate the 2D plot
def generate_chart(df,xcol,ycol,lbl='on',color='basic',title=''):
  """Build a layered Altair scatter plot of `ycol` against `xcol`.

  Args:
    df: dataframe to plot; must contain a 'query' column (used for the
      tooltip and, when labels are on, for the point labels).
    xcol: name of the column encoded on the x axis.
    ycol: name of the column encoded on the y axis.
    lbl: 'on' to draw each point's query text next to it; any other value
      adds a text layer without a text encoding.
    color: 'basic' for a fixed point colour, otherwise the name of the
      column to encode as colour.
    title: chart title.

  Returns:
    The configured chart made of the point layer plus the text layer.
  """
  def _hidden_axis():
    # Axis with labels, ticks and the domain line all switched off
    return alt.Axis(labels=False, ticks=False, domain=False)

  # Fixed colour by default; otherwise colour by the requested column
  point_color = color if color != 'basic' else alt.value('#333293')

  points = alt.Chart(df).mark_circle(size=500).encode(
    x=alt.X(xcol, scale=alt.Scale(zero=False), axis=_hidden_axis()),
    y=alt.Y(ycol, scale=alt.Scale(zero=False), axis=_hidden_axis()),
    color=point_color,
    tooltip=['query'],
  )

  # The text layer is derived from the point layer so it shares its encodings
  if lbl == 'on':
    labels = points.mark_text(align='left', baseline='middle', dx=15, size=13, color='black')
    labels = labels.encode(text='query', color=alt.value('black'))
  else:
    labels = points.mark_text(align='left', baseline='middle', dx=10).encode()

  layered = (points + labels).configure(background="#FDF7F0")
  layered = layered.properties(width=800, height=500, title=title)
  return layered.configure_legend(orient='bottom', titleFontSize=18, labelFontSize=18)

In [9]:
# Keep only 2 principal components so the embeddings can be drawn on a plane
embeds_pc2 = get_pc(embeds, 2)

# Put the components next to the queries; every column label is turned into
# a string so the components can be referenced as '0' and '1' below
pc2_frame = pd.DataFrame(embeds_pc2)
df_pc2 = pd.concat([df, pc2_frame], axis=1).rename(columns=str)

# Plot the first `sample` rows of the 2D embeddings
generate_chart(df_pc2.iloc[:sample], '0', '1', title='2D Embeddings')

In [10]:
# Calculate cosine similarity between the search query and existing queries

from sklearn.metrics.pairwise import cosine_similarity

def get_similarity(target,candidates):
  """Rank candidate vectors by cosine similarity to a target vector.

  Args:
    target: a single embedding (a flat sequence of numbers, or an array that
      already has one row).
    candidates: sequence of embeddings, one per candidate.

  Returns:
    A zip iterator of (candidate_index, similarity) pairs ordered from most
    to least similar. Being an iterator, it can be consumed only once.
  """
  # Turn list into array
  candidates = np.array(candidates)
  # cosine_similarity expects 2-D inputs, so the target becomes a single row
  target = np.array(target).reshape(1, -1)

  # Calculate cosine similarity. The result has one row (for the target) and
  # one column per candidate; taking row 0 always yields a list, whereas
  # squeezing would collapse a single-candidate result to a bare float.
  sim = cosine_similarity(target,candidates)[0].tolist()
  sort_index = np.argsort(sim)[::-1]
  sort_score = [sim[i] for i in sort_index]

  # Return similarity scores
  return zip(sort_index,sort_score)


In [11]:
# The new search query
new_query = "show business fares"

# Embed it as a search query; get_embeddings works on a list, so the query is
# wrapped in one and its single embedding is taken back out
new_query_batch = get_embeddings([new_query], input_type="search_query")
new_query_embeds = new_query_batch[0]

In [12]:
# Score the search query against the first `sample` existing queries
similarity = get_similarity(new_query_embeds, embeds[:sample])

# Show the search query, then every sampled query from most to least similar
print('Query:')
print(new_query,'\n')

print('Similar queries:')
for idx, sim in similarity:
  matched_query = df['query'].iloc[idx]
  print(f'Similarity: {sim:.2f};', matched_query)

Query:
show business fares 

Similar queries:
Similarity: 0.28;  show me round trip first class tickets from new york to miami
Similarity: 0.24;  show me boston ground transportation
Similarity: 0.23;  i'd like the lowest fare from denver to pittsburgh
Similarity: 0.21;  which airlines fly from boston to washington dc via other cities
Similarity: 0.21;  i would like your rates between atlanta and boston on september third
Similarity: 0.20;  what ground transportation is available in boston
Similarity: 0.20;  of all airlines which airline has the most arrivals in atlanta
Similarity: 0.20;  show me the airlines that fly between toronto and denver
Similarity: 0.19;  show me a list of ground transportation at boston airport


In [13]:
# Create new dataframe and append new query.
# The new row is built by column name rather than by position, so this cell
# keeps working even if df has gained extra columns by the time it is re-run
# (any such column is simply left empty for the new row).
new_row = pd.DataFrame({'query': [new_query], 'query_embeds': [new_query_embeds]})
df_sem = pd.concat([df, new_row], ignore_index=True)

# Reduce embeddings dimension to 2
embeds_sem = np.array(df_sem['query_embeds'].tolist())
embeds_sem_pc2 = get_pc(embeds_sem,2)

# Add the principal components to dataframe
df_sem_pc2 = pd.concat([df_sem, pd.DataFrame(embeds_sem_pc2)], axis=1)

In [14]:
# Legend column: every row is an existing query except the last one,
# which is the newly appended search query
source_labels = ['Existing'] * (len(df_sem_pc2) - 1) + ['New']
df_sem_pc2['Source'] = source_labels

# Column labels become strings so the components can be referenced as '0' and '1'
df_sem_pc2 = df_sem_pc2.rename(columns=str)

# Plot the first `sample` rows together with the new query (the last row)
selection = [*range(sample), -1]
generate_chart(df_sem_pc2.iloc[selection], '0', '1', color='Source', title='Semantic Search')

In [17]:
from sklearn.cluster import KMeans

# Embed the text for clustering.
# NOTE: this rebinds the notebook-level `embeds`, which earlier cells filled
# with the "search_document" embeddings.
df['clustering_embeds'] = get_embeddings(df['query'].tolist(), input_type="clustering")
embeds = np.array(df['clustering_embeds'].tolist())

# Pick the number of clusters
df_clust = df_pc2.copy()
n_clusters = 3

# Cluster the embeddings.
# n_init is set explicitly so the number of initialisations does not depend on
# the installed scikit-learn's default.
kmeans_model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
classes = kmeans_model.fit_predict(embeds).tolist()
# Cluster ids are stored as strings so they can be used as the colour column
df_clust['cluster'] = list(map(str, classes))

# Plot on a chart; the title is derived from n_clusters so the two cannot disagree
df_clust.columns = df_clust.columns.astype(str)
generate_chart(df_clust.iloc[:sample],'0','1',lbl='on',color='cluster',title=f'Clustering with {n_clusters} Clusters')

  super()._check_params_vs_input(X, default_n_init=10)


In [18]:
# Re-attach the saved 'intent' labels to a copy of the 2D-embedding frame
# so a classifier can be built
df_class = df_pc2.assign(intent=intents)

# The first `sample` rows become the test set and the remaining rows the
# training set; each slice gets a fresh index and loses the 'query_embeds' column
df_test = (
    df_class.iloc[:sample]
    .reset_index(drop=True)
    .drop(columns='query_embeds')
)
df_train = (
    df_class.iloc[sample:]
    .reset_index(drop=True)
    .drop(columns='query_embeds')
)

In [19]:
# Embed the training queries for classification (input_type="classification")
df_train['classification_embeds'] = get_embeddings(df_train['query'].tolist(), input_type="classification")

In [20]:
# Train the classifier with Support Vector Machine (SVM) algorithm

# import SVM classifier code
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Training features (one embedding per query) and their intent labels
features = df_train['classification_embeds'].to_list()
label = df_train['intent']

# Pipeline of a standard scaler followed by a support vector classifier
svm_classifier = make_pipeline(StandardScaler(), SVC())

# Fit the pipeline on the training data
svm_classifier.fit(features, label)

In [22]:
# Predict with test data

# Embed the test queries the same way as the training queries
test_queries = df_test['query'].tolist()
df_test['classification_embeds'] = get_embeddings(test_queries, input_type="classification")
inputs = df_test['classification_embeds'].tolist()

# Store the predicted label of every test query
df_test['intent_pred'] = svm_classifier.predict(inputs)

# Score the classifier against the true labels and report it as a percentage
score = svm_classifier.score(inputs, df_test['intent'])
print(f"Prediction accuracy is {100*score}%")

Prediction accuracy is 100.0%


In [23]:
# Plot the predicted classes on the 2D components
# (column labels are turned into strings first so '0' and '1' can be referenced)
df_test = df_test.rename(columns=str)
generate_chart(df_test, xcol='0', ycol='1', lbl='off', color='intent_pred', title='Classification - Prediction')

In [24]:
# Plot the actual classes on the same 2D components, for comparison with the predictions
generate_chart(df_test, xcol='0', ycol='1', lbl='off', color='intent', title='Classification - Actual')