In [1]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources needed by the tokenizer and the stop word list below
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Small toy corpus that acts as the search index
documents = [
    "I love eating pizza",
    "The movie was great",
    "I enjoy playing soccer",
    "Pizza is my favorite food",
    "Soccer is an interesting sport"
]

# Preprocessing: lowercase + tokenize every document, then drop English stop words
stop_words = set(stopwords.words('english'))
tokenized_documents = list(map(nltk.word_tokenize, map(str.lower, documents)))
filtered_documents = [
    [token for token in tokens if token not in stop_words]
    for tokens in tokenized_documents
]

# Fit TF-IDF on the cleaned documents (tokens re-joined into space-separated strings)
vectorizer = TfidfVectorizer()
cleaned_texts = [' '.join(tokens) for tokens in filtered_documents]
tfidf_matrix = vectorizer.fit_transform(cleaned_texts)

# User query
query = "what sport do you like"

# Apply the same preprocessing to the query as to the documents
tokenized_query = nltk.word_tokenize(query.lower())
filtered_query = [token for token in tokenized_query if token not in stop_words]

# Project the query into the TF-IDF space learned from the documents
cleaned_query = ' '.join(filtered_query)
query_vector = vectorizer.transform([cleaned_query])

# Cosine similarity of the query against every document, flattened to 1-D
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

# Pair each document with its score, then order by descending score.
# list.sort is stable, so ties keep their original document order.
results = list(zip(documents, cosine_similarities))
results.sort(key=lambda pair: pair[1], reverse=True)

# Print the search results
for result in results:
    print(f"Document: {result[0]}\nSimilarity Score: {result[1]}\n")


Document: Soccer is an interesting sport
Similarity Score: 0.6141889663426563

Document: I love eating pizza
Similarity Score: 0.0

Document: The movie was great
Similarity Score: 0.0

Document: I enjoy playing soccer
Similarity Score: 0.0

Document: Pizza is my favorite food
Similarity Score: 0.0



[nltk_data] Downloading package punkt to /Users/coulson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/coulson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the Universal Sentence Encoder (v4) from TF Hub
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Sample conversation records
conversation_records = [
    "Sure, what specific product are you looking for?",
    "I'm looking for a smartphone with a good camera.",
    "We have several options available. Let me provide you with some recommendations."
]

# User query
query = "I want to buy a new phone with a great camera."

# Preprocess the query and conversation records.
# The encoder takes raw sentences, so the only "preprocessing" is wrapping the
# query in a list so it is passed as a batch of one.
preprocessed_query = [query]
preprocessed_records = conversation_records

# Split records into smaller batches
batch_size = 2
record_batches = [preprocessed_records[i:i+batch_size] for i in range(0, len(preprocessed_records), batch_size)]

# Encode the query and conversation records batch-wise
query_vector = use_model(preprocessed_query)
record_vector_batches = []

for batch in record_batches:
    batch_vectors = use_model(batch)
    # append, not extend: extend iterates the batch tensor and adds one 1-D row
    # per sentence, and tf.concat(axis=0) would then join those rows end-to-end
    # into a single flat vector instead of a matrix with one row per record.
    record_vector_batches.append(batch_vectors)

# Stack the per-batch 2-D tensors row-wise into one (n_records, dim) tensor
record_vectors = tf.concat(record_vector_batches, axis=0)

# The ranking below indexes conversation_records by row, so rows must line up 1:1
assert record_vectors.shape[0] == len(conversation_records), "expected one embedding row per record"

# Compute cosine similarity between the query vector and record vectors.
# Hand scikit-learn plain NumPy arrays rather than relying on implicit tensor conversion.
cosine_similarities = cosine_similarity(query_vector.numpy(), record_vectors.numpy()).flatten()

# Sort records based on similarity scores (highest first)
results = [(conversation_records[i], score) for i, score in enumerate(cosine_similarities)]
results = sorted(results, key=lambda x: x[1], reverse=True)

# Print the search results
for result in results:
    print(f"Conversation: {result[0]}\nSimilarity Score: {result[1]}\n")


: 

: 