In [2]:
import os
import openai

import pandas as pd
import itertools
import pinecone
from sentence_transformers import SentenceTransformer

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

api_key  = os.getenv('PINECONE_API_KEY')


- Run a mini dataset.  Reduced from 27K to 500 rows.
- The original code in the blog caused a deadlock.

In [13]:
pinecone.init(api_key=api_key, environment='us-west4-gcp-free')

index_name = "question-answering"

index = pinecone.Index(index_name=index_name)

DATA_FILE = f"/Users/ytchen/Documents/projects/research/data/comcast_mini.csv"

pd.set_option("display.max_colwidth", 500)

df = pd.read_csv(
    f"{DATA_FILE}",  usecols=["Ticket", "CustomerComplaint"], index_col=False
)
df = df.sample(frac=1).reset_index(drop=True)
df.drop_duplicates(inplace=True)
df['Ticket'] = df['Ticket'].apply(str)
print(df.describe())


        Ticket CustomerComplaint
count       48                48
unique      48                46
top     230876           Comcast
freq         1                 3


In [20]:
model = SentenceTransformer("average_word_embeddings_glove.6B.300d")

df["question_vector"] = df.CustomerComplaint.apply(lambda x: model.encode(str(x)).tolist())
print(df.describe())
print(df.head())


        Ticket CustomerComplaint  \
count       48                48   
unique      48                46   
top     230876           Comcast   
freq         1                 3   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            question_vector  
count                                                                                                                                                                                                                                                                                                                

In [21]:
def chunks(iterable, batch_size=100):
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

for batch in chunks(zip(df.Ticket, df.question_vector)):
    index.upsert(vectors=batch)
    

In [29]:
# the user queries to check against
#     "Wifi Not working",
#    "Speed too slow",
query_questions = [
     "Wifi Not working",
]

# embeddings
query_vectors = [model.encode(str(question)).tolist() for question in query_questions]
#print(query_vectors)


In [30]:
# the result with the most similar indexed questions
#query_results = index.query(queries=query_vectors, top_k=5)
query_results = index.query(queries=query_vectors, top_k=3)
print(query_results)


{'results': [{'matches': [{'id': '360759', 'score': 0.895923793, 'values': []},
                          {'id': '350675', 'score': 0.88625443, 'values': []},
                          {'id': '347054', 'score': 0.658680201, 'values': []}],
              'namespace': ''}]}


In [31]:
#print(df.head())


In [32]:
#                 df[df.Ticket == _id].CustomerComplaint.values[0] for _id in ids
for question, res in zip(query_questions, query_results.results):
    print("\n\n\n Original question : " + str(question))
    print("\n Most similar questions based on pinecone vector search: \n")

    ids = [match.id for match in res.matches]
    print(ids)
    
    scores = [match.score for match in res.matches]
    print(scores)
    
    df_result = pd.DataFrame(
        {
            "Ticket#": ids,
            "Question": [
                df[df.Ticket == _id].CustomerComplaint for _id in ids
            ],
            "Score": scores,
        }
    )
    print(df_result)   
    




 Original question : Wifi Not working

 Most similar questions based on pinecone vector search: 

['360759', '350675', '347054']
[0.895923793, 0.88625443, 0.658680201]
  Ticket#  \
0  360759   
1  350675   
2  347054   

                                                                         Question  \
0  47    Wifi internet not working as well
Name: CustomerComplaint, dtype: object   
1                              Series([], Name: CustomerComplaint, dtype: object)   
2                              Series([], Name: CustomerComplaint, dtype: object)   

      Score  
0  0.895924  
1  0.886254  
2  0.658680  
