In [None]:
# Semantic Search over Medium Articles by Title & Subtitle

In [None]:
### Load Data

In [None]:
# Show which Python interpreter this kernel is running on

import sys

python_path = sys.executable
print(python_path)

In [None]:
# List the packages installed for this kernel's interpreter.
# pip is invoked as `{sys.executable} -m pip`, i.e. through the interpreter path printed above.

!{sys.executable} -m pip list

In [None]:
# Reinstall Pandas

#!{sys.executable} -m pip install pandas

In [None]:
import pandas as pd

In [None]:
# Load only the first 10,000 rows of the CSV.
# NOTE(review): the original note read "exercise whole data set" - presumably a
# reminder to drop `nrows` and try the full file; confirm.
df = pd.read_csv(
    "medium_post_titles.csv",
    nrows=10000,
)

In [None]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

In [None]:
# Count how often each value of the truncation flag occurs
truncated_flags = df["subtitle_truncated_flag"]
truncated_flags.value_counts()

### Data Cleanup

In [None]:
# Clean the frame and build the text that will be embedded:
#   1. drop rows with any missing value,
#   2. keep only rows whose subtitle was NOT truncated,
#   3. combine title and subtitle into a single string column.
df = df.dropna()

# .copy() makes the filtered result an independent frame, so the column
# assignments that follow never write into a slice of another frame.
df = df[~df["subtitle_truncated_flag"]].copy()

# Join with a space: a bare `+` glues the last word of the title onto the
# first word of the subtitle (e.g. "...SearchA guide..."), which produces a
# nonsense token in the text handed to the embedding model.
df['title_extended'] = df['title'] + ' ' + df['subtitle']

In [None]:
# df.head()
# df['category'].nunique() # metadata
# df.shape # 6k+ vectors

### Prep for Upsert

In [None]:
# init pinecone

# API_KEY

#from pinecone import Pinecone
import os
from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone
# The API key is read from the PINECONE_API_KEY environment variable so the
# secret is never stored in the notebook. Fail early with a clear message when
# it is missing instead of failing later on the first API call.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this cell."
    )
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
# Create the 'medium-data' serverless index the first time through;
# do nothing when an index with that name is already listed.
existing_index_names = pc.list_indexes().names()
if 'medium-data' not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name='medium-data',
        dimension=384,
        metric='cosine',
        spec=serverless_spec,
    )

In [None]:
# Re-check the interpreter path before installing / importing the embedding library
import sys

python_path = sys.executable
print(python_path)

In [None]:
# !{sys.executable} -m pip install sentence-transformers

In [None]:
# !pip list

In [None]:
#import sys
#print(sys.path)

In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
# Load the sentence-embedding model on the CPU
model = SentenceTransformer(
    model_name_or_path='all-MiniLM-L6-v2',
    device='cpu',
)

In [None]:
# Peek at the first two cleaned rows
df.head(n=2)

In [None]:
# Embed every title+subtitle string.
# The whole list goes to model.encode in a single call so the encoder can
# batch the texts internally, instead of being invoked once per row.
# The result has one row of floats per input text; .tolist() turns it into
# plain Python lists, which is the form stored in the 'values' column.
title_embeddings = model.encode(df['title_extended'].tolist())
df['values'] = pd.Series(title_embeddings.tolist(), index=df.index)

In [None]:
# Check that the 'values' column now holds the embedding lists
df.head(n=5)

In [None]:
# Give each remaining row a consecutive integer id (0, 1, 2, ...) by position.
# This replaces `reset_index(drop='index').index`, which passed a string to the
# boolean `drop` parameter and only worked because the string is truthy.
df['id'] = range(len(df))

In [None]:
# Build one metadata dict per row from the title, subtitle and category columns
metadata_records = [
    {'title': title, 'subtitle': subtitle, 'category': category}
    for title, subtitle, category in zip(df['title'], df['subtitle'], df['category'])
]
df['metadata'] = pd.Series(metadata_records, index=df.index)

In [None]:
# Peek at two rows to confirm the 'id' and 'metadata' columns
df.head(n=2)

In [None]:
# Keep only the three columns that are upserted.
# .copy() makes df_upsert an independent frame: it is modified in a later cell,
# and writing to a bare column subset of `df` triggers SettingWithCopyWarning.
df_upsert = df[['id', 'values', 'metadata']].copy()

In [None]:
# Convert the ids to strings.
# .assign returns a new frame rather than writing into df_upsert in place, so
# this cell raises no SettingWithCopyWarning and is safe to re-run.
df_upsert = df_upsert.assign(id=df_upsert['id'].astype(str))

In [None]:
# Get a handle to the 'medium-data' index
index = pc.Index(name='medium-data')

In [None]:
# Upsert the id / values / metadata frame into the index
index.upsert_from_dataframe(df=df_upsert)

### Query

In [None]:
# Embed the search text with the same model and convert it to a plain Python list
query_text = "which city is most beautiful?"
query_embedding = model.encode(query_text)
query_vector = query_embedding.tolist()

In [None]:
# Stop early if the query embedding is not 384 long, the dimension the index
# was created with.
expected_dimension = 384
if len(query_vector) != expected_dimension:
    raise ValueError("The dimension of the query vector does not match the expected dimension of 384.")

# Fetch the ten nearest matches, returning both metadata and vector values
query_options = {
    'vector': query_vector,
    'top_k': 10,
    'include_metadata': True,
    'include_values': True,
}
results = index.query(**query_options)

In [None]:
# Print "<score rounded to 2 decimals>: <title>" for every match
for match in results['matches']:
    score = round(match['score'], 2)
    title = match['metadata']['title']
    print(f"{score}: {title}")

In [None]:
# Print "<score rounded to 2 decimals>: <subtitle>" for every match
for match in results['matches']:
    score = round(match['score'], 2)
    subtitle = match['metadata']['subtitle']
    print(f"{score}: {subtitle}")

In [None]:
# Print the score and title of every match once more
# (this cell repeats the earlier title listing)
for match in results['matches']:
    score = round(match['score'], 2)
    title = match['metadata']['title']
    print(f"{score}: {title}")