# Semantic Search on Medium Post Titles

In [17]:
import pandas as pd

# Load only the first 10,000 rows of the Medium post titles dataset.
csv_path = 'medium_post_titles.csv'
row_limit = 10000
df = pd.read_csv(csv_path, nrows=row_limit)

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False
3,equality,"""Call me Nat Love"" :Black Cowboys and the Fron...",,False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False
6,politics,"""Diss"" vs. ""Piss"": The Blue Wave and Yellow Tr...",Michael Gofman & Matthew Wigler explore how bu...,False
7,health,"""Doctor, he's gone into shock!""",You've seen it in movies and on television. B...,False
8,culture,"""Happily Ever After: Fairy Tales for Every Chi...",Television shows have an invaluable opportunit...,False
9,poetry,"""I Love You"" The Dangerous Toxic Truth","The Big, Smelly Heap of Lies Pretending To Be ...",False


### Cleaning and Pre-Processing Data

In [18]:
# Count how many subtitles are flagged as truncated versus complete.
truncated_flags = df["subtitle_truncated_flag"]
truncated_flags.value_counts()

subtitle_truncated_flag
False    6318
True     3682
Name: count, dtype: int64

In [19]:
# Number of missing (NaN) entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

category                     0
title                        0
subtitle                   107
subtitle_truncated_flag      0
dtype: int64

In [20]:
# Discard every row that has a missing value in any column, then re-count the gaps.
df = df.dropna(axis="index", how="any")
df.isna().sum()

category                   0
title                      0
subtitle                   0
subtitle_truncated_flag    0
dtype: int64

In [21]:
# Keep only the rows whose subtitle was not truncated.
not_truncated = ~df["subtitle_truncated_flag"]
df = df[not_truncated]

# Only the False flag should remain.
df["subtitle_truncated_flag"].value_counts()

subtitle_truncated_flag
False    6211
Name: count, dtype: int64

In [22]:
# Combine title and subtitle into the single text that will be embedded.
df["title_extended"] = df["title"] + " " + df["subtitle"]

# Preview the first remaining row by position: filtering leaves gaps in the index
# labels, so a label lookup such as [0] only works while the row labelled 0 survives.
df["title_extended"].iloc[0]

'"21 Conversations" - A fun (and easy) game for teams to get to know each other A (new?) Icebreaker game to get your team to say all the interesting stuff'

In [23]:
# Number of distinct categories among the remaining posts.
categories = df["category"]
categories.nunique()

93

In [24]:
# (rows, columns) of the cleaned frame.
n_rows, n_cols = df.shape
(n_rows, n_cols)

(6211, 5)

Each of the 6211 remaining rows will be embedded as one vector, so 6211 vectors will be upserted into the index.

### Prepare for Upsert

In [25]:
from pinecone import Pinecone, ServerlessSpec
import os

# The API key is read from the PINECONE_API_KEY environment variable.
api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)

index_name = "medium-posts"

# Create the serverless index only when it does not exist yet; otherwise reuse it.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # NOTE(review): presumably 384 is the embedding size of the model used
        # to encode the text later in this notebook — confirm they match.
        dimension=384,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used for the upsert and query cells.
index = pc.Index(index_name)

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# Encode every text in a single batched call rather than one model.encode() call
# per row; encode() accepts a list of sentences and returns one vector per sentence.
embeddings = model.encode(df["title_extended"].tolist())

# Store each vector as a plain Python list, aligned to the existing row labels.
df["values"] = pd.Series(embeddings.tolist(), index=df.index)

df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.030211886391043663, -0.012802648358047009,..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars Hill Author and UW lec...","[-0.0233279038220644, -0.010735482908785343, -..."
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False,"""CISGENDER?! Is That A Disease?!"" Or, a primer...","[0.037407323718070984, -0.000856809550896287, ..."
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?"" How w...","[-0.013686479069292545, 0.004296108614653349, ..."
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False,"""Cypherpunks and Wall Street"": The Security To...","[-0.029730871319770813, 0.0013729340862482786,..."


In [27]:
# Give every row a unique sequential string id ("0", "1", ...), assigned by position
# rather than taken from the gappy index labels left over from filtering.
# Built directly from the row count, so no throwaway copy of the frame is needed.
df["id"] = [str(position) for position in range(len(df))]
df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values,id
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.030211886391043663, -0.012802648358047009,...",0
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars Hill Author and UW lec...","[-0.0233279038220644, -0.010735482908785343, -...",1
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False,"""CISGENDER?! Is That A Disease?!"" Or, a primer...","[0.037407323718070984, -0.000856809550896287, ...",2
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?"" How w...","[-0.013686479069292545, 0.004296108614653349, ...",3
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False,"""Cypherpunks and Wall Street"": The Security To...","[-0.029730871319770813, 0.0013729340862482786,...",4


In [28]:
# Attach a metadata dict (title, subtitle, category) to every row.
df["metadata"] = [
    {"title": title, "subtitle": subtitle, "category": category}
    for title, subtitle, category in zip(df["title"], df["subtitle"], df["category"])
]

df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values,id,metadata
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.030211886391043663, -0.012802648358047009,...",0,"{'title': '""21 Conversations"" - A fun (and eas..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars Hill Author and UW lec...","[-0.0233279038220644, -0.010735482908785343, -...",1,"{'title': '""Biblical Porn"" at Mars Hill', 'sub..."
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False,"""CISGENDER?! Is That A Disease?!"" Or, a primer...","[0.037407323718070984, -0.000856809550896287, ...",2,"{'title': '""CISGENDER?! Is That A Disease?!""',..."
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False,"""Can I Train my Model on Your Computer?"" How w...","[-0.013686479069292545, 0.004296108614653349, ...",3,"{'title': '""Can I Train my Model on Your Compu..."
5,cryptocurrency,"""Cypherpunks and Wall Street"": The Security To...",Bruce Fenton presents at the World Blockchain ...,False,"""Cypherpunks and Wall Street"": The Security To...","[-0.029730871319770813, 0.0013729340862482786,...",4,"{'title': '""Cypherpunks and Wall Street"": The ..."


### Upsert Data

In [31]:
# Only the id, vector and metadata columns are sent to the index.
upsert_columns = ["id", "values", "metadata"]
df_upsert = df[upsert_columns]

index.upsert_from_dataframe(df_upsert)

sending upsert requests:   0%|          | 0/6211 [00:00<?, ?it/s]

{'upserted_count': 6211}

### Query Data

In [47]:
# Embed the question with the same model that produced the stored vectors.
query_text = "What is the best way to learn Machine Learning?"
query_vector = model.encode(query_text).tolist()

# Fetch the ten nearest stored vectors together with their metadata.
results = index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
)

# Fixed four-decimal formatting keeps the score column aligned
# (round() drops trailing zeros, printing 0.574 instead of 0.5740).
for match in results["matches"]:
    print(f"{match['score']:.4f}: {match['metadata']['title']}")


0.7476: A Beginner’s Guide to Machine Learning
0.6335: A Beginner’s Guide to Machine Learning
0.6191: A Simple Guide to the Basics of A.I.
0.6111: 10 Machine Learning Methods that Every Data Scientist Should Know
0.5789: Algorithms to make you more effective
0.5765: 4 easy steps to improve your machine learning code performance
0.574: A practical guide to collecting ML datasets
0.5652: 4 Machine Learning Techniques with Python
0.5605: 7 Machine Learning lessons that stuck with me this year
0.5452: A quick introduction to derivatives for machine learning people
