# Lesson 3 - Basic Recommendation Systems #

In [None]:
!pip install -qU "pinecone-client[grpc]" openai tqdm langchain pandas


In [None]:
from collections import Counter
from google.colab import userdata
from openai import OpenAI

import pandas as pd
import pinecone
import time
from tqdm import tqdm


### Download a sample article dataset. ###
* The [dataset](https://components.one/datasets/all-the-news-2-news-articles-dataset/) used throughout this example contains 2.7 million news articles and essays from 27 American publications.
* The link to the data is [here](https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip)

In [None]:
!wget https://www.dropbox.com/s/cn2utnr5ipathhh/all-the-news-2-1.zip -q --show-progress


### Let's prepare the data ###
* unzip
* examine the shape/format of the data

In [None]:
!unzip all-the-news-2-1.zip

* Let's look at the headers

In [None]:
# Read just the first line of the CSV file and print it.
with open('all-the-news-2-1.csv', 'r') as csv_file:
    header = csv_file.readline()
print(header)

* Actually, let's look at the data itself using a DataFrame

In [None]:
# Load only the first 99 rows for a quick preview, then display the top of the frame.
df = pd.read_csv(
    'all-the-news-2-1.csv',
    nrows=99,
)
df.head()

### Prepare Pinecone ###
* Get our API keys
* Prepare an index
* Connect to Pinecone
* Note: to keep things clean across subsequent runs, we delete and recreate the index

In [None]:
# get api key from app.pinecone.io
# Both keys are read from Colab's secret store rather than hard-coded here.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

openai_client = OpenAI(api_key=OPENAI_API_KEY)

pinecone.init(
    api_key=PINECONE_API_KEY
)

index_name = 'lesson3'

# Start from a clean index on every run. Delete only when the index already
# exists, so the very first run (no index yet) does not fail on the delete.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(name=index_name)
# dimension must equal the length of the embedding vectors that get upserted.
pinecone.create_index(name=index_name, dimension=1536)
time.sleep(1)

index = pinecone.GRPCIndex(index_name)

index

In [None]:
def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for ``articles`` through the notebook's OpenAI client.

    Args:
        articles: Value forwarded unchanged as the ``input`` of the embeddings
            request (the notebook passes lists of strings).
        model: Name of the embedding model to use.

    Returns:
        The response object from ``openai_client.embeddings.create``; callers in
        this notebook read the vectors from ``response.data[i].embedding``.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response


### Prepare and insert data ###
* Read as a dataframe, CHUNK_SIZE rows at a time
* Extract the article title, author and article itself
* Build embeddings from the titles only
* Insert into Pinecone

In [None]:
# Stream the CSV in CHUNK_SIZE-row pieces, embed each row's title and upsert the
# vectors into `index`, stopping after TOTAL_ROWS rows of the file.
CHUNK_SIZE=500
TOTAL_ROWS=10000
chunks = pd.read_csv('all-the-news-2-1.csv', chunksize=CHUNK_SIZE, nrows=TOTAL_ROWS)
chunk_num = 0
# The context manager closes the bar even if an API call raises part-way through.
with tqdm(total=TOTAL_ROWS) as progress_bar:
    for chunk in chunks:
        # Remember how many rows were read before filtering, so the bar advances
        # by rows consumed from the file and can actually reach TOTAL_ROWS.
        rows_read = len(chunk)
        chunk = chunk.dropna()
        titles = chunk['title'].tolist()
        # A chunk can be empty once rows with missing values are dropped;
        # skip the embedding request and the upsert in that case.
        if titles:
            embeddings = get_embeddings(titles)
            # Ids are offset by the chunk number so they stay unique across chunks.
            prepped = [{'id':str(chunk_num*CHUNK_SIZE+i), 'values':embeddings.data[i].embedding,
                        'metadata':{'title':titles[i]},} for i in range(len(titles))]
            index.upsert(prepped)
        chunk_num = chunk_num + 1
        progress_bar.update(rows_read)

print('DONE')


In [None]:
# NOTE(review): this repeats, line for line, the get_embeddings definition from the
# earlier cell; re-running it simply rebinds the same name to the same behavior.
def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for ``articles`` through the notebook's OpenAI client.

    Args:
        articles: Value forwarded unchanged as the ``input`` of the embeddings
            request (the notebook passes lists of strings).
        model: Name of the embedding model to use.

    Returns:
        The response object from ``openai_client.embeddings.create``; callers in
        this notebook read the vectors from ``response.data[i].embedding``.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response


### Fetch results from Pinecone ###
* Get the embedding for *search_term*
* Query Pinecone, then return and format the results


In [None]:
def get_recommendations(pinecone_index, search_term, top_k=10):
    """Embed ``search_term`` and query ``pinecone_index`` with the resulting vector.

    Args:
        pinecone_index: Index object whose ``query`` method is called.
        search_term: Text to embed via ``get_embeddings``.
        top_k: Number of matches requested from the index.

    Returns:
        Whatever ``pinecone_index.query`` returns, requested with metadata included.
    """
    query_vector = get_embeddings([search_term]).data[0].embedding
    return pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

# Query the title index for 'tennis' and print one "<score> : <title>" line per match.
reco = get_recommendations(index, 'tennis')
for match in reco.matches:
    score, title = match.score, match.metadata["title"]
    print(f'{score} : {title}')


### Can we do better? ###
* We were embedding article titles before
* Let's try embedding the article itself

In [None]:
articles_index_name = 'lesson3articles'
# Recreate the index from scratch on each run. Delete only when it already
# exists, so a first run (no such index yet) does not fail on the delete.
if articles_index_name in pinecone.list_indexes():
    pinecone.delete_index(name=articles_index_name)
# dimension must equal the length of the embedding vectors that get upserted.
pinecone.create_index(name=articles_index_name, dimension=1536)
time.sleep(1)
articles_index = pinecone.GRPCIndex(articles_index_name)
articles_index

### Chunk up the articles and generate embeddings ###
* Use Langchain RecursiveCharacterTextSplitter to chunk
* Read the file into chunks of CHUNK_SIZE (500) rows each
* For each chunk, get the articles out
* Then for each article, generate one or more embeddings per article (depending on length)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import trange

# Stream the CSV in CHUNK_SIZE-row pieces, split every article body into short
# overlapping text pieces, embed each piece and upsert the vectors into
# `articles_index`, tagging every vector with the title of its article.
CHUNK_SIZE=500
chunks = pd.read_csv('all-the-news-2-1.csv', chunksize=CHUNK_SIZE, nrows=9999)
pd.options.display.max_rows = 999
# Running counter used as the vector id, so ids stay unique across articles and chunks.
embed_num = 0
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
for chunk in chunks:
    chunk = chunk.dropna()
    articles = chunk['article'].tolist()
    titles = chunk['title'].tolist()
    # One progress bar per CSV chunk, advancing once per article.
    for art_idx in trange(len(articles)):
        texts = text_splitter.split_text(articles[art_idx])
        # An article that yields no text pieces has nothing to embed; skip it
        # instead of sending an empty embeddings request and an empty upsert.
        if not texts:
            continue
        embeddings = get_embeddings(texts)
        prepped = []
        for embedding in embeddings.data:
            prepped.append({'id':str(embed_num), 'values':embedding.embedding, 'metadata':{'title':titles[art_idx]}})
            embed_num += 1
        articles_index.upsert(prepped)

print('DONE')

In [None]:

# Each article was stored as several vectors that share one title, so the same
# title can appear many times among the matches. Print every title only once,
# at its first occurrence in the result list.
reco = get_recommendations(articles_index, 'obama', top_k=100)
seen = set()
for r in reco.matches:
    title = r.metadata['title']
    if title in seen:
        continue
    seen.add(title)
    print(f'{r.score} : {title}')