# Lesson 3 - Recommender Systems

### Import the Needed Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange
from DLAIUtils import Utils

import pandas as pd
import time
import os

In [3]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
OPENAI_API_KEY = utils.get_openai_api_key()

### Load the Dataset

**Note:** To access the dataset outside of this course, just copy the following two lines of code and run it (remember to uncomment them first before executing):

!wget -q --show-progress -O all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"

!unzip all-the-news-3.zip

In [4]:
#with open('./data/all-the-news-3.csv', 'r') as f:
with open('./all-the-news-3.csv', 'r') as f:
    header = f.readline()
    print(header)

date,year,month,day,author,title,article,url,section,publication



In [5]:
#df = pd.read_csv('./data/all-the-news-3.csv', nrows=99)
df = pd.read_csv('./all-the-news-3.csv', nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


### Setup Pinecone

In [6]:
openai_client = OpenAI(api_key=OPENAI_API_KEY)
util = Utils()
INDEX_NAME = utils.create_dlai_index_name('dl-ai')
pinecone = Pinecone(api_key=PINECONE_API_KEY)

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-west-2'))

index = pinecone.Index(INDEX_NAME)

### 1.  Create Embeddings of the News Titles

In [7]:
def get_embeddings(articles, model="text-embedding-ada-002"):
   return openai_client.embeddings.create(input = articles, model=model)

In [8]:
CHUNK_SIZE=400
TOTAL_ROWS=10000
progress_bar = tqdm(total=TOTAL_ROWS)
#chunks = pd.read_csv('./data/all-the-news-3.csv', chunksize=CHUNK_SIZE, 
chunks = pd.read_csv('./all-the-news-3.csv', chunksize=CHUNK_SIZE, 
                     nrows=TOTAL_ROWS)
chunk_num = 0
for chunk in chunks:
    titles = chunk['title'].tolist()
    embeddings = get_embeddings(titles)
    prepped = [{'id':str(chunk_num*CHUNK_SIZE+i), 'values':embeddings.data[i].embedding,
                'metadata':{'title':titles[i]},} for i in range(0,len(titles))]
    chunk_num = chunk_num + 1
    if len(prepped) >= 200:
      index.upsert(prepped)
      prepped = []
    progress_bar.update(len(chunk))

  0%|          | 0/10000 [00:00<?, ?it/s]

In [9]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

### Build the Recommender System

In [10]:
def get_recommendations(pinecone_index, search_term, top_k=10):
  embed = get_embeddings([search_term]).data[0].embedding
  res = pinecone_index.query(vector=embed, top_k=top_k, include_metadata=True)
  return res

In [11]:
reco = get_recommendations(index, 'obama')
for r in reco.matches:
    print(f'{r.score} : {r.metadata["title"]}')

0.850471437 : Barack Obama just stepped off the sidelines to defend Obamacare
0.849117041 : President Obama has a new plan to fight the opioid epidemic
0.848726213 : “Our democracy is at stake”: Obama delivers his first post-presidency campaign speech
0.848379135 : Obama: if you were fine with big government until it served black people, rethink your biases
0.846334 : President Obama: Michelle & I Are Gonna Be Renters
0.844644308 : Obama meets with national security team on Syria, Islamic State
0.843859673 : Vox Sentences: Obama got a warmer welcome in Hiroshima than the Japanese prime minister
0.843687534 : Barack Obama in talks to create shows for Netflix: New York Times
0.843388855 : Watch President Obama dance the tango in Argentina
0.841180623 : Clinton, Obama pledge unity behind Trump presidency


### 2.  Create Embeddings of All News Content

In [12]:
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(name=INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-west-2'))
articles_index = pinecone.Index(INDEX_NAME)

In [13]:
def embed(embeddings, title, prepped, embed_num):
  for embedding in embeddings.data:
    prepped.append({'id':str(embed_num), 'values':embedding.embedding, 'metadata':{'title':title}})
    embed_num += 1
    if len(prepped) >= 100:
        articles_index.upsert(prepped)
        prepped.clear()
  return embed_num

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>news_data_rows_num = 100</code>):</b> In this lab, we've initially set <code>news_data_rows_num</code> to 100 for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, consider increasing this value to 200, 400, 700, and 1000. You'll likely notice better and more relevant results.</p>

In [15]:
news_data_rows_num = 100

embed_num = 0 #keep track of embedding number for 'id'
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, 
    chunk_overlap=20) # how to chunk each article
prepped = []
#df = pd.read_csv('./data/all-the-news-3.csv', nrows=news_data_rows_num)
df = pd.read_csv('./all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

for i in range(0, len(articles_list)):
    print(".",end="")
    art = articles_list[i]
    title = titles_list[i]
    if art is not None and isinstance(art, str):
      texts = text_splitter.split_text(art)
      embeddings = get_embeddings(texts)
      embed_num = embed(embeddings, title, prepped, embed_num)

....................................................................................................

In [16]:
articles_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

### Build the Recommender System

In [17]:
reco = get_recommendations(articles_index, 'obama', top_k=100)
seen = {}
for r in reco.matches:
    title = r.metadata['title']
    if title not in seen:
        print(f'{r.score} : {title}')
        seen[title] = '.'

0.821593046 : Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.819327295 : U.S. lawmakers ask for disclosure of number of Americans under surveillance
0.812569439 : NYPD Honcho Insulted by 'Hamilton' Star Lin-Manuel Miranda Celebrating Obama's Controversial Prisoner Release
0.80749172 : Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.806707561 : Trump keeping options open as Republican feud rages
0.801382482 : Michael Bloomberg Is Seriously Considering a Presidential Run
0.800629556 : The most revealing Republican ad of the election is an attack ad against Tim Kaine
0.7985062 : Exclusive: Trump considering fracking mogul Harold Hamm as energy secretary - sources
0.798201919 : Trump tells anti-abortion marchers he will support them
0.798167348 : The government official in charge of ethics just harshly condemned Trump’s plan
0.794105053 : Exclusive: China shuns U.S. request for talks on airline website dispute over Taiwan
0.79272 : “E