In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange
from DLAIUtils import Utils

import pandas as pd
import time
import os

In [3]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
OPENAI_API_KEY = utils.get_openai_api_key()


In [5]:
!wget -q --show-progress -O all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"



In [6]:
!unzip all-the-news-3.zip

Archive:  all-the-news-3.zip
  inflating: all-the-news-3.csv      


In [7]:
with open('./all-the-news-3.csv','r') as f:
    header = f.readline()
    print(header)

date,year,month,day,author,title,article,url,section,publication



In [8]:
df = pd.read_csv('./all-the-news-3.csv', nrows = 99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [9]:
openai_client = OpenAI(api_key = OPENAI_API_KEY)
utils = Utils()
INDEX_NAME = utils.create_dlai_index_name('dl-ai')
pinecone = Pinecone(api_key = PINECONE_API_KEY)

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)

pinecone.create_index(name = INDEX_NAME, dimension= 1536, metric='cosine',
                      spec = ServerlessSpec(cloud='aws', region='us-west-2'))
index = pinecone.Index(INDEX_NAME)

## Create embeddings of the new titles

In [10]:
def get_embeddings(articles, model="text-embedding-ada-002"):
    return openai_client.embeddings.create(input = articles, model = model)

In [14]:
CHUNK_SIZE = 400
TOTAL_ROWS = 10000

progress_bar = tqdm(total = TOTAL_ROWS)
chunks = pd.read_csv('./all-the-news-3.csv', chunksize=CHUNK_SIZE, nrows= TOTAL_ROWS)

chunk_num = 0
for chunk in chunks:
    titles = chunk['title'].tolist()
    embeddings = get_embeddings(titles)
    prepped = [{'id': str(chunk_num*CHUNK_SIZE+i), 'values': embeddings.data[i].embedding,
    'metadata':{'title': titles[i]},} for i in range(0,len(titles))]
    chunk_num += 1
    if len(prepped) >= 200:
        index.upsert(prepped)
        prepped = []
    progress_bar.update(len(chunk))

  0%|          | 0/10000 [00:00<?, ?it/s]

In [15]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

## Build recommender system

In [16]:
def get_recommendations(pinecone_index, search_term, top_k= 10):
    embed = get_embeddings([search_term]).data[0].embedding
    res = pinecone_index.query(vector = embed, top_k = top_k, include_metadata = True)
    return res



In [19]:
reco = get_recommendations(index, 'obama')
for r in reco.matches:
    print(f'{r.score} : {r.metadata["title"]}')

0.849901617 : Barack Obama just stepped off the sidelines to defend Obamacare
0.848412454 : President Obama has a new plan to fight the opioid epidemic
0.848278 : “Our democracy is at stake”: Obama delivers his first post-presidency campaign speech
0.847949 : Obama: if you were fine with big government until it served black people, rethink your biases
0.845737576 : President Obama: Michelle & I Are Gonna Be Renters
0.84419769 : Obama meets with national security team on Syria, Islamic State
0.843295038 : Vox Sentences: Obama got a warmer welcome in Hiroshima than the Japanese prime minister
0.842703104 : Watch President Obama dance the tango in Argentina
0.840954602 : Obama and Supreme Court Tag Team on Juvenile Justice Reform
0.840886533 : Barack Obama in talks to create shows for Netflix: New York Times


## create embeddings of all news content

In [20]:
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(name = INDEX_NAME)
pinecone.create_index(name = INDEX_NAME, dimension = 1536, metric='cosine',
                      spec = ServerlessSpec(cloud='aws',region ='us-west-2'))
articles_index = pinecone.Index(INDEX_NAME)

In [21]:
def embed(embeddings, title, prepped, embed_num):
    for embedding in embeddings.data:
        prepped.append({'id':str(embed_num),'values': embedding.embedding, 
                        'metadata':{'title':title}})
        embed_num += 1
        if len(prepped) >= 100:
            articles_index.upsert(prepped)
            prepped.clear()
    return embed_num

In [23]:
news_data_rows_num = 100
embed_num = 0 #keep track of embedding number for 'id'
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 400, 
                                               chunk_overlap=20) # how to chunk the article
prepped = []
df = pd.read_csv('./all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

for i in range(0, len(articles_list)):
    print(".", end ="")
    art = articles_list[i]
    title = titles_list[i]
    if art is not None and isinstance(art,str):
        texts = text_splitter.split_text(art)
        embeddings = get_embeddings(texts)
        emebd_num = embed(embeddings, title, prepped, embed_num)
        

....................................................................................................

In [25]:
articles_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 54}},
 'total_vector_count': 54}

In [26]:
reco = get_recommendations(articles_index, 'obama', top_k = 100)
seen = {}
for r in reco.matches:
    title = r.metadata['title']
    if title not in seen:
        print(f'{r.score}:{title}')
        seen['title'] = "."

0.821058929:Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.820152402:Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.814246416:Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.791367888:Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.787421465:Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.779153466:Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.775539637:Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.772494316:Why Obama is vetting Nevada's Republican governor for the Supreme Court
0.769901633:Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.764554381:Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.764509678:MaliVai Washington on Men's Tennis Today and His Historic Wimbledon Run 20 Years Ago
0.763626575:MaliVai Washington on Men's Tennis Toda