In [146]:
from dotenv import dotenv_values
import requests
import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
config = dotenv_values(".env")
openai.api_key =  config['OPENAI_KEY']


In [147]:
# Reading the data
df = pd.read_csv("./data/wp_posts.csv")
df.head()

Unnamed: 0,title,content,link,date,author,type
0,9 eventos de Halloween que no te puedes perder...,Por si andas buscando a qué fiesta llegar ante...,https://www.plateapr.com/eventos-de-halloween-...,2022-10-14T07:05:00,13,post
1,8 terrazas únicas donde puedes tomarte un buen...,En Puerto Rico no faltan los coffee shops. Ya ...,https://www.plateapr.com/terrazas-unicas-donde...,2022-10-13T13:18:57,13,post
2,Todo lo que puedes hacer en Caguas con solo $30,En el amplio valle de Caguas puedes toparte co...,https://www.plateapr.com/que-hacer-en-caguas-p...,2022-10-11T14:20:40,13,post
3,"Jazz, laberintos embrujados y otras 8 activida...","Llegó el mes de las brujas y con él, todas las...",https://www.plateapr.com/que-hacer-este-fin-de...,2022-10-07T02:24:00,13,post
4,3 restaurantes que dan cátedra de recuperación...,A la hora de ser solidarios en la recuperación...,https://www.plateapr.com/restaurantes-recupera...,2022-10-05T17:27:05,34,post


In [148]:
df.isnull().sum()

title      2
content    1
link       0
date       0
author     0
type       0
dtype: int64

In [149]:
len(df)

187

In [150]:
df = df.dropna()

In [151]:
len(df)

184

In [152]:
articles = [f for f in df.content.values]
titles = [t for t in df.title.values]

In [191]:
# This will take a whileee since we need to delay on every request to not exceed the limit of requests.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, engine="text-similarity-davinci-001") -> list[float]:
    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")
    return openai.Embedding.create(input=[text], engine=engine)["data"][0]["embedding"]

X = []
x_titles = []
# text-similarity-{ada, babbage, curie, davinci}-001


engines = [f"text-similarity-{e}-001" for e in ["ada", "babbage", "curie", "davinci"]]

print(engines)

failed = []
import time
for e in engines:
    for i, (article, title) in enumerate(zip(articles, titles)):
        # try:
        max_len = 5900
        if len(article) <max_len:
            art = article
        else:
            art = article[0:max_len]
        X.append(get_embedding(art, engine=e))
        time.sleep(3)
        x_titles.append(title)
        print(f"{i} {e} passed")

    pd.DataFrame(X).to_csv(f'./data/{e}.csv',  index=False)

['text-similarity-ada-001', 'text-similarity-babbage-001', 'text-similarity-curie-001', 'text-similarity-davinci-001']
0 text-similarity-ada-001 passed
1 text-similarity-ada-001 passed
2 text-similarity-ada-001 passed
3 text-similarity-ada-001 passed
4 text-similarity-ada-001 passed
5 text-similarity-ada-001 passed
6 text-similarity-ada-001 passed
7 text-similarity-ada-001 passed
8 text-similarity-ada-001 passed
9 text-similarity-ada-001 passed
10 text-similarity-ada-001 passed
11 text-similarity-ada-001 passed
12 text-similarity-ada-001 passed
13 text-similarity-ada-001 passed
14 text-similarity-ada-001 passed
15 text-similarity-ada-001 passed
16 text-similarity-ada-001 passed
17 text-similarity-ada-001 passed
18 text-similarity-ada-001 passed
19 text-similarity-ada-001 passed
20 text-similarity-ada-001 passed
21 text-similarity-ada-001 passed
22 text-similarity-ada-001 passed
23 text-similarity-ada-001 passed
24 text-similarity-ada-001 passed
25 text-similarity-ada-001 passed
26 text

RetryError: RetryError[<Future at 0x13b644cd0 state=finished raised RateLimitError>]

In [192]:
pd.DataFrame(X).to_csv(f'./data/{e}.csv')

In [250]:
def fix_file(file_name, size=184):
    data = pd.read_csv(f'./data/{file_name}.csv')
    start = len(data) - size
    stop = len(data)
    data = data.iloc[start: stop]
    cols = [c for c in data.columns if c != 'Unnamed: 0']
    data[cols].to_csv(f"{file_name}.csv", index=False)

In [200]:
babbage = pd.read_csv('./data/text-similarity-babbage-001.csv')


In [202]:
size = 184
start = len(babbage) - size
stop 

184.0

In [210]:
babbage.iloc[int(len(babbage)/2): len(babbage)].columns

Index(['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '2038', '2039', '2040', '2041', '2042', '2043', '2044', '2045', '2046',
       '2047'],
      dtype='object', length=2049)

In [217]:
cols = [c for c in babbage.columns if c != 'Unnamed: 0']
babbage.iloc[int(len(babbage)/2): len(babbage)][cols].to_csv("text-similarity-babbage-001.csv", index=False)

In [218]:

curie = pd.read_csv('./data/text-similarity-curie-001.csv')

In [226]:
start = len(curie) - size
stop = len(curie)
cols = [c for c in curie.columns if c != 'Unnamed: 0']
curie.iloc[start: stop][cols].to_csv("text-similarity-curie-001.csv", index=False)

In [233]:
davinci = pd.read_csv('./data/text-similarity-davinci-001.csv')
start = len(davinci) - size
stop = len(davinci)
cols = [c for c in davinci.columns if c != 'Unnamed: 0']
davinci.iloc[start: stop][cols].to_csv("text-similarity-davinci-001.csv", index=False)

In [251]:
engines = [f"text-similarity-{e}-001" for e in ["ada", "babbage", "curie", "davinci"]]

for e in engines:
    print(e)
    if e == 'text-similarity-davinci-001':
        fix_file(e, 106)
    else:
         fix_file(e)

text-similarity-ada-001
text-similarity-babbage-001
text-similarity-curie-001
text-similarity-davinci-001


In [253]:
df.to_csv('text-similarity-documents.csv', index=False)