In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from datasets import load_dataset
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from DLAIUtils import Utils

import ast
import os
import pandas as pd

In [3]:
# Course helper object; used here to obtain the Pinecone API key instead of
# hardcoding the credential in the notebook.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()


In [4]:
# Connect to Pinecone and derive the index name through the course helper.
pinecone = Pinecone(api_key=PINECONE_API_KEY)
# NOTE(review): `utils` was already created in the previous cell; this second
# instantiation looks redundant — confirm before removing.
utils = Utils()
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Start from a clean slate: drop an index of the same name left by an earlier run.
if any(existing.name == INDEX_NAME for existing in pinecone.list_indexes()):
    pinecone.delete_index(INDEX_NAME)

# The dimension must equal the length of the vectors upserted and queried later.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)
index = pinecone.Index(INDEX_NAME)

In [5]:
# Download the zipped CSV quietly (-q) and save it as lesson2-wiki.csv.zip (-O).
!wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

In [6]:
# Extract wiki.csv from the archive. -o overwrites an existing wiki.csv without
# prompting, so re-running this cell (or Restart & Run All) behaves the same
# every time instead of stopping at unzip's interactive "replace?" question.
!unzip -o lesson2-wiki.csv.zip

Archive:  lesson2-wiki.csv.zip
  inflating: wiki.csv                


## To achieve a more comprehensive context, a large number of articles is beneficial.

In [8]:
# Upper bound on the number of CSV rows to load (passed to read_csv as nrows).
# NOTE(review): each row appears to be one article chunk rather than a whole
# article (ids like 1-0, 1-1 and a 'chunk' field in metadata) — confirm that
# the "articles" naming is intended.
max_articles_num = 500
df = pd.read_csv('./wiki.csv',nrows = max_articles_num)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."
2,1-1,"{'chunk': 1, 'source': 'https://simple.wikiped...","[-0.0015197008615359664, -0.007858820259571075..."
3,1-2,"{'chunk': 2, 'source': 'https://simple.wikiped...","[-0.009930099360644817, -0.012211072258651257,..."
4,1-3,"{'chunk': 3, 'source': 'https://simple.wikiped...","[-0.011600767262279987, -0.012608098797500134,..."
5,1-4,"{'chunk': 4, 'source': 'https://simple.wikiped...","[-0.026462381705641747, -0.016362832859158516,..."


In [10]:
# Number of vectors sent to Pinecone in a single upsert request.
UPSERT_BATCH_SIZE = 250

prepped = []
for _, row in tqdm(df.iterrows(), total = df.shape[0]):
    # 'metadata' and 'values' are stored in the CSV as Python-literal strings,
    # so parse them back into Python objects before building the record.
    meta = ast.literal_eval(row['metadata'])
    prepped.append({
        'id': row['id'],
        'values': ast.literal_eval(row['values']),
        'metadata': meta,
    })
    if len(prepped) >= UPSERT_BATCH_SIZE:
        index.upsert(prepped)
        prepped = []

# Flush the final partial batch. Without this, any rows left over when the
# row count is not a multiple of the batch size would never reach the index.
if prepped:
    index.upsert(prepped)
    prepped = []


  0%|          | 0/500 [00:00<?, ?it/s]

In [11]:
# Show index statistics (dimension and vector counts per namespace) to confirm
# that the upserted vectors are present.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 500}},
 'total_vector_count': 500}

In [12]:
# Obtain the OpenAI API key through the course helper and build the client
# that the embedding and completion calls in this notebook go through.
OPENAI_API_KEY = utils.get_openai_api_key()
openai_client = OpenAI(api_key= OPENAI_API_KEY)

def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for ``articles`` from the OpenAI embeddings endpoint.

    Args:
        articles: Text to embed, forwarded unchanged as the ``input`` argument.
            Callers in this notebook pass either a single string or a list
            of strings.
        model: Name of the embedding model to use.

    Returns:
        The response object from ``openai_client.embeddings.create``; callers
        read the vectors from ``response.data[i].embedding``.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response

In [15]:
# Embed the question, then fetch the three nearest chunks from the index.
query = "What is the berlin wall?"
embed = get_embeddings(query)
res = index.query(
    vector=embed.data[0].embedding,  # first (only) embedding in the response
    top_k=3,
    include_metadata=True,           # needed to read back the chunk text
)
# Collect the stored text of each match and print them one after another.
text = [hit['metadata']['text'] for hit in res['matches']]
print('\n'.join(text))

August 13  1961: Building of the Berlin Wall begins.
 August 14  1945: Japan announces its surrender at the end of World War II.
 August 14/15  1947: India is partitioned at independence from the UK, as the new mainly Islamic state of Pakistan is created.
 August 15  1960: The Republic of the Congo becomes independent.
 August 15  1971: Bahrain becomes independent.
 August 16  1977: Elvis Presley dies aged 42, leading to a worldwide outpouring of grief.
 August 17  1945: Indonesia declares independence from the Netherlands.
 August 17  1960: Gabon becomes independent.
 August 17  1962: Peter Fechter becomes the first person to be shot dead at the Berlin Wall.
 August 19  43 BC: Augustus becomes Roman consul.
 August 19  14: Augustus dies.
 August 19  1919: Afghanistan becomes independent.
 August 19  1991: The August Coup against Mikhail Gorbachev, in the Soviet Union, begins.
 August 20  1940: Leon Trotsky is fatally wounded with an ice pick in Mexico.
 August 20  1968: The Prague Spr

In [16]:
# Retrieve the three chunks closest to the request and use them as context.
query = "write an article titled: what is the berlin wall?"
embed = get_embeddings([query])
res = index.query(
    vector=embed.data[0].embedding,
    top_k=3,
    include_metadata=True,
)
contexts = [hit['metadata']['text'] for hit in res['matches']]

# Prompt layout: instruction + "Context:" header, then the retrieved chunks
# separated by a dashed rule, then the question and an open "Answer:" slot.
prompt_start = (
    "Answer the question based on the context below. \n\n"
    "Context: \n"
)
prompt_end = f"\n\n Question: {query}\n Answer:"
prompt = "".join([
    prompt_start,
    "\n\n ----- \n\n".join(contexts),
    prompt_end,
])

print(prompt)

Answer the question based on the context below. 

Context: 
August 13  1961: Building of the Berlin Wall begins.
 August 14  1945: Japan announces its surrender at the end of World War II.
 August 14/15  1947: India is partitioned at independence from the UK, as the new mainly Islamic state of Pakistan is created.
 August 15  1960: The Republic of the Congo becomes independent.
 August 15  1971: Bahrain becomes independent.
 August 16  1977: Elvis Presley dies aged 42, leading to a worldwide outpouring of grief.
 August 17  1945: Indonesia declares independence from the Netherlands.
 August 17  1960: Gabon becomes independent.
 August 17  1962: Peter Fechter becomes the first person to be shot dead at the Berlin Wall.
 August 19  43 BC: Augustus becomes Roman consul.
 August 19  14: Augustus dies.
 August 19  1919: Afghanistan becomes independent.
 August 19  1991: The August Coup against Mikhail Gorbachev, in the Soviet Union, begins.
 August 20  1940: Leon Trotsky is fatally wounded 

In [17]:
# Send the assembled prompt to the completions endpoint and print the first
# returned choice beneath a row of asterisks.
res = openai_client.completions.create(
    prompt=prompt,
    model="gpt-3.5-turbo-instruct",
    max_tokens=636,
    temperature=0,
    top_p=1,
    presence_penalty=0,
    frequency_penalty=0,
    stop=None,
)
print(100 * "*", res.choices[0].text, sep="\n")

****************************************************************************************************

The Berlin Wall was a physical barrier that divided the city of Berlin, Germany from 1961 to 1989. It was built by the German Democratic Republic (GDR), also known as East Germany, in an effort to prevent its citizens from fleeing to the democratic West Germany. The wall was a symbol of the Cold War and the ideological divide between communism and capitalism.

The construction of the Berlin Wall began on August 13, 1961, and it consisted of a concrete wall, barbed wire, and guard towers. The wall stretched for 96 miles, dividing the city into East and West Berlin. It was heavily guarded by armed soldiers who were authorized to shoot anyone attempting to cross the wall.

The Berlin Wall was not only a physical barrier, but it also represented the separation of families and friends. Many East Germans were unable to visit their loved ones in the West, and vice versa. The wall also had a s