# Parse the Chunks

In [1]:
def parse_chunks_file(file_path, encoding="utf-8"):
    """Parse a plain-text chunk dump into a list of chunk dictionaries.

    The file is read line by line. After stripping surrounding whitespace,
    three kinds of lines are recognised; every other line is ignored:

    * ``Chunk #<n>``     -> stored as ``chunk_number`` (int)
    * ``Source: <name>`` -> stored as ``source`` (str)
    * ``Text: <text>``   -> stored as ``text`` (str); this line also completes
      the current chunk, which is appended to the result.

    Args:
        file_path: Path of the chunk file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of dicts, one per ``Text:`` line, in file order. Each dict
        contains the fields seen since the previous ``Text:`` line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the value after ``Chunk #`` is not an integer.
    """
    chunks = []
    current_chunk = {}
    with open(file_path, "r", encoding=encoding) as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith("Chunk #"):
                current_chunk["chunk_number"] = int(line.split("#")[1])
            elif line.startswith("Source:"):
                # Slice off the prefix instead of splitting on ": " so that a
                # source name containing ": " is kept whole.
                current_chunk["source"] = line[len("Source:"):].strip()
            elif line.startswith("Text:"):
                # Slicing also tolerates an empty "Text:" line, which has no
                # ": " separator left once the line has been stripped.
                current_chunk["text"] = line[len("Text:"):].strip()
                # The text line is the last field of a chunk: store and reset.
                chunks.append(current_chunk)
                current_chunk = {}
    return chunks

# Parse the chunk dump; the relative path is resolved against the current
# working directory.
parsed_chunks = parse_chunks_file("chunks.txt")

In [3]:
# Number of chunks parsed from the file.
len(parsed_chunks)

15

In [4]:
# Inspect the first parsed chunk (chunk_number, source, text).
parsed_chunks[0]

{'chunk_number': 1,
 'source': 'cnn_article.txt',
 'text': 'title: a shocking chinese ai advancement called deepseek is sending us stocks plunging cnn us stocks dropped sharply monday and chipmaker nvidia lost nearly 600 billion in market value after a surprise advancement from a chinese artificial intelligence company, deepseek , threatened the aura of invincibility surrounding americas technology industry. deepseek , a one-year-old startup, revealed a stunning capability last week: it presented a chatgpt-like ai model called r1, which has all the familiar abilities, operating at a fraction of the cost of openais, googles or metas popular ai models. the company said it had spent just 5.6 million on computing power for its base model, compared with the hundreds of millions or billions of dollars us companies spend on their ai technologies. that sent shockwaves through markets, in particular the tech sector, on monday. the tech-heavy nasdaq plunged by 3.1 and the broader sp 500 fell 1.5

In [5]:
# Inspect the second parsed chunk.
parsed_chunks[1]

{'chunk_number': 2,
 'source': 'cnn_article.txt',
 'text': 'leading tech investors, called deepseek one of the most amazing and impressive breakthroughs ive ever seen, in a post on x . the stunning achievement from a relatively unknown ai startup becomes even more shocking when considering that the united states for years has worked to restrict the supply of high-power ai chips to china , citing national security concerns. that means deepseek was able to achieve its low-cost model on under-powered ai chips. tech stocks tumble us tech stocks got hammered monday. nvidia nvda , the leading supplier of ai chips, fell nearly 17 and lost 588.8 billion in market value by far the most market value a stock has ever lost in a single day, more than doubling the previous record of 240 billion set by meta nearly three years ago. for perspective, nvidia lost more in market value monday than all but 13 companies are worth period. nvidia began the day as the most valuable publicly traded stock on the 

In [6]:
# Inspect the chunk at index 14 (the last one when 15 chunks were parsed).
parsed_chunks[14]

{'chunk_number': 15,
 'source': 'Document3.txt',
 'text': 'will likely drive innovation and shape the future of intelligent systems, offering new possibilities for solving complex problems and enhancing human capabilities.'}

# Generate Embeddings

In [8]:
import os
import openai
from dotenv import load_dotenv
load_dotenv()

# Read the Together settings from the environment (load_dotenv() has merged in
# any .env file) and fail fast when one is missing. Passing base_url=None would
# make the OpenAI client silently fall back to its default endpoint instead of
# the one named by TOGETHER_BASE_URL.
_together_settings = {
    name: os.getenv(name) for name in ("TOGETHER_API_KEY", "TOGETHER_BASE_URL")
}
_missing_settings = [name for name, value in _together_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

client = openai.OpenAI(
    api_key=_together_settings["TOGETHER_API_KEY"],
    base_url=_together_settings["TOGETHER_BASE_URL"],
)

In [10]:
def get_embedding(text, model="togethercomputer/m2-bert-80M-32k-retrieval"):
    """Return the embedding vector for ``text``.

    Sends one embeddings request through the module-level ``client`` and
    returns the embedding of the first item in the response.

    Args:
        text: Input passed to the embeddings endpoint.
        model: Name of the embedding model to request. Defaults to the model
            this notebook was written against, so existing calls are
            unaffected.

    Returns:
        The ``embedding`` attribute of ``response.data[0]``.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# Attach an embedding to every parsed chunk. This mutates the dicts in
# `parsed_chunks` in place and calls get_embedding once per chunk.
for chunk in parsed_chunks:
    chunk["embedding"] = get_embedding(chunk["text"])

# Prepare for Pinecone

In [11]:
# Build one upsert record per chunk: an id derived from the chunk number, the
# embedding as the vector values, and the remaining fields as metadata.
pinecone_data = [
    {
        "id": f"chunk_{chunk['chunk_number']}",
        "values": chunk["embedding"],
        "metadata": {
            field: chunk[field] for field in ("source", "chunk_number", "text")
        },
    }
    for chunk in parsed_chunks
]

In [13]:
# Show the first prepared record (id, values, metadata).
# NOTE(review): this prints the full embedding vector, so the output is long.
print(pinecone_data[0])

{'id': 'chunk_1', 'values': [0.06969701, -0.03318938, 0.17799321, 0.3000356, -0.0076096333, -0.17303014, 0.2780669, -0.042178836, 0.26382613, 0.12504727, -0.36713076, 0.11640171, -0.34269002, -0.09929308, 0.037152722, 0.102378555, -0.021582397, -0.19380191, 0.0029728687, -0.025832728, -0.35480845, -0.18215656, -0.14970814, -0.28141427, 0.05867895, -0.35171372, -0.059981313, -0.041587032, 0.26130745, 0.05552418, 0.16992815, -0.11347354, -0.11250953, -0.03088671, -0.14819552, 0.15538973, -0.039707698, 0.21913281, 0.0647081, -0.030804869, -0.11696476, -0.2141617, -0.207563, 0.030417865, 0.09498098, 0.18072549, -0.07526764, -0.12122867, 0.12985463, 0.12910214, -0.23375233, 0.067487344, -0.2284165, 0.14605036, -0.15443064, 0.028970614, -0.009370593, 0.0062874984, -0.20960324, -0.035867997, 0.10395962, 0.046016, -0.058865502, 0.19650206, -0.049542695, -0.09823916, -0.1240866, 0.052633103, 0.043445487, 0.20876175, 0.1649083, 0.09289101, -0.0014520021, -0.13081628, -0.0076122824, 0.16125019, 0

In [14]:
# Length of the first record's embedding vector.
len(pinecone_data[0]["values"])

768

Each embedding vector has 768 dimensions.


# Upsert to Pinecone

In [18]:
from pinecone import Pinecone, ServerlessSpec
load_dotenv()

pinecone = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "assignment"

# list_indexes() yields index descriptions rather than bare names, so testing
# `index_name not in pinecone.list_indexes()` is always true and would attempt
# to re-create an index that already exists. Compare against the names instead.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        dimension=768,  # must match the length of each embedding vector
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pinecone.Index(index_name)

# Upload in fixed-size batches. A fixed size keeps each request bounded as the
# data grows and avoids range() being given a step of 0 when there are no
# chunks to upload.
batch_size = 100
for i in range(0, len(pinecone_data), batch_size):
    batch = pinecone_data[i:i + batch_size]
    index.upsert(vectors=batch)

print(f"Successfully uploaded {len(pinecone_data)} chunks to Pinecone")

Successfully uploaded 15 chunks to Pinecone
