In [1]:
import google.generativeai as genai
import chromadb
import os
import re

In [2]:
# Pull the Gemini API key from the environment (None when the variable is unset)
# and pass it to the google.generativeai client configuration.
google_api_key = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=google_api_key)

In [32]:
# Create a Chroma client, then fetch the collection that will hold the embedded
# text chunks (it is created if it does not exist yet).

COLLECTION_NAME = "Ramin_v_Marx"

client = chromadb.Client()
collection = client.get_or_create_collection(name=COLLECTION_NAME)

In [4]:
# Find candidate embedding models: print the name of every available model
# that lists 'embedContent' among its supported generation methods.

list_o_models = genai.list_models()

embedding_model_names = (
    m.name for m in list_o_models if 'embedContent' in m.supported_generation_methods
)
for model_name in embedding_model_names:
    print(model_name)

models/embedding-001
models/text-embedding-004


In [23]:
# Inspect the metadata (token limits, supported methods) reported for text-embedding-004
genai.get_model('models/text-embedding-004')

Model(name='models/text-embedding-004',
      base_model_id='',
      version='004',
      display_name='Text Embedding 004',
      description='Obtain a distributed representation of a text.',
      input_token_limit=2048,
      output_token_limit=1,
      supported_generation_methods=['embedContent'],
      temperature=None,
      max_temperature=None,
      top_p=None,
      top_k=None)

In [24]:
# Inspect the same metadata for embedding-001 so the two candidates can be compared
genai.get_model('models/embedding-001')

Model(name='models/embedding-001',
      base_model_id='',
      version='001',
      display_name='Embedding 001',
      description='Obtain a distributed representation of a text.',
      input_token_limit=2048,
      output_token_limit=1,
      supported_generation_methods=['embedContent'],
      temperature=None,
      max_temperature=None,
      top_p=None,
      top_k=None)

In [None]:
# Both models report identical token limits and supported methods, but the documentation recommends text-embedding-004 for new projects

# Load content and preprocess

In [5]:
# Load the "Overall Philosophy" notes as one string.
# The folder can be overridden with the LLM_DOCS_DIR environment variable; it
# defaults to the original location so the existing setup keeps working unchanged.
docs_dir = os.getenv("LLM_DOCS_DIR", "C:/Users/Taylor/OneDrive/Desktop/LLM docs")

with open(os.path.join(docs_dir, "Overall Philosophy.txt"), "r", encoding = 'utf-8') as first_file:
    text = first_file.read()

In [6]:
# Flatten the document onto a single line by turning every newline into a space
text = text.replace("\n", " ")

In [7]:
# Load the Wikipedia export as one string.
# The folder can be overridden with the LLM_DOCS_DIR environment variable; it
# defaults to the original location so the existing setup keeps working unchanged.
docs_dir = os.getenv("LLM_DOCS_DIR", "C:/Users/Taylor/OneDrive/Desktop/LLM docs")

with open(os.path.join(docs_dir, "Wikipedia.txt"), "r", encoding = 'utf-8') as second_file:
    wiki_text = second_file.read()

In [8]:
# Flatten the document onto a single line by turning every newline into a space
# (the section regexes in later cells use '.', which does not match newlines)
wiki_text = wiki_text.replace("\n", " ")

In [9]:
# Strip citation markers: every "[...]" span (shortest match) is deleted from the text

bracketed_span = re.compile(r"\[.*?\]")
wiki_text = bracketed_span.sub("", wiki_text)

In [10]:
# Split text where I made variables in the text doc (first is summary)
# Captures everything between the "wikipedia_summary" and "das_kapital" markers.
# NOTE(review): if the doc is written as `name = """..."""`, the "=" that follows the
# marker stays in the captured text — confirm against the source file.

match = re.search(r"wikipedia_summary(.*?)das_kapital", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("markers 'wikipedia_summary' ... 'das_kapital' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_summary = match.group(1).strip().replace('"""', "")

In [11]:
# For Das Kapital portion
# Captures everything between the "das_kapital" and "influences" markers.

match = re.search(r"das_kapital(.*?)influences", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("markers 'das_kapital' ... 'influences' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_kapital = match.group(1).strip().replace('"""', "")

In [12]:
# Now for influences
# Captures everything between the "influences" and "human_nature" markers.
# NOTE(review): re.search starts at the leftmost "influences" in wiki_text; if that word
# also occurs in earlier prose the capture will begin there — verify the result.

match = re.search(r"influences(.*?)human_nature", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("markers 'influences' ... 'human_nature' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_influences = match.group(1).strip().replace('"""', "")

In [13]:
# View on human nature
# Captures everything between the "human_nature" and "class_struggle" markers.

match = re.search(r"human_nature(.*?)class_struggle", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("markers 'human_nature' ... 'class_struggle' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_human = match.group(1).strip().replace('"""', "")

In [14]:
# Class struggle
# Captures everything between the "class_struggle" and "critiques" markers.

match = re.search(r"class_struggle(.*?)critiques", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("markers 'class_struggle' ... 'critiques' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_class = match.group(1).strip().replace('"""', "")

In [15]:
# Critiques
# Captures everything between the "critiques" and "international_relations" markers.
# NOTE(review): re.search starts at the leftmost "critiques" in wiki_text; if that word
# also occurs in earlier prose the capture will begin there — verify the result.

match = re.search(r"critiques(.*?)international_relations", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("markers 'critiques' ... 'international_relations' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_critique = match.group(1).strip().replace('"""', "")

In [16]:
# Views on international relations
# Captures everything between the "international_relations" and "legacy" markers.

match = re.search(r"international_relations(.*?)legacy", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("markers 'international_relations' ... 'legacy' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_international = match.group(1).strip().replace('"""', "")

In [17]:
# Legacy
# Captures everything between the "legacy" and "quotes_and_origins" markers.
# NOTE(review): re.search starts at the leftmost "legacy" in wiki_text; if that word
# also occurs in earlier prose the capture will begin there — verify the result.

match = re.search(r"legacy(.*?)quotes_and_origins", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("markers 'legacy' ... 'quotes_and_origins' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_legacy = match.group(1).strip().replace('"""', "")

In [18]:
# Quotes
# Captures everything from the "quotes_and_origins" marker to the end of the text.

match = re.search(r"quotes_and_origins(.*?)$", wiki_text)
if match is None:
    # Without this check a missing marker surfaces as an opaque AttributeError on None
    raise ValueError("marker 'quotes_and_origins' not found in wiki_text")
# Trim surrounding whitespace, then drop any triple-quote sequences
wiki_quotes = match.group(1).strip().replace('"""', "")

## Split into chunks

In [19]:
def split_text(text, max_words = 700):
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``, and each
    chunk is re-joined with single spaces, so the original spacing is not preserved.

    Args:
        text: The string to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in their original order. The list is empty when
        ``text`` contains no words.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        # A zero step breaks range() with a cryptic message, and a negative step
        # would silently yield no chunks at all, dropping the whole text.
        raise ValueError(f"max_words must be at least 1, got {max_words}")

    words = text.split()
    return [" ".join(words[start:start + max_words]) for start in range(0, len(words), max_words)]

In [20]:
# Chunk every source text with split_text; each result lines up positionally with its source

(
    overall_philosophy,
    marx_summary,
    das_kapital_wiki,
    marx_influences,
    marx_human_nature,
    marx_class_struggle,
    marx_critiques,
    marx_international,
    marx_legacy,
    marx_quotes,
) = [
    split_text(source)
    for source in (
        text,
        wiki_summary,
        wiki_kapital,
        wiki_influences,
        wiki_human,
        wiki_class,
        wiki_critique,
        wiki_international,
        wiki_legacy,
        wiki_quotes,
    )
]

# Embed all content

In [21]:
# Collect the chunk lists under their group names; the key becomes the "group"
# label (and ID prefix) when the chunks are stored.

content_groups = dict(
    overall_philosophy = overall_philosophy,
    marx_summary = marx_summary,
    das_kapital_wiki = das_kapital_wiki,
    marx_influences = marx_influences,
    marx_human_nature = marx_human_nature,
    marx_class_struggle = marx_class_struggle,
    marx_critiques = marx_critiques,
    marx_international = marx_international,
    marx_legacy = marx_legacy,
    marx_quotes = marx_quotes,
)

In [48]:
# Embed each group's chunks and store them in the collection.
# A list passed as `content` returns one vector per item, so every group is embedded
# in a single call and written with a single upsert instead of one round trip per chunk.
# IDs and metadata are unchanged: '<group>_chunk_<i>' and {"group", "chunk_id"}.

for group_name, chunks in content_groups.items():
    if not chunks:
        # An empty source text produces no chunks; there is nothing to embed or store
        continue

    group_embeddings = genai.embed_content(model = "models/text-embedding-004", content = chunks)["embedding"]

    collection.upsert(
        embeddings = group_embeddings,
        documents = chunks,
        metadatas = [{"group": group_name, "chunk_id": i} for i in range(len(chunks))],
        ids = [f'{group_name}_chunk_{i}' for i in range(len(chunks))]
    )

Add of existing embedding ID: overall_philosophy_chunk_0
Insert of existing embedding ID: overall_philosophy_chunk_0
Add of existing embedding ID: overall_philosophy_chunk_1
Insert of existing embedding ID: overall_philosophy_chunk_1
Add of existing embedding ID: overall_philosophy_chunk_2
Insert of existing embedding ID: overall_philosophy_chunk_2
Add of existing embedding ID: overall_philosophy_chunk_3
Insert of existing embedding ID: overall_philosophy_chunk_3
Add of existing embedding ID: marx_summary_chunk_0
Insert of existing embedding ID: marx_summary_chunk_0
Add of existing embedding ID: marx_summary_chunk_1
Insert of existing embedding ID: marx_summary_chunk_1
Add of existing embedding ID: marx_summary_chunk_2
Insert of existing embedding ID: marx_summary_chunk_2
Add of existing embedding ID: marx_summary_chunk_3
Insert of existing embedding ID: marx_summary_chunk_3
Add of existing embedding ID: das_kapital_wiki_chunk_0
Insert of existing embedding ID: das_kapital_wiki_chunk_0

In [51]:
# Sanity check: read the stored vectors back from the collection

embeddings = collection.get(include = ["embeddings"])

# Display only the first five vectors rather than the whole collection
embeddings['embeddings'][:5]

# Simple chatbot logic to test retrieval and generation

In [65]:
def chatbot_response(user_input, n_results = 10):
    """Answer ``user_input`` in the voice of Karl Marx using retrieved context.

    Embeds the input, queries the module-level Chroma ``collection`` for the closest
    stored chunks, and sends those chunks together with the input to a Gemini model.

    Args:
        user_input: The user's question or message.
        n_results: Number of chunks to retrieve. The default of 10 matches Chroma's
            own default, so existing single-argument calls retrieve the same chunks.

    Returns:
        The text of the model's reply.
    """
    query_embedding = genai.embed_content(model = "models/text-embedding-004", content = user_input)["embedding"]

    results = collection.query(
        query_embeddings = [query_embedding],
        n_results = n_results
    )

    # query() returns one list of documents per query embedding. Only one embedding
    # was sent, so take its list and join the chunks into plain text; interpolating
    # the nested list directly would put a Python list repr into the prompt.
    retrieved_chunks = results['documents'][0]
    retrieved_text = "\n\n".join(retrieved_chunks)

    model = genai.GenerativeModel('gemini-1.5-flash')
    response = model.generate_content(f"Based on the following, respond as Karl Marx would: {retrieved_text}\n\nUser: {user_input}")

    return response.text

In [66]:
# Smoke-test the retrieval + generation pipeline with a sample question
user_input = "give me a quick summary of karl marx's history"

chatbot_response(user_input)

'Bah!  A "quick summary" of *my* history?  As if the dialectical unfolding of my life, interwoven with the material conditions of 19th-century Europe, could be reduced to a trivial anecdote!  \n\nNevertheless, for the bourgeoisie\'s sake, here\'s a skeletal outline:  Young Hegelian, studied philosophy and law, became a radical journalist, expelled from multiple countries for my revolutionary pronouncements against the inherent injustices of capitalism. Collaborated with Engels, formulated historical materialism, wrote *Das Kapital* (the definitive analysis of capitalist exploitation!),  actively involved in the International Workingmen\'s Association.  Ultimately, I dedicated my life to exposing the contradictions of capitalism and laying the groundwork for the inevitable proletarian revolution.  The struggle continues!\n'