In [1]:
# conda/pip install langchain
# conda/pip install langchain-ollama
# conda/pip install llama-index


## Preprocess a Reddit dataset

Reddit dataset URL: https://www.kaggle.com/datasets/rodmcn/askreddit-questions-and-answers/data

In [None]:

import polars as pl
import numpy as np
import ollama
import llama_index
from llama_index.embeddings.ollama import OllamaEmbedding

# Embedding client that uses the "llama3.1" model through Ollama.
ollama_embedding = OllamaEmbedding(model_name="llama3.1")

In [3]:
# Load the raw Reddit answers; the CSV uses ';' as its field separator.
df = pl.read_csv(
    './data/reddit_answers_long.csv',
    separator=';',
)
df

Unnamed: 0_level_0,q_id,text,votes
i64,str,str,f64
0,"""hvbvpz""","""Two pet ducks. You may be temp…",2359.0
1,"""hvbvpz""","""Nice try Jeff Bezos""",764.0
2,"""hvbvpz""","""A curved shower rod. Seriously…",1525.0
3,"""hvbvpz""","""Another monitor. Your producti…",1227.0
4,"""hvbvpz""","""A nasal irrigation kit - eithe…",659.0
…,…,…,…
5940821,"""3kf27v""","""Money is the most important th…",4.0
5940823,"""3kf27v""","""""""If you can't learn how to sh…",6.0
5940824,"""3kf27v""","""Everyone in college writes in …",5.0
5940825,"""3kf27v""","""""""Everything happens for a rea…",7.0


In [4]:
# Add the character length of every answer and order the longest answers first.
# `str.len_chars()` is Polars' native expression for this, so the column is computed
# inside the engine instead of calling a Python lambda once per row via `map_elements`.
# It yields UInt32; the cast keeps the column's original Int32 dtype.
df_sorted = (
    df.with_columns(
        pl.col('text').str.len_chars().cast(pl.Int32).alias('answer_len_in_char')
    )
    .sort('answer_len_in_char', descending=True)
)
df_sorted

Unnamed: 0_level_0,q_id,text,votes,answer_len_in_char
i64,str,str,f64,i32
242329,"""erd482""","""It's nearly the beginning of a…",2.0,4564852
3571290,"""hf4cta""","""Me: I think that in order to s…",4.0,2906243
2770234,"""nslkd/""","""What if he has to go poop? Ah…",1.0,2154598
4785025,"""su8sn/""","""(and its not unusual for my co…",3.0,1891855
5551815,"""iptrin""","""\ it's never the same spot on …",45.0,1819715
…,…,…,…,…
5909814,"""j2dozl""","""E""",3.0,1
5912551,"""1cfbzg""",""".""",14.0,1
5913158,"""g4fslr""","""K""",3.0,1
5917507,"""hb2uy9""","""K""",39.0,1


In [5]:
# split into chunks to improve the effectiveness and efficiency
def split_text_into_chunks(text, chunk_size=512):
    """Cut ``text`` into consecutive, non-overlapping pieces of ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be a positive integer.

    Returns:
        A list of substrings in their original order. Every chunk has exactly
        ``chunk_size`` characters except possibly the last one, which holds the
        remainder. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step makes range() empty, so the text would be silently dropped;
    # zero only fails with an unrelated "range() arg 3 must not be zero" message.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

In [6]:
%%time

all_chunks = []
for row in df_sorted.rows():
    long_txt = row[2]
    chunks = split_text_into_chunks(long_txt, chunk_size=512)
    all_chunks.extend(chunks)

print(len(all_chunks))

6830889
CPU times: total: 3.59 s
Wall time: 9.79 s


In [7]:
# the full text is too large to embed on a laptop
# em_batch = ollama_embedding.get_text_embedding_batch(all_chunks, show_progress=True)

## Sample some related answers from this Reddit dataset

Due to limited GPU resources, only a sample of the text is used for the later tasks.

In [8]:
# keep only the answers whose text contains both keywords: "game" and "hardware"
df_game_hardware = df.filter(
    pl.col('text').str.contains('game') & pl.col('text').str.contains('hardware')
)
df_game_hardware

Unnamed: 0_level_0,q_id,text,votes
i64,str,str,f64
30669,"""clwnoc""","""""In the late 1980s Nintendo an…",1606.0
53052,"""epj100""","""Do you like dungeon-diving? Do…",187.0
55547,"""b7ssbh""","""There are CPUs that can change…",336.0
68751,"""i5htyn""","""Any action/invasion/war movie/…",774.0
70275,"""grh52m""","""My younger brother held a magn…",272.0
…,…,…,…
5854144,"""fgwmq2""","""I decided I was finally going …",34.0
5868868,"""efglq9""","""There can be no end all game, …",3.0
5877450,"""5u8pou""","""""1/5. Remember the days of the…",2.0
5878261,"""191r2k""","""EEAECO.com gives student disco…",7.0


In [9]:
%%time

all_chunks = []
for row in df_game_hardware.rows():
    long_txt = row[2][:5120]
    # print(len(long_txt))
    chunks = split_text_into_chunks(long_txt, chunk_size=512)
    all_chunks.extend(chunks)

print(len(all_chunks))

894
CPU times: total: 93.8 ms
Wall time: 355 ms


In [10]:
# Compute one embedding per text chunk, showing a progress bar while it runs.
em_batch = ollama_embedding.get_text_embedding_batch(all_chunks, show_progress=True)

Generating embeddings:   0%|          | 0/894 [00:00<?, ?it/s]

In [11]:
# peek at the first 10 values of the first chunk's embedding vector
# (these are vector components, not tokens)
em_batch[0][:10]

[-2.517793655395508,
 -1.9537627696990967,
 -3.294511318206787,
 0.14879542589187622,
 5.603634834289551,
 1.1601753234863281,
 -0.6537939310073853,
 -1.8276612758636475,
 -3.081853151321411,
 -1.9861308336257935]

In [12]:
import pickle

# Persist the embeddings so they can be reloaded without recomputing them.
with open('./em_batch.pickle', 'wb') as sink:
    pickle.dump(em_batch, sink)

In [13]:
# Map each embedding's string form to its position in `em_batch`;
# if two embeddings are identical, the later position wins.
em2chunkid = {str(vec): pos for pos, vec in enumerate(em_batch)}

In [14]:
import numpy as np
from numpy.linalg import norm

#  use cos similarity to find top similar ones
def find_top_similar_ones(question: str, topn: int):
    """Rank the embeddings in ``em_batch`` by cosine similarity to a question.

    Args:
        question: Text that is embedded with ``ollama_embedding`` as the query.
        topn: Number of leading results to keep.

    Returns:
        A list of ``(similarity, index)`` tuples sorted in descending order,
        where ``index`` is the position of the embedding in ``em_batch``.
        Equal similarities are ordered by descending index.
    """
    query_vec = ollama_embedding.get_query_embedding(question)
    query_len = norm(query_vec)

    scored = []
    for idx, chunk_vec in enumerate(em_batch):
        cosine = np.dot(query_vec, chunk_vec) / (query_len * norm(chunk_vec))
        scored.append((cosine, idx))

    # Tuples compare by score first, then by index.
    scored.sort(reverse=True)
    return scored[:topn]
    

In [15]:
# Retrieve the 10 chunks most similar to a sample query; each entry is a
# (similarity score, chunk index) pair.
top_sim_chunks = find_top_similar_ones("choices of gaming pc", 10)
top_sim_chunks

[(0.48674594466955334, 70),
 (0.45819718061900566, 403),
 (0.45819718061900566, 402),
 (0.45819718061900566, 401),
 (0.4534215338555377, 15),
 (0.4496664370211112, 285),
 (0.4279581000087091, 871),
 (0.4258743637389518, 391),
 (0.42385206006902476, 3),
 (0.4228706963397614, 524)]

In [16]:
# Show the text of each retrieved chunk, best match first.
for _score, chunk_idx in top_sim_chunks:
    print(all_chunks[chunk_idx])

every 6 months and having to upgrade or buy a new one though.
NES games/hardware.
NES games/hardware.
NES games/hardware.
 awe-inspiring kinds of advancements. Just faster, better resolved, and less laggy rehashings. It seems like the video card has become the dominant piece of hardware, whereas before it was more about the CPU and memory. Upgrading was a yearly chore, when it could be afforded. My friends and I haven't upgraded our setups in half a decade with the exception of video cards. I'm not complaining or condemning anything at all, these are just my observations of how things have changed. That being said, the big box
Do you have the latest and greatest hardware to play this game? I've spent a lot of money on PCs through my life.
You know, it used to be computer games. For de-stressing purposes, as violent as possible for the sake of increasing the catharsis level.  But actually, whats happened to gaming lately is now a stressor for me, as its yet another one of my hobbies tha

In [17]:
# Same retrieval as above for a second sample query, printing the chunk texts.
top_sim_chunks = find_top_similar_ones("acceptable hardware for students", 10)
for _score, chunk_idx in top_sim_chunks:
    print(all_chunks[chunk_idx])

op (video game reviews/video game culture), lindsay ellis (analyses of pop culture, mainly film and theater)
start playing crysis or some other over-the-top-hardware-required-game and fry the shit out of the little bugger EDIT: alternatively start googleing pesticides and see if he gets the hint
ut i'm not actually sure if it's true or not
 equipment to acess. Furthermore, social media is the highlight reel of history, not the story. Very few people show their true selves in their twitter or Facebook feed. Also a number
every 6 months and having to upgrade or buy a new one though.
ors/advisors with frequency, and DO NOT LET THEM tell you to just look it up on X page. They're paid well enough, and you pay enough. Insist on getting it written down. As a former tech salesperson: if you need a laptop for your classes - opt for light. You will curse yourself after you have to stand in the bookstore lineup for 3 hours with 17lbs of laptop on your back. Every time. Go for light if it's possib

## LLM + RAG

Ask the following questions:

* I'd like to set up a gaming computer for Black Myth: Wukong
* any recommendations for a school student with a very limited budget?

Compare the answers from the LLM without RAG and with RAG; the latter is better because of the additional knowledge provided by the Reddit data.

In [18]:
import pickle

# Reload the embeddings from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./em_batch.pickle', 'rb') as source:
    em_batch = pickle.load(source)

len(em_batch)

894

In [19]:
from langchain_ollama import OllamaLLM

# LLM served by Ollama; uses the same "llama3.1" model name as the embeddings.
model = OllamaLLM(model="llama3.1")

In [20]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders: {context} (background text given to the
# model) and {question} (the user's current question).
ptemplate = """
With the conversation context: {context}
Please answer {question}

"""

prompt = ChatPromptTemplate.from_template(ptemplate)
# Compose prompt and model; the chat functions call `chain.invoke` with a dict
# holding the "context" and "question" values.
chain = prompt | model

In [21]:
# only use LLM
def chat_with_AI():
    """Interactive chat loop that answers with the LLM alone (no retrieval).

    Reads questions from standard input until the user types "bye" (any letter
    case). Each question is sent through ``chain`` together with the dialogue
    so far, and the exchange is then appended to that dialogue.
    """
    history = ""
    print("Please say sth.")
    while (user_question := input("You: ")).lower() != "bye":
        print("User: ", user_question)
        result = chain.invoke({"context": history, "question": user_question})
        print("AI: ", result)
        history = history + f"\n User: {user_question} \n AI: {result}"

In [22]:
# Start the LLM-only chat; type "bye" to end the session.
chat_with_AI()

Please say sth.
User:  I'd like to set up a gaming computer for Black Myth: Wukong
AI:  Black Myth: Wukong is an action-adventure game with impressive visuals and demanding system requirements. To set up a gaming computer that can handle it smoothly, I'd recommend considering the following components:

1. **CPU:** A strong processor with multiple cores will help with multitasking and provide a smooth gaming experience. For Black Myth: Wukong, consider at least an Intel Core i7-12700K or AMD Ryzen 9 5900X.

2. **GPU:** The graphics card is crucial for the game's performance. An NVIDIA GeForce RTX 3070 or AMD Radeon RX 6800 XT would be a good choice. If you can afford more, going up to an RTX 3080 or RX 6900 XT would provide even better visuals and performance.

3. **RAM:** The game requires at least 16 GB of RAM, but for a smooth experience with all the features turned on, consider using 32 GB (2x16 GB) of DDR5 memory (if your CPU supports it). For older systems that support DDR4, using

In [25]:
# use the text from reddit to improve performance of chatbot
def chat_with_AI_RAG(topn: int = 10):
    """Chat loop that augments every question with retrieved Reddit chunks (RAG).

    For each question, the ``topn`` most similar chunks are fetched with
    ``find_top_similar_ones`` and passed to the model together with the
    dialogue so far. Typing "bye" (any letter case) ends the loop.

    The retrieved chunks are used for the current turn only. Previously they were
    appended to the running context labelled as ``AI:`` lines, which presented
    Reddit text to the model as its own earlier replies and kept every turn's
    chunks in the prompt forever, so the prompt kept growing with stale passages.

    Args:
        topn: Number of chunks to retrieve per question.
    """
    history = ""  # user/AI turns only; retrieved passages are never stored here
    print("Please say sth.")
    while True:
        user_question = input("You: ")
        if user_question.lower() == "bye":
            break
        print("User: ", user_question)

        top_sim_chunks = find_top_similar_ones(user_question, topn)
        # dict.fromkeys drops repeated chunk texts while keeping the ranking order.
        passages = dict.fromkeys(all_chunks[idx] for _score, idx in top_sim_chunks)
        retrieved = "".join(f"\n Reddit: {passage}" for passage in passages)

        result = chain.invoke({"context": history + retrieved, "question": user_question})
        print("AI: ", result)
        history += f"\n User: {user_question} \n AI: {result}"

In [26]:
# Start the retrieval-augmented chat; type "bye" to end the session.
chat_with_AI_RAG()

Please say sth.
User:  I'd like to set up a gaming computer for Black Myth: Wukong
AI:  A gamer's quest!

Based on our conversation context, I'm assuming you're looking to build a gaming PC that can handle modern games smoothly. For Black Myth: Wukong, which is a 3D platformer with some impressive visuals, here are the minimum and recommended system requirements:

Minimum:

* CPU: Intel Core i5-11600K or AMD Ryzen 5 5600X
* GPU: NVIDIA GeForce GTX 1060 or AMD Radeon RX 580
* RAM: 8 GB DDR4
* Storage: 128 GB SSD

Recommended:

* CPU: Intel Core i7-11700K or AMD Ryzen 9 5900X
* GPU: NVIDIA GeForce RTX 3060 or AMD Radeon RX 6800 XT
* RAM: 16 GB DDR4
* Storage: 512 GB SSD

Considering these requirements, I'd suggest building a PC with the following specs:

**CPU:** Intel Core i7-11700K (11th Gen) - A powerful and efficient CPU that can handle demanding games.
**GPU:** NVIDIA GeForce RTX 3060 (6GB GDDR6) - A popular and powerful GPU that provides smooth performance in modern games.
**RAM:**