In [None]:
# %pip install scikit-learn

In [213]:
import requests
import pandas as pd
import time

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import logging
import http.client
import os

### Crawl Trivia API

In [212]:
# cat_count = requests.get("https://opentdb.com/api_count.php?category=CATEGORY_ID_HERE")
cats = requests.get("https://opentdb.com/api_category.php").json()

cats_list = cats['trivia_categories']
cats_dict = {} # maps id to name
for cat in cats_list:
    cats_dict[cat['id']] = cat['name']
for id, name in cats_dict.items():
    print(f"[{id}] {name}")
    
cat_names = []
cat_counts = []
IDs = [id for id in list(range(9,33,1))] # [30, 29, 19, 18, 17, 12]
for id in IDs:
    cat_names += [cats_dict[id]]
    cat = [requests.get(f"https://opentdb.com/api_count.php?category={id}")]
    print(cat[0].json())
    cat_counts += [cat[0].json()]

for cat in cat_counts:
    print(f"{cats_dict[cat['category_id']]}: {cat['category_question_count']['total_question_count']}")
cat_counts = [cat['category_question_count']['total_question_count'] for cat in cat_counts]


[9] General Knowledge
[10] Entertainment: Books
[11] Entertainment: Film
[12] Entertainment: Music
[13] Entertainment: Musicals & Theatres
[14] Entertainment: Television
[15] Entertainment: Video Games
[16] Entertainment: Board Games
[17] Science & Nature
[18] Science: Computers
[19] Science: Mathematics
[20] Mythology
[21] Sports
[22] Geography
[23] History
[24] Politics
[25] Art
[26] Celebrities
[27] Animals
[28] Vehicles
[29] Entertainment: Comics
[30] Science: Gadgets
[31] Entertainment: Japanese Anime & Manga
[32] Entertainment: Cartoon & Animations
{'category_id': 9, 'category_question_count': {'total_question_count': 307, 'total_easy_question_count': 124, 'total_medium_question_count': 123, 'total_hard_question_count': 60}}
{'category_id': 10, 'category_question_count': {'total_question_count': 97, 'total_easy_question_count': 30, 'total_medium_question_count': 41, 'total_hard_question_count': 26}}
{'category_id': 11, 'category_question_count': {'total_question_count': 248, 'tot

In [214]:
cats_df = pd.DataFrame({"category": cat_names, "count": cat_counts, "id": IDs}).set_index("id")
cats_df.to_json(f"data/triviabot/questions_info.json")
cats_df

Unnamed: 0_level_0,category,count
id,Unnamed: 1_level_1,Unnamed: 2_level_1
9,General Knowledge,307
10,Entertainment: Books,97
11,Entertainment: Film,248
12,Entertainment: Music,366
13,Entertainment: Musicals & Theatres,31
14,Entertainment: Television,170
15,Entertainment: Video Games,966
16,Entertainment: Board Games,59
17,Science & Nature,230
18,Science: Computers,159


In [169]:
def fetch_questions(session_token, category_code, total_questions):
    questions = []
    print(f"Fetching {total_questions} questions for category {category_code}")
    while len(questions) < total_questions:
        batch_size = min(50, total_questions - len(questions), total_questions)
        response = requests.get(f"https://opentdb.com/api.php?amount={batch_size}&category={category_code}&token={session_token}")
        response.raise_for_status()  # This will raise an exception if the request failed
        batch_questions = response.json()['results']
        questions += batch_questions
        print(f"(Response: {response.status_code}: {len(questions)}/{total_questions} questions fetched)")
        if len(questions) < total_questions:
            time.sleep(5)  
    return questions

In [170]:
http.client.HTTPConnection.debuglevel = 0
logging.basicConfig() 
logging.getLogger().setLevel(logging.CRITICAL)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.CRITICAL) # DEBUG, INFO, WARNING, ERROR, CRITICAL
requests_log.propagate = True

token_response = requests.get("https://opentdb.com/api_token.php?command=request")
session_token = token_response.json().get('token')
print(f"Session Token: {session_token}")

questions_dict = {} # maps id to subset of questions
print(f"Start collecting trivias...")
for cat_id in list(cats_df.index):
    questions = fetch_questions(session_token, cat_id, cats_df.loc[cat_id, "count"])
    questions_dict[cat_id] = questions
    # make dir if not exist
    dir_name = f"data/triviabot/"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    pd.DataFrame(questions).to_json(f"data/triviabot/questions_{cat_id}.json")
    print(f"Downloaded {len(questions)} questions for {cats_df.loc[cat_id, 'category']}")
    time.sleep(5)  # API Limit cooldown 

Session Token: 1ca5b359d98468e15a959d18fbb62a00298b6891afdcf5b6040148931a653082
Start collecting trivias...
Fetching 313 questions for category 23
(Response: 200: 50/313 questions fetched)
(Response: 200: 100/313 questions fetched)
(Response: 200: 150/313 questions fetched)
(Response: 200: 200/313 questions fetched)
(Response: 200: 250/313 questions fetched)
(Response: 200: 300/313 questions fetched)
(Response: 200: 313/313 questions fetched)
Downloaded 313 questions for History


### Load

In [215]:
# load and concatinate all questions into one dataframe
questions_df = pd.DataFrame()
# load infos first
questions_info = pd.read_json(f"data/triviabot/questions_info.json")
for cat_id in questions_info["category"].keys():
    questions_df = pd.concat([questions_df, pd.read_json(f"data/triviabot/questions_{cat_id}.json")])
questions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4124 entries, 0 to 88
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   type               4124 non-null   object
 1   difficulty         4124 non-null   object
 2   category           4124 non-null   object
 3   question           4124 non-null   object
 4   correct_answer     4124 non-null   object
 5   incorrect_answers  4124 non-null   object
dtypes: object(6)
memory usage: 225.5+ KB


#### Count-based selection

In [217]:
def preprocess_and_extract_keywords(text):
    # Tokenize the text into words
    words = word_tokenize(text.lower())  # Convert to lowercase
    # Remove stopwords
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    # Optionally, use lemmatization to get the base form of words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return lemmatized_words

def calculate_relevance_nltk(row, keywords):
    question_keywords = preprocess_and_extract_keywords(row['question'])
    answer_keywords = preprocess_and_extract_keywords(row['correct_answer'])
    # Combine question and answer keywords
    combined_keywords = question_keywords + answer_keywords
    # Count how many user keywords appear in the combined keywords
    relevance_score = sum(keyword in combined_keywords for keyword in keywords)
    return relevance_score

user_message = "Space crafts are so cool, I would love to travel the universe in a futuristic vihicle!"
user_keywords = preprocess_and_extract_keywords(user_message)
print(f"User keywords: {user_keywords}")

questions_df['relevance_nltk'] = questions_df.apply(calculate_relevance_nltk, keywords=user_keywords, axis=1)
df_sorted_nltk = questions_df.sort_values(by='relevance_nltk', ascending=False)
print(df_sorted_nltk[['question', 'correct_answer', 'relevance_nltk']].head())

                                              question  \
11   This movie contains the quote, &quot;I love th...   
105  Which American civilization is the source of t...   
105  If you planted the seeds of Quercus robur, wha...   
144  Which slogan did the fast food company, McDona...   
196  Of the following space shooter games, which ca...   

               correct_answer  relevance_nltk  
11             Apocalypse Now               3  
105                The Mayans               2  
105                     Trees               2  
144  We Love to See You Smile               2  
196            Space Invaders               2  


#### Vectorized selection

In [218]:
# GPT generated sample messanges

user_messages = {}
user_messages[0] = "User1: Did anyone catch that recent study on neural correlates of consciousness? Fascinating how they're mapping brain activity to subjective experiences."
user_messages[1] = "User2: Yes, I saw that! It's incredible. Makes me wonder how far we are from truly understanding what consciousness is at a neural level."
user_messages[2] = "User3: Speaking of brain activity, I've been diving into how AI might replicate aspects of human cognition. The parallels and differences are quite intriguing."
user_messages[3] = "User4: That's a great point, User3. I'm curious about the ethical implications of AI that can mimic cognitive functions. Where do we draw the line between utility and privacy?"
user_messages[4] = "User5: Has anyone here experimented with brain-computer interfaces (BCIs)? I'm working on a project that explores how BCIs can aid in neurorehabilitation."
user_messages[5] = "User6: User5, that sounds amazing! I believe BCIs hold so much potential, especially for people with mobility issues. The future of prosthetics is bright."
user_messages[6] = "User7: Switching gears a bit, but still related - how do you all feel about the role of cognitive science in designing user interfaces? I think there's a lot we can apply from understanding perception and attention."
user_messages[7] = "User1: Absolutely, User7. The principles of cognitive load theory have been crucial in my work on educational apps. It's all about making information accessible and digestible."
user_messages[8] = "User2: That ties back to User4's point about ethics, doesn't it? Designing with the user's cognitive well-being in mind is paramount, especially as technology becomes more integrated into our lives."
user_messages[9] = "User3: Definitely, and it extends beyond just UI/UX design. Consider virtual reality - the potential for immersive learning is huge, but so are the challenges in ensuring it's not overwhelming."
user_messages[10] = "User4: Speaking of immersive experiences, has anyone explored the concept of digital twins in cognitive science research? Simulating real-world environments could revolutionize behavioral studies."
user_messages[11] = "User5: I'm intrigued by the interdisciplinary approaches in our field. From computational models to philosophical debates on mind and AI, cognitive science really is at the frontier of exploring human intelligence."
user_messages[12] = "User6: For sure, User5. It's that intersection of disciplines that provides us with a unique lens to examine the complexities of the mind and technology."
user_messages[13] = "User7: Before we wrap up, I'd love to hear everyone's thoughts on the future of cognitive science. Where do you see the field heading in the next decade?"
user_messages[14] = "User1: Great question, User7. I think we'll see more integration of cognitive science with technology, particularly in understanding and enhancing human-machine collaboration."
user_messages[15] = "User2: I'm optimistic about breakthroughs in understanding consciousness. With advancements in neuroimaging and computational neuroscience, we're poised for some exciting discoveries."
user_messages[16] = "User3: And let's not forget the potential for cognitive science to impact education. Personalized learning based on cognitive principles could transform how we teach and learn."
user_messages[17] = "User4: Ethically, I hope we'll see a greater emphasis on responsible AI development. Cognitive scientists have a big role to play in shaping how AI evolves in a way that benefits society."
user_messages[18] = "User5: On the practical side, I expect further advancements in neurotechnology, like BCIs, to improve quality of life for individuals with neurological conditions."
user_messages[19] = "User6: Echoing User5, the therapeutic applications are what excite me the most. There's so much potential for cognitive science to contribute to mental health and rehabilitation."
user_messages[20] = "User7: It's inspiring to see everyone's passion and optimism for the future. Cognitive science indeed holds the key to many of the mysteries of the mind and technology. Let's continue to push the boundaries of what we know."
# user_messages[20] = "User21: Heard about the new AI that can solve complex mathematical equations in seconds. What a time to be alive!"
user_messages[21] = "User22: Planning a trip to the natural history museum. Can't wait to see the dinosaur exhibit."
user_messages[22] = "User23: How does augmented reality work? It seems like magic to me."
user_messages[23] = "User24: I've been playing this educational game about ancient civilizations. Learning history has never been so fun."
user_messages[24] = "User25: Trying to understand the basics of machine learning. Any advice on where to start?"
user_messages[25] = "User26: What's the significance of the Higgs boson? I've read about it but still can't wrap my head around it."
user_messages[26] = "User27: Just upgraded my telescope. The clarity with which I can now see the moon's surface is astonishing."
user_messages[27] = "User28: Anyone interested in joining a hackathon focused on environmental tech?"
user_messages[28] = "User29: How do scientists predict weather patterns with such accuracy these days?"
user_messages[29] = "User30: Just got a new drone with a camera. The aerial shots I can take for my nature blog are going to be epic."
user_messages[30] = "User31: Has anyone used augmented reality for DIY projects? It seems like it could be really helpful."
user_messages[31] = "User32: What are the mathematical principles behind cryptography? I find the idea of secure communication fascinating."
user_messages[32] = "User33: I'm amazed by the potential of gene editing. The future of medicine is going to be revolutionary."
user_messages[33] = "User34: How does blockchain technology actually work? I know it's behind cryptocurrencies, but what makes it secure?"
user_messages[34] = "User35: Started learning about robotics. It's incredible how robots are being designed to navigate complex environments."
user_messages[35] = "User36: Anyone else obsessed with space exploration documentaries? I can't get enough of them."
user_messages[36] = "User37: I've been experimenting with solar panels for small DIY projects. It's a great way to learn about renewable energy."
user_messages[37] = "User38: Just read an article on the psychology of video games. It's interesting how games can affect our brains."
user_messages[38] = "User39: What's the difference between augmented reality and virtual reality? I always get them mixed up."
user_messages[39] = "User40: Trying to figure out the best setup for a home science lab. Any suggestions on essential equipment?"
user_messages[40] = "User41: How are scientists using AI to explore the oceans? The depths of the sea are like another world."
user_messages[41] = "User42: I'm blown away by the detail in satellite images of Earth. You can see so much from up there!"
user_messages[42] = "User43: What are the latest advancements in biotechnology? It seems like a field that's moving incredibly fast."
user_messages[43] = "User44: Just attended a webinar on nanotechnology. The applications for medicine and electronics are mind-boggling."
user_messages[44] = "User45: How do electric cars work? I understand they're better for the environment, but what powers them?"
user_messages[45] = "User46: Has anyone tried out the new educational VR games? They seem like a fun way to learn new subjects."
user_messages[46] = "User47: What are the challenges of creating life-like animations in movies and games?"
user_messages[47] = "User48: I'm fascinated by the concept of smart homes. Imagine your house automatically adjusting to your preferences."
user_messages[48] = "User49: Does anyone know how to get started with amateur astronomy? I'd love to start observing the stars."
user_messages[49] = "User50: Heard about the breakthrough in battery technology? Longer life and faster charging could change everything."
user_messages[50] = "User51: Anyone else excited for the upcoming World Cup? The teams this year look incredibly strong."
user_messages[51] = "User52: Just finished reading 'Dune.' The depth of the universe Frank Herbert created is astonishing."
user_messages[52] = "User53: Has anyone visited the Grand Canyon? It's on my bucket list, and I'm looking for travel tips."
user_messages[53] = "User54: What's your all-time favorite board game? I'm looking to add something new to game night."
user_messages[54] = "User55: I've been getting into Greek mythology lately. The stories of gods and heroes are so captivating."
user_messages[55] = "User56: Can't wait for the new 'Star Wars' series. The trailers look promising. What are your thoughts?"
user_messages[56] = "User57: What was the first video game you ever played? Mine was 'Super Mario Bros.' on the NES."
user_messages[57] = "User58: Just watched 'Hamilton' for the first time. It's amazing how it brings history to life through music."
user_messages[58] = "User59: Anyone into documentaries? I'd love some recommendations, especially anything related to nature or history."
user_messages[59] = "User60: I'm planning a trip to Japan. Any must-see places or local foods I should try?"
user_messages[60] = "User61: What's the most interesting fact you've learned recently? I love collecting random tidbits of knowledge."
user_messages[61] = "User62: Who's your favorite fictional detective? I'm torn between Sherlock Holmes and Hercule Poirot."
user_messages[62] = "User63: The evolution of special effects in movies is incredible. From practical effects to CGI, the progress is mind-blowing."
user_messages[63] = "User64: I'm trying to get into chess. Does anyone have tips for beginners?"
user_messages[64] = "User65: How do you think virtual reality will impact the future of video gaming?"
user_messages[65] = "User66: Just started watching 'The Crown.' It's fascinating to get a dramatized glimpse into the lives of the British royal family."
user_messages[66] = "User67: I've been exploring local hiking trails. It's a great way to appreciate the geography and nature around us."
user_messages[67] = "User68: Has anyone tried learning a language through apps? I'm curious about how effective they are."
user_messages[68] = "User69: What's the most challenging video game you've ever played? For me, it's 'Dark Souls.'"
user_messages[69] = "User70: I love how mythology influences modern literature and movies. The themes are timeless."
user_messages[70] = "User71: Anyone else a fan of classic films? I'm talking about movies from the 40s and 50s."
user_messages[71] = "User72: What sports do you play? I've been getting into tennis lately."
user_messages[72] = "User73: If you could visit any fictional world, where would you go? I'd love to explore Middle-earth."
user_messages[73] = "User74: Does anyone collect anything interesting? I've started collecting vintage postcards."
user_messages[74] = "User75: I'm amazed by the complexity and strategy involved in modern board games compared to the classics."
user_messages[75] = "User76: Just saw a Broadway show for the first time. The talent and production quality blew me away."
user_messages[76] = "User77: How do you stay updated with all the new TV series coming out? There's just so much to watch!"
user_messages[77] = "User78: What's your favorite myth or legend? I'm always looking for new stories to dive into."
user_messages[78] = "User79: Who's your favorite athlete? It's incredible to see what humans are capable of achieving."
user_messages[79] = "User80: Has anyone visited the pyramids in Egypt? It's incredible how they were built thousands of years ago."
user_messages[80] = "User81: I'm looking for book recommendations. Any genre is welcome, but I especially love mystery and sci-fi."
user_messages[81] = "User82: Just completed a marathon of 'The Office.' It's amazing how the show remains relevant and funny."
user_messages[82] = "User83: Anyone else love trivia games? They're a fun way to learn new facts and challenge friends."
user_messages[83] = "User84: What's the most beautiful place you've ever visited? For me, it's the Swiss Alps."
user_messages[84] = "User85: I'm fascinated by ancient civilizations. The more we learn, the more questions arise."
user_messages[85] = "User86: Has anyone tried virtual escape rooms? They're surprisingly fun and a great way to solve puzzles with friends online."
user_messages[86] = "User87: What's the hardest board game you've ever played? I'm looking for a new challenge."
user_messages[87] = "User88: I'm curious about how different cultures influence sports. For example, the history of martial arts is so diverse."
user_messages[88] = "User89: Does anyone have a favorite comic book series? I'm looking to start a new one."
user_messages[89] = "User90: What's your go-to game for game night? I'm always looking to add more to the collection."
user_messages[90] = "User91: I've been researching my family tree and it's fascinating to uncover stories from the past."
user_messages[91] = "User92: Just started learning about the stars and constellations. It's amazing what you can see with a good telescope."
user_messages[92] = "User93: Who's your favorite character from mythology? There are so many interesting stories across different cultures."
user_messages[93] = "User94: I'm always in awe of Olympic athletes. The dedication and discipline it takes to compete at that level is incredible."
user_messages[94] = "User95: Anyone else a fan of detective novels? There's something about a good mystery that's irresistible."
user_messages[95] = "User96: What's your favorite historical era? I'm particularly interested in the Renaissance for its advancements in art and science."
user_messages[96] = "User97: Has anyone been on a safari? Witnessing wildlife in their natural habitat must be an unforgettable experience."
user_messages[97] = "User98: I'm looking for a new strategy video game. Any suggestions?"
user_messages[98] = "User99: Does anyone know a good resource for learning about world geography? I want to improve my knowledge."
user_messages[99] = "User100: What's your favorite musical? I'm looking to see something new and could use some recommendations."
user_messages[100] = "User101: Did anyone catch the latest football match? The final goal was spectacular!"
user_messages[101] = "User102: I'm currently obsessed with the myths surrounding Olympus. The stories of gods and goddesses are so intriguing."
user_messages[102] = "User103: Planning a road trip across the United States. Any must-see geographical wonders?"
user_messages[103] = "User104: Recently delved into the history of the Roman Empire. It's fascinating how it shaped modern civilization."
user_messages[104] = "User105: Thoughts on the current political climate? It feels like we're at a major turning point."
user_messages[105] = "User106: I've started exploring abstract art. It's amazing how it evokes emotions through colors and shapes."
user_messages[106] = "User107: Just saw a movie about the life of a famous celebrity. It's interesting to see the person behind the persona."
user_messages[107] = "User108: Did you know that octopuses have three hearts? Marine animals are truly fascinating."
user_messages[108] = "User109: Anyone into vintage cars here? I love the design and craftsmanship of old vehicles."
user_messages[109] = "User110: Just read the latest superhero comic. The storyline is getting intense!"
user_messages[110] = "User111: Excited about the new tech gadget release. The features are supposed to be groundbreaking."
user_messages[111] = "User112: Who's watching the new anime series? The animation quality is top-notch."
user_messages[112] = "User113: Grew up watching classic cartoons. They don't make them like that anymore, do they?"
user_messages[113] = "User114: I'm amazed by the endurance of marathon runners. The training and dedication required is extraordinary."
user_messages[114] = "User115: Reading up on Greek mythology. The tale of Achilles is both heroic and tragic."
user_messages[115] = "User116: Ever visited the Sahara Desert? The vastness and beauty of it are mesmerizing."
user_messages[116] = "User117: Currently reading a book on medieval history. The strategies and battles are incredibly detailed."
user_messages[117] = "User118: Discussing political theories can be quite enlightening. It's interesting to see different perspectives."
user_messages[118] = "User119: Anyone a fan of Renaissance art? The attention to detail is simply stunning."
user_messages[119] = "User120: Saw a documentary on famous celebrities from the 20th century. Their impact on culture is undeniable."
user_messages[120] = "User121: Just adopted a rescue dog. It's heartwarming to give them a second chance at life."
user_messages[121] = "User122: The engineering behind electric cars is fascinating. It's great to see advancements in eco-friendly transportation."
user_messages[122] = "User123: Recently got into comic book collecting. There's a whole world of stories out there."
user_messages[123] = "User124: The evolution of smartphones is incredible. Remember when they just made calls and sent texts?"
user_messages[124] = "User125: Anyone else a fan of Studio Ghibli films? The storytelling and animation are magical."
user_messages[125] = "User126: Childhood cartoons hold such nostalgia. Shows like 'Tom and Jerry' were so simple yet so entertaining."
user_messages[126] = "User127: The tactics and teamwork in soccer are what make it the beautiful game."
user_messages[127] = "User128: Norse mythology has some of the most fascinating tales. Thor and Loki's adventures are legendary."
user_messages[128] = "User129: Has anyone climbed Mount Everest? The challenge and danger involved are beyond my comprehension."
user_messages[129] = "User130: The French Revolution was such a pivotal moment in history. The rise and fall of powers were dramatic."
user_messages[130] = "User131: Political debates can get heated, but they're necessary for democracy."
user_messages[131] = "User132: Modern art can be divisive, but it's all about personal interpretation."
user_messages[132] = "User133: The influence of celebrities on fashion trends is undeniable. They often set the new norms."
user_messages[133] = "User134: I'm always amazed by birds of prey. Their hunting skills are incredible."
user_messages[134] = "User135: The thrill of driving a high-performance sports car is unmatched."
user_messages[135] = "User136: Reading a graphic novel can be just as engaging as a traditional book, if not more."
user_messages[136] = "User101: Who else is fascinated by the myth of Atlantis? The theories about its location are intriguing."
user_messages[137] = "User102: Just watched a documentary on the history of the Olympic Games. It's amazing how it evolved over time."
user_messages[138] = "User103: Planning a road trip across the USA. Any must-see geographical landmarks?"
user_messages[139] = "User104: What's your favorite historical figure? I admire Leonardo da Vinci for his diverse talents."
user_messages[140] = "User105: Thoughts on the current political climate? It feels like we're at a major turning point globally."
user_messages[141] = "User106: I visited the Louvre last year. Seeing the Mona Lisa in person was a surreal experience."
user_messages[142] = "User107: Anyone else follow celebrity environmental activists? Their platforms can really highlight important issues."
user_messages[143] = "User108: What's the most exotic animal you've ever seen in the wild? For me, it was a sloth in Costa Rica."
user_messages[144] = "User109: I'm restoring an old Mustang. It's a labor of love, but seeing it come together is rewarding."
user_messages[145] = "User110: Who's your favorite superhero? I've always been a fan of Spider-Man for his relatability."
user_messages[146] = "User111: Just got the latest smart home gadget. It's incredible how it can manage everything from lights to security."
user_messages[147] = "User112: Has anyone watched 'Attack on Titan'? The storytelling and character development are top-notch."
user_messages[148] = "User113: 'Looney Tunes' was a staple of my childhood. Which character was your favorite?"
user_messages[149] = "User114: I'm amazed by ancient mythology's influence on modern entertainment, from movies to video games."
user_messages[150] = "User115: What's your favorite sports moment in history? Mine is the 'Miracle on Ice' in 1980."
user_messages[151] = "User116: Has anyone been to Machu Picchu? The historical and geographical significance is fascinating."
user_messages[152] = "User117: Discussing politics can be divisive, but it's crucial for understanding societal dynamics and changes."
user_messages[153] = "User118: I'm trying to learn more about art history. Any suggestions on where to start?"
user_messages[154] = "User119: Celebrity chefs have really brought attention to culinary arts. Who's your favorite and why?"
user_messages[155] = "User120: Going on a safari has been a dream of mine. Observing animals in their natural habitat must be incredible."
user_messages[156] = "User121: Electric vehicles are the future. Tesla's innovations are just the beginning of what's possible."
user_messages[157] = "User122: Comics have evolved so much from simple strips to complex narratives that tackle real-world issues."
user_messages[158] = "User123: The advancements in gadgets over the last decade have fundamentally changed our daily lives. What's your favorite tech?"
user_messages[159] = "User124: Japanese anime has a unique way of exploring complex themes through animation. Any series recommendations?"
user_messages[160] = "User125: 'Tom and Jerry' never gets old. The classic cat and mouse chase is timeless entertainment."
user_messages[161] = "User126: The mythology surrounding dragons is present in so many cultures. What's your favorite dragon legend?"
user_messages[162] = "User127: Just attended a local sports game. There's something about live events that TV just can't capture."
user_messages[163] = "User128: Anyone into geocaching? It's a fun way to explore geography and go on mini-adventures."
user_messages[164] = "User129: I love reading about medieval history. The castles, the battles, the politics—it's all so fascinating."




# for i in range(len(user_messages)):
#     user_keywords = preprocess_and_extract_keywords(user_messages[i])
#     user_messages[i] = [user_messages[i], user_keywords]
#     print(f"User keywords: {user_keywords}")

In [219]:
# len 4124, 966

# set new index
questions_df = questions_df.reset_index(drop=True)


In [221]:
questions_df.head()

Unnamed: 0,type,difficulty,category,question,correct_answer,incorrect_answers,relevance_nltk
0,multiple,medium,General Knowledge,The website &quot;Shut Up &amp; Sit Down&quot;...,Board Games,"[Television Shows, Video Games, Films]",0
1,multiple,easy,General Knowledge,Which sign of the zodiac comes between Virgo a...,Libra,"[Gemini, Taurus, Capricorn]",0
2,multiple,easy,General Knowledge,Which best selling toy of 1983 caused hysteria...,Cabbage Patch Kids,"[Transformers, Care Bears, Rubik&rsquo;s Cube]",1
3,multiple,medium,General Knowledge,Where did the pineapple plant originate?,South America,"[Hawaii, Europe, Asia]",0
4,multiple,easy,General Knowledge,Which of the following is not the host of a pr...,Ben Shapiro,"[Terry Gross, Ira Glass, Peter Sagal]",0


In [211]:
questions_df.to_json(f"data/triviabot/questions.json")

In [222]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

trivia_texts = questions_df['question'] + " " + questions_df['correct_answer']  # Combine question and answer
# All texts need to be vectorized together to ensure a consistent feature space
# all_texts = [user_messages[0][0]] + trivia_texts.tolist()
# TF-IDF vectors 
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectors = vectorizer.fit_transform(trivia_texts.tolist())
print(f"TF-IDF vectors shape: {type(tfidf_vectors), tfidf_vectors.shape}")
questions_df['tfidf_vector'] = [vec.toarray()[0] for vec in tfidf_vectors]

new_message = [list(user_messages.values())[-1]]
print(new_message)
tfidf_vector_new_message = vectorizer.transform(new_message)

# Cosine similarity 
tfidf_vectors_series = csr_matrix(questions_df['tfidf_vector'].tolist())
cosine_similarities = cosine_similarity(tfidf_vector_new_message, tfidf_vectors_series) 
questions_df['cosine_similarity'] = cosine_similarities.flatten()
df_sorted_by_cosine = questions_df.sort_values(by='cosine_similarity', ascending=False)
print(df_sorted_by_cosine[['question', 'correct_answer', 'cosine_similarity']].head())

TF-IDF vectors shape: (<class 'scipy.sparse._csr.csr_matrix'>, (4124, 10107))
["User129: I love reading about medieval history. The castles, the battles, the politics—it's all so fascinating."]
                                               question        correct_answer  \
2017  What animal is featured in &quot;Bloons TD Bat...               Monkeys   
3175  Which of the following battles is often consid...  Battle of Adrianople   
1103  Klingons once had a period of Democracy in the...                  True   
3295  What is the bloodiest event in United States h...    Battle of Antietam   
3496  In United States history, how many vice presid...                     3   

      cosine_similarity  
2017           0.231750  
3175           0.187452  
1103           0.169640  
3295           0.161919  
3496           0.161007  


In [223]:
# print(type(questions_df["tfidf_vector"]), questions_df["tfidf_vector"].shape, len(questions_df["tfidf_vector"][0]), type(questions_df["tfidf_vector"][0]))
# print(type(tfidf_vectors[1:]), tfidf_vectors[1:].shape, tfidf_vectors[0].shape, type(tfidf_vectors[0]))

In [228]:
cosine_similarity_threshold = 0.25

# Check if the top match is above the threshold
if df_sorted_by_cosine.iloc[0]['cosine_similarity'] >= cosine_similarity_threshold:
    print("Relevant question found:", df_sorted_by_cosine.iloc[0]['question'])
else:
    print("No sufficiently relevant question found.")
    
    
for i, m in enumerate(list(user_messages.values())[:21]):
    tfidf_vector_new_message = vectorizer.transform([m])
    tfidf_vectors_series = csr_matrix(questions_df['tfidf_vector'].tolist())
    cosine_similarities = cosine_similarity(tfidf_vector_new_message, tfidf_vectors_series) 
    questions_df['cosine_similarity'] = cosine_similarities.flatten()
    
    # most similar question
    question, answer, incorrect_answers, type_of_question, similarity = questions_df.sort_values(by='cosine_similarity', ascending=False)[['question', 'correct_answer', 'incorrect_answers', 'type', 'cosine_similarity']].head(1).values[0]
    
    if similarity < cosine_similarity_threshold:
        print(f"User Input: {i} - {m}")
    else: 
        print(f"User Input: {i} - {m}")
        print(f"    Most similar question: {question}")
        if type_of_question == "multiple":
            print(f"    Incorrect answers: {incorrect_answers}")
            for answer in incorrect_answers:
                print(f"    -> {answer}")
            print(f"    Correct answer:")
            print(f"    -> {answer}")
        else: 
            print(f"    Correct answer: {answer}")
        print(f"    Cosine similarity: {similarity}")
    

No sufficiently relevant question found.
User Input: 0 - User1: Did anyone catch that recent study on neural correlates of consciousness? Fascinating how they're mapping brain activity to subjective experiences.
User Input: 1 - User2: Yes, I saw that! It's incredible. Makes me wonder how far we are from truly understanding what consciousness is at a neural level.
User Input: 2 - User3: Speaking of brain activity, I've been diving into how AI might replicate aspects of human cognition. The parallels and differences are quite intriguing.
    Most similar question: The most frequent subconscious activity repeated by the human body is blinking.
    Correct answer: False
    Cosine similarity: 0.26613174703493175
User Input: 3 - User4: That's a great point, User3. I'm curious about the ethical implications of AI that can mimic cognitive functions. Where do we draw the line between utility and privacy?
User Input: 4 - User5: Has anyone here experimented with brain-computer interfaces (BCIs)?