#### About Dataset

Data Link - https://www.kaggle.com/datasets/tunguz/200000-jeopardy-questions/data

**Context**

- This is a dataset of 200,000+ Jeopardy! questions

**Acknowledgements**
- We wouldn't be here without the help of others. If you owe any attributions or thanks, include them here along with any citations of past research.

**Inspiration**
- Your data will be in front of the world's largest data science community. What questions do you want to see answered?

In [38]:
import re

import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /Users/zwt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zwt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/zwt/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

#### Dataset

In [54]:
jeopardy = pd.read_csv("../data/JEOPARDY_CSV.csv")

In [55]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [56]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1    Air Date    216930 non-null  object
 2    Round       216930 non-null  object
 3    Category    216930 non-null  object
 4    Value       213296 non-null  object
 5    Question    216930 non-null  object
 6    Answer      216927 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [57]:
print(f"jeopardy.columns: {jeopardy.columns.values}")
jeopardy.columns = jeopardy.columns.str.strip()

jeopardy.columns: ['Show Number' ' Air Date' ' Round' ' Category' ' Value' ' Question'
 ' Answer']


In [58]:
print("len of the empty data:", (jeopardy['Question'] == "").sum())

jeopardy = jeopardy[jeopardy['Question'].str.strip() != ""]
jeopardy = jeopardy.reset_index(drop=True)

print("after delete:", (jeopardy['Question'] == "").sum())
print("now:", len(jeopardy))

len of the empty data: 0
after delete: 0
now: 216930


In [59]:
jeopardy['Category'].value_counts(), len(jeopardy['Category'].value_counts())

(Category
 BEFORE & AFTER        547
 SCIENCE               519
 LITERATURE            496
 AMERICAN HISTORY      418
 POTPOURRI             401
                      ... 
 1999 TELEVISION         1
 WORDS IN PHYSICS        1
 LITERATURE & MUSIC      1
 '90s NOTABLES           1
 CELEBRITY NAMES         1
 Name: count, Length: 27995, dtype: int64,
 27995)

In [60]:
valid_categories = ['BEFORE & AFTER', 'SCIENCE', 'LITERATURE', 'AMERICAN HISTORY', 'POTPOURRI']
jeopardy = jeopardy[jeopardy['Category'].isin(valid_categories)]

jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2381 entries, 268 to 216622
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Show Number  2381 non-null   int64 
 1   Air Date     2381 non-null   object
 2   Round        2381 non-null   object
 3   Category     2381 non-null   object
 4   Value        2348 non-null   object
 5   Question     2381 non-null   object
 6   Answer       2381 non-null   object
dtypes: int64(1), object(6)
memory usage: 148.8+ KB


In [61]:
jeopardy['Category'].value_counts(), len(jeopardy['Category'].value_counts())

(Category
 BEFORE & AFTER      547
 SCIENCE             519
 LITERATURE          496
 AMERICAN HISTORY    418
 POTPOURRI           401
 Name: count, dtype: int64,
 5)

In [62]:
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # remove the tag HTML
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # only keep the letter A-Z and \s

    words = text.lower().split()

    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

In [66]:
def preprocess_label(text):
    return valid_categories.index(text)

In [67]:
jeopardy['question_cleaned'] = jeopardy['Question'].apply(preprocess_text)
jeopardy['label'] = jeopardy['Category'].apply(preprocess_label)

In [68]:
jeopardy.head(10)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,question_cleaned,label
268,4931,2006-02-06,Double Jeopardy!,SCIENCE,$400,"At sea level at 70 degrees this travels 1,129 ...",sound,sea level degree travel foot per second speed ...,1
274,4931,2006-02-06,Double Jeopardy!,SCIENCE,$800,"The largest tree, the General Sherman in Calif...",a sequoia,largest tree general sherman california type a...,1
280,4931,2006-02-06,Double Jeopardy!,SCIENCE,$1200,"(<a href=""http://www.j-archive.com/media/2006-...",strain,sarah clue crew read pole vault duke universit...,1
286,4931,2006-02-06,Double Jeopardy!,SCIENCE,$1600,6 elements once known as inert gases are now k...,noble gases,element known inert gas known aristocratic name,1
292,4931,2006-02-06,Double Jeopardy!,SCIENCE,$2000,"(<a href=""http://www.j-archive.com/media/2006-...",the caudal region,honeycolored retriever named max try lick cher...,1
716,2735,1996-06-21,Final Jeopardy!,AMERICAN HISTORY,,"On May 29, 1765 Patrick Henry's Stamp Act prot...",Treason!,may patrick henry stamp act protest interrupte...,3
864,4541,2004-05-10,Double Jeopardy!,POTPOURRI,$400,Some think this Irving Berlin song should repl...,"""God Bless America""",think irving berlin song replace starspangled ...,4
870,4541,2004-05-10,Double Jeopardy!,POTPOURRI,$800,Politicians often complain about having to mak...,"the ""rubber chicken"" circuit",politician often complain make appearance unap...,4
875,4541,2004-05-10,Double Jeopardy!,POTPOURRI,$1200,There are Blue & White branches of this Africa...,the Nile,blue white branch african river,4
880,4541,2004-05-10,Double Jeopardy!,POTPOURRI,$1600,VP Garret Hobart cast the deciding vote agains...,the Philippines,vp garret hobart cast deciding vote independen...,4


In [69]:
sentences = [word_tokenize(text) for text in jeopardy['question_cleaned']]
sentences[:3], len(sentences)

([['sea',
   'level',
   'degree',
   'travel',
   'foot',
   'per',
   'second',
   'speed',
   'foot',
   'per',
   'sec',
   'rising',
   'degree'],
  ['largest',
   'tree',
   'general',
   'sherman',
   'california',
   'type',
   'also',
   'called',
   'sierra',
   'redwood'],
  ['sarah',
   'clue',
   'crew',
   'read',
   'pole',
   'vault',
   'duke',
   'university',
   'track',
   'durham',
   'nc',
   'bending',
   'elastic',
   'solid',
   'stress',
   'force',
   'causing',
   'deformation',
   'letter',
   'term',
   'deformation']],
 2381)

In [None]:
model = Word2Vec(
    sentences=sentences, 
    vector_size=120, 
    window=12, 
    min_count=4,
    epochs=40,
    sg=1
)

In [100]:
len(model.wv)

1148

In [101]:
def text_to_vector(text):
    words = text.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [102]:
X = np.array([text_to_vector(text) for text in jeopardy['question_cleaned']])
y = jeopardy['label']

X, y

(array([[ 0.24186742, -0.227459  ,  0.17156176, ..., -0.36506504,
          0.16951005, -0.00785972],
        [-0.08778113, -0.2307947 , -0.07849058, ...,  0.04899906,
          0.19049947, -0.08519312],
        [ 0.05837506,  0.09077007, -0.08878925, ..., -0.03145953,
          0.14420424,  0.32458088],
        ...,
        [ 0.15071341, -0.15545227, -0.03898689, ..., -0.05893658,
          0.22997755,  0.00283652],
        [ 0.00063075,  0.1574526 , -0.09353244, ..., -0.03714421,
          0.03547193, -0.26485199],
        [-0.08388581,  0.17623174, -0.05591362, ...,  0.00445869,
         -0.00540813, -0.1245012 ]]),
 268       1
 274       1
 280       1
 286       1
 292       1
          ..
 216598    3
 216604    3
 216610    3
 216616    3
 216622    3
 Name: label, Length: 2381, dtype: int64)

In [109]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    stratify=y,
    random_state=42
)

X_train.shape

(2142, 120)

In [110]:
clf = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',
)

clf.fit(X_train, y_train)

In [112]:
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

test_f1 = f1_score(y_test, y_pred, average='macro')

print(f"test dataset F1: {test_f1:.3f}")

test dataset F1: 0.701
