In [1]:
import json
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import re

# Build the English stopword collection once, as a set, so clean_text can do
# fast membership checks for every token instead of rescanning a list.
english_stopwords = set(stopwords.words('english'))

In [2]:
def clean_text(text):
    """Normalize one question string for vectorization.

    Steps, in order: lowercase ``text``; delete every character that is
    neither a word character nor whitespace; tokenize with NLTK; drop tokens
    present in ``english_stopwords``; lemmatize the remaining tokens.

    Returns:
        The surviving lemmas joined by single spaces.

    NOTE(review): punctuation (including apostrophes) is removed before the
    stopword check, so any stopword that contains an apostrophe would no
    longer match its entry in ``english_stopwords`` -- confirm whether that
    matters for this corpus.
    """
    # Lowercase first, then strip anything that is not a word char or whitespace
    normalized = re.sub(r'[^\w\s]', '', text.lower())

    lemmatizer = WordNetLemmatizer()
    kept_lemmas = []
    for token in word_tokenize(normalized):
        if token in english_stopwords:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)


In [6]:

# Read the raw Jeopardy records. The encoding is given explicitly so decoding
# does not depend on the platform's default locale (JSON text is expected to
# be UTF-8).
with open('jeopardy.json', encoding='utf-8') as f:
    data = json.load(f)


# Clues worth at least this many dollars are labelled 'high value'.
HIGH_VALUE_THRESHOLD = 800


def _parse_dollar_value(raw):
    """Convert a clue value such as '$2,000' to an int.

    Returns 0 when ``raw`` is missing (None or empty) or is not a plain
    decimal amount once the '$' sign and thousands separators are removed.
    """
    if not raw:
        return 0
    # Strip the currency sign explicitly rather than slicing off the first
    # character, so a value without '$' does not lose its leading digit.
    digits = raw.strip().lstrip('$').replace(',', '')
    # isdecimal() only accepts characters that int() can actually parse.
    return int(digits) if digits.isdecimal() else 0


questions = []
values = []
for item in data:
    question = item['question']
    # A missing 'value' key and an explicit null are both treated as $0.
    value = _parse_dollar_value(item.get('value'))
    # Clean the text
    clean_question = clean_text(question)
    questions.append(clean_question)
    # Classify questions as 'high value' or 'low value' based on the points available
    values.append('high value' if value >= HIGH_VALUE_THRESHOLD else 'low value')


In [7]:
# Assemble the cleaned questions and their value labels into a two-column DataFrame
df = pd.DataFrame(
    data={'question': questions, 'value': values},
    columns=['question', 'value'],
)



In [19]:
# Inspect the labelled frame: one cleaned question and its value class per row
df

Unnamed: 0,question,value
0,last 8 year life galileo house arrest espousin...,low value
1,2 1912 olympian football star carlisle indian ...,low value
2,city yuma state record average 4055 hour sunsh...,low value
3,1963 live art linkletter show company served b...,low value
4,signer dec indep framer constitution mass seco...,low value
...,...,...
216925,puccini opera turn solution 3 riddle posed her...,high value
216926,north america term properly applied 4 specie c...,high value
216927,penny lane hellraiser grew barber shave anothe...,high value
216928,ft sill okla made plea arizona land home fathe...,high value


In [8]:
# Partition questions and labels into train and test sets. No test_size is
# passed, so scikit-learn's default proportion applies; the fixed random_state
# makes the split reproducible.
features, labels = df['question'], df['value']
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=1)

In [16]:
# Set up vectorizer using tfidf.
# The vocabulary and IDF weights are learned from the training questions only
# and then applied to the test questions, so nothing from the held-out split
# leaks into the features. This cell also defines X_train_tf / X_test_tf,
# which the classifier cell consumes.
tfidf_vectorizer = TfidfVectorizer()
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

# Full-corpus matrix and labels, kept under their original names for any cell
# that still reads them. X is produced with the already-fitted vectorizer
# (transform, not fit_transform) so the fit above is not overwritten.
X = tfidf_vectorizer.transform(questions)
y = values


In [17]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# Precondition: X_train_tf and X_test_tf (the vectorized X_train / X_test)
# must already exist in the kernel.
naive_bayes = MultinomialNB().fit(X_train_tf, y_train)
# Predict a label for every held-out question
predictions = naive_bayes.predict(X_test_tf)


In [18]:
# Share of test questions whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print('Accuracy:', accuracy)

Accuracy: 0.5854737890214445
