In [3]:
import json
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC  # SVM classifier 
from sklearn.svm import LinearSVC  # linear SVM that scales to large sparse inputs
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


# English stop words held in a set, so the per-token membership test in
# clean_text is a hash lookup rather than a list scan.
english_stopwords = {word for word in stopwords.words('english')}

In [6]:
def clean_text(text):
    """Normalise a question string before vectorisation.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, the result is tokenized, English stop words are
    dropped, and the remaining tokens are lemmatized.

    Args:
        text: Raw question string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    normalised = re.sub(r'[^\w\s]', '', text.lower())
    lemmatize = WordNetLemmatizer().lemmatize
    kept = []
    for word in word_tokenize(normalised):
        # Stop words are filtered before lemmatization, on the raw token.
        if word in english_stopwords:
            continue
        kept.append(lemmatize(word))
    return ' '.join(kept)

In [8]:
# Questions worth at least this many dollars are labelled 'high value'.
HIGH_VALUE_THRESHOLD = 800


def parse_dollar_value(raw_value):
    """Convert a clue value such as ``'$1,200'`` into an integer dollar amount.

    Args:
        raw_value: The record's ``value`` entry: a string with an optional
            leading ``$`` and thousands separators, a number, or None.

    Returns:
        The amount as an int, or 0 when the value is missing or is not a
        plain whole number.
    """
    if raw_value is None:
        return 0
    # Strip only an actual '$' prefix (not blindly the first character), so a
    # bare "200" is not truncated to "00".
    digits = str(raw_value).strip().lstrip('$').replace(',', '')
    # isdecimal() only accepts characters that int() can parse, whereas
    # isdigit() also accepts e.g. superscript digits that make int() raise.
    return int(digits) if digits.isdecimal() else 0


# Read the file as UTF-8 explicitly instead of relying on the platform's
# default text encoding.
with open('jeopardy.json', encoding='utf-8') as f:
    data = json.load(f)

# Cleaned question text, one entry per record.
questions = [clean_text(item['question']) for item in data]

# Binary label per record. Records with a missing or unparseable value count
# as 0 and therefore end up labelled 'low value'.
values = [
    'high value'
    if parse_dollar_value(item.get('value')) >= HIGH_VALUE_THRESHOLD
    else 'low value'
    for item in data
]


In [9]:
# Assemble the cleaned questions and their labels into a two-column frame.
df = pd.DataFrame(dict(question=questions, value=values))

In [10]:
# Preview the assembled frame; the notebook repr elides the middle rows.
df

Unnamed: 0,question,value
0,last 8 year life galileo house arrest espousin...,low value
1,2 1912 olympian football star carlisle indian ...,low value
2,city yuma state record average 4055 hour sunsh...,low value
3,1963 live art linkletter show company served b...,low value
4,signer dec indep framer constitution mass seco...,low value
...,...,...
216925,puccini opera turn solution 3 riddle posed her...,high value
216926,north america term properly applied 4 specie c...,high value
216927,penny lane hellraiser grew barber shave anothe...,high value
216928,ft sill okla made plea arizona land home fathe...,high value


In [15]:
# Hold out a test set. No test_size is given, so scikit-learn's default
# proportions apply; the fixed random_state keeps the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['question'],
    df['value'],
    random_state=1,
)

In [16]:
# Show the four splits as a tuple: train/test questions, then train/test labels.
X_train, X_test, y_train, y_test

(80347              cereal heiress marjorie post middle name
 211766    nicholas meyers west end horror take victorian...
 154215       wrote night ride mansion glory suicide machine
 162472    january 20 1981 u released 8 billion country a...
 7460      nice see musical looking swell still goin stro...
                                 ...                        
 109259                           someone church work abroad
 50057     one gabriel garcia marquez bestknown work one ...
 5192      neapolitan tenor made last public appearance c...
 208780      signed february 2 1848 treaty ended mexican war
 128037    end marriage 1877 russian composer tried catch...
 Name: question, Length: 162697, dtype: object,
 180575      type yoga sanskrit discipline force better none
 85360     4 treaty mitigate horror war signed city augus...
 133653    dec 13 1937 japan took city nanking asian coun...
 4637      island fay wray first encountered king kong th...
 23868     metropolitan museum art pa

In [19]:
# Set up a TF-IDF vectorizer with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()
# Fit on the training questions only and vectorize them in the same pass.
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
# Vectorize the test questions with the already-fitted vectorizer (transform
# only, no refitting), so nothing is learned from the test split.
X_test_tf = tfidf_vectorizer.transform(X_test)

In [None]:
# Support vector machine classifier.
# A linear SVM (LinearSVC) is used rather than the kernelised SVC: SVC's fit
# time grows at least quadratically with the number of samples, which is
# impractical for a training set of this size, while LinearSVC works directly
# on large sparse TF-IDF matrices.
# random_state matches the seed used for the train/test split.
svecm_classifier = LinearSVC(random_state=1)
svecm_classifier.fit(X_train_tf, y_train)

# Predict labels for the held-out test questions.
svecm_prediction = svecm_classifier.predict(X_test_tf)

In [None]:
# Score the SVM on the held-out questions (share of labels predicted
# correctly) and report it.
svecm_accuracy = accuracy_score(y_true=y_test, y_pred=svecm_prediction)
print('Test Accuracy:', svecm_accuracy)

In [None]:
# Neural network model: a feed-forward binary classifier over the TF-IDF
# features.
# Seed TensorFlow's RNG first so weight initialisation and dropout masks are
# repeatable across kernel restarts (some GPU kernels may still introduce
# run-to-run variation).
tf.random.set_seed(1)

model = Sequential([
    # Explicit Input object (one feature per TF-IDF vocabulary term) instead
    # of the deprecated input_shape= argument on the first layer.
    tf.keras.Input(shape=(X_train_tf.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: the estimated probability of the positive class.
    Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam with learning rate 0.001, binary cross-entropy
# loss for the single sigmoid output, and accuracy as the tracked metric.
adam_optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=adam_optimizer,
)

In [None]:
# Train the model.
# The TF-IDF matrix is sparse; calling .toarray() on all of it would allocate
# rows x vocabulary float64 cells at once and exhaust memory on a corpus of
# this size. It is therefore densified one mini-batch at a time.
BATCH_SIZE = 32
EPOCHS = 10
VALIDATION_FRACTION = 0.2


class SparseBatchSequence(tf.keras.utils.Sequence):
    """Serve a SciPy sparse matrix to Keras as dense mini-batches.

    Args:
        features: Sparse matrix with one row per sample; must support row
            indexing with an integer array (e.g. CSR).
        labels: 1-D NumPy array of targets aligned with the rows of
            ``features``.
        batch_size: Rows per batch; the final batch may be smaller.
        shuffle_rows: If True, rows are visited in a fresh random order each
            epoch.
        seed: Seed for the row-shuffling RNG.
    """

    def __init__(self, features, labels, batch_size, shuffle_rows=False, seed=1):
        super().__init__()
        self.features = features
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle_rows = shuffle_rows
        self.rng = np.random.default_rng(seed)
        self.order = np.arange(features.shape[0])
        if shuffle_rows:
            self.rng.shuffle(self.order)

    def __len__(self):
        """Return the number of batches per epoch (ceiling division)."""
        return -(-len(self.order) // self.batch_size)

    def __getitem__(self, batch_index):
        """Return ``(dense_features, labels)`` for batch ``batch_index``."""
        start = batch_index * self.batch_size
        rows = self.order[start:start + self.batch_size]
        # Only this batch's rows are converted to a dense array.
        return self.features[rows].toarray(), self.labels[rows]

    def on_epoch_end(self):
        """Reshuffle the row order between epochs when shuffling is enabled."""
        if self.shuffle_rows:
            self.rng.shuffle(self.order)


# 1 for 'high value', 0 otherwise, as a plain array aligned with X_train_tf rows.
y_train_binary = (y_train == 'high value').astype(int).to_numpy()

# Same hold-out rule as Keras' validation_split: the trailing fraction of the
# rows, taken before any shuffling, is used for validation.
n_fit_rows = int(X_train_tf.shape[0] * (1 - VALIDATION_FRACTION))
fit_batches = SparseBatchSequence(
    X_train_tf[:n_fit_rows], y_train_binary[:n_fit_rows], BATCH_SIZE, shuffle_rows=True
)
val_batches = SparseBatchSequence(
    X_train_tf[n_fit_rows:], y_train_binary[n_fit_rows:], BATCH_SIZE
)

train_info = model.fit(
    fit_batches,
    epochs=EPOCHS,
    validation_data=val_batches,
    verbose=1,
)


In [None]:
# Evaluate on the test split, store the results in test_loss and
# test_accuracy, then print the accuracy.
# The sparse TF-IDF test matrix is densified one batch at a time; a single
# .toarray() over the whole matrix would exhaust memory.
EVAL_BATCH_SIZE = 32  # same as Keras' default batch size for evaluate()


def iter_dense_eval_batches(features, labels, batch_size):
    """Yield ``(dense_features, labels)`` batches in row order.

    Args:
        features: Sparse matrix with one row per sample (row-sliceable).
        labels: 1-D NumPy array aligned with the rows of ``features``.
        batch_size: Rows per batch; the final batch may be smaller.
    """
    for start in range(0, features.shape[0], batch_size):
        stop = start + batch_size
        yield features[start:stop].toarray(), labels[start:stop]


# 1 for 'high value', 0 otherwise, as a plain array aligned with X_test_tf rows.
y_test_binary = (y_test == 'high value').astype(int).to_numpy()
# Ceiling division: number of batches needed to cover every test row once.
n_eval_batches = -(-X_test_tf.shape[0] // EVAL_BATCH_SIZE)

test_loss, test_accuracy = model.evaluate(
    iter_dense_eval_batches(X_test_tf, y_test_binary, EVAL_BATCH_SIZE),
    steps=n_eval_batches,
)

print('Test Accuracy:', test_accuracy)