In [46]:
# David Harrison
# STC-510 Data Wrangling
# Module 5 Project


import csv
import json
import statistics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score




# Open the file and load the contents.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding (JSON interchange files are UTF-8).
with open('jeopardy.json', encoding='utf-8') as f:
    data = json.load(f)

# Print the number of questions in the dataset
print("Number of questions:", len(data))



# Extract the dollar value of each question and convert it to an integer.
# Entries whose value is None, or does not begin with '$', are skipped.
values = []
for question in data:
    value_str = question['value']
    # startswith() is safe on an empty string, whereas indexing value_str[0]
    # would raise IndexError.
    if value_str is not None and value_str.startswith('$'):
        # Drop the leading '$' and any thousands separators, e.g. '$1,200' -> 1200.
        value_int = int(value_str[1:].replace(',', ''))
        values.append(value_int)

# Find the median question value to determine high-value vs. low-value questions:
median = statistics.median(values)
print("Median question value: $", median, sep='')


Number of questions: 216930
Median question value: $600.0


In [47]:
# Define high-value and low-value questions.
# A question is "high-value" when its dollar amount is at or above `median`
# (computed from the data in the earlier cell) and "low-value" otherwise.
# Using `median` instead of a literal keeps the threshold tied to the dataset.
high_value = []
low_value = []
unparseable_values = 0  # values that are present but are not a plain dollar amount

for question in data:
    raw_value = question['value']
    if raw_value is None:
        # No dollar value recorded for this question; it cannot be classified.
        continue
    value_str = raw_value.replace('$', '').replace(',', '')
    try:
        amount = int(value_str)
    except ValueError:
        # Best-effort: skip malformed values, but keep count so they are not
        # dropped silently.
        unparseable_values += 1
        continue
    if amount >= median:
        high_value.append(question)
    else:
        low_value.append(question)

# Print the number of high-value and low-value questions
print("Number of high-value questions:", len(high_value))
print("Number of low-value questions:", len(low_value))
if unparseable_values:
    print("Number of questions skipped (unparseable value):", unparseable_values)


Number of high-value questions: 113868
Number of low-value questions: 99428


In [40]:
# Assemble the question texts and their labels. High-value questions are listed
# first, then low-value ones, so X and y stay aligned index by index.
X = [entry['question'] for entry in high_value] + [entry['question'] for entry in low_value]
y = ['high-value'] * len(high_value) + ['low-value'] * len(low_value)

# Hold out 20% of the questions for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Turn the text into token-count features. The vocabulary is fitted on the
# training split only and then reused to transform the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the Naive Bayes classifier on the training counts.
clf = MultinomialNB().fit(X_train_vec, y_train)

# Score the classifier on the held-out split; 'high-value' is the positive class
# for precision, recall and F1.
y_pred = clf.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label='high-value')
    for metric in (precision_score, recall_score, f1_score)
)

# Report each metric rounded to two decimal places.
for template, score in (
    ("Accuracy: {:.2f}", accuracy),
    ("Precision: {:.2f}", precision),
    ("Recall: {:.2f}", recall),
    ("F1 score: {:.2f}", f1),
):
    print(template.format(score))

Accuracy: 0.56
Precision: 0.61
Recall: 0.49
F1 score: 0.54


In [41]:
# Save the evaluation metrics to a CSV file: one header row followed by one
# row holding the four scores computed above.
with open('jeopardy_classifier.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows((
        ['Accuracy', 'Precision', 'Recall', 'F1 score'],
        [accuracy, precision, recall, f1],
    ))


In [42]:
# Read the results file back and print at most its first 10 rows as a check.
# newline='' leaves line-ending handling to the csv module, as its
# documentation asks for file objects passed to csv.reader.
with open('jeopardy_classifier.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row_number, row in enumerate(reader):
        # Stop after 10 rows instead of loading the whole file into a list.
        if row_number >= 10:
            break
        print(row)

['Accuracy', 'Precision', 'Recall', 'F1 score']
['0.5621894045944679', '0.6092508357538226', '0.49051358983409815', '0.5434724157317103']
