# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
jeopardy = pd.read_csv('jeopardy.csv')
#print(jeopardy.head())
jeopardy.columns = ['Show_Number', 'Air_Date', 'Round', 'Category', 'Value', 'Question', 'Answer']
jeopardy.columns

Index(['Show_Number', 'Air_Date', 'Round', 'Category', 'Value', 'Question', 'Answer'], dtype='object')

# String Cleaning

Removing all characters from the question and answer strings that aren't lowercase alphanumeric or whitespace characters. Additionally, date values in the Air_Date column are converted to datetime objects so they can be sorted and organized more easily

In [2]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub('[^A-Za-z0-9\s]','',text)
    return text
jeopardy['clean_question'] = jeopardy['Question'].apply(clean_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(clean_text)

In [3]:
def clean_val(text):
    text = re.sub('[^A-Za-z0-9\s]','',text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text
jeopardy['clean_value'] = jeopardy['Value'].apply(clean_val)
jeopardy['Air_Date'] = pd.to_datetime(jeopardy['Air_Date'])

# Answer In Question
The counter function is designed to tablulate the number of words in an answer that are also used in a question, while excluding the word 'the'. The average number of words shared between questions and answers in this dataset is calculated to be 5.8%

If 5.8% of words in an answer match the words in a question, then it is safe to explore an alternative study source. The other 94% needs to come from somewhere else

In [4]:
def counter(row):
    split_answer = row['clean_answer'].split()
    split_question = row['clean_question'].split()
    match_count = 0
    for i in split_answer: 
        if i == 'the': split_answer.remove('the')
    if len(split_answer) == 0: return 0
    for word in split_answer:
        if word in split_question:
            match_count += 1
    return match_count / len(split_answer)
jeopardy['answer_in_question'] = jeopardy.apply(counter,axis=1)

In [5]:
print(jeopardy['answer_in_question'].mean())

0.0583474447893


# Recycled Questions
It appears that many words in questions are definitely reused, since this calculation shows nearly 70%. However, the words have not been grouped in any way, so we can't be too sure of how much of a role something like question formulation impacts this result.

It is evident, however, that further exploration in the reuse of questions is worthwhile.

In [5]:
jeopardy.sort('Air_Date',inplace=True)
terms_used = set()
question_overlap = []
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used: match_count += 1
        else: terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())

0.689403135907


# Low v. High Value Questions

Several of the sample words had p values that indicated a significant difference in distribution between low and high value questions. However, because the sample size is so small, the results are unereliable. More words should be sampled, and more questions are probably needed.

In [6]:
def valcheck(row):
    if row['clean_value'] > 800: value = 1
    else: value = 0
    return value
jeopardy['high_value'] = jeopardy.apply(valcheck,axis=1)

def valcount(word):
    low_count, high_count = 0, 0
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value']: high_count += 1
            else: low_count += 1
    return low_count, high_count

observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    observed_expected.append(valcount(term))

print(observed_expected)

[(2, 0), (1, 2), (1, 1), (3, 0), (1, 0)]


In [7]:
from scipy.stats import chisquare
import numpy as np
high_value_count = jeopardy[jeopardy['high_value']==1].shape[0]
print(high_value_count)
low_value_count = jeopardy[jeopardy['high_value']==0].shape[0]
print(low_value_count)

chi_squared = []
total = 0
for row in observed_expected:
    total += row[0] + row[1]
    total_prop = total / (high_value_count + low_value_count)

    ehi = total_prop * high_value_count
    elo = total_prop * low_value_count
    obs = np.array([row[0],row[1]])
    exp = np.array([ehi,elo])
    chi_squared.append(chisquare(obs,exp))


chi_squared

5734
14265


[(4.9755842343913503, 0.025707519787911092),
 (0.81912870034064222, 0.36543503656640786),
 (3.6985364233317939, 0.05446021891391202),
 (7.1390129054761076, 0.0075424665032683604),
 (9.3170720106541527, 0.0022702849666422693)]