In [1]:
import pandas as pd

In [2]:
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
# Updating columns to proper format (removing whitespaces)
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

## Data Cleaning

In [5]:
import re

def normalize_text(text):
    return re.sub("[^A-Za-z0-9\s]", "", text).lower()

def normalize_value(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

In [6]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [7]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

## How often the answer is deducible from the question?

In [8]:
def count_matches(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    
    match_count = 0
    for word in split_answer:
        match_count += int(word in split_question)
    
    return match_count/len(split_answer)

answer_in_question = jeopardy.apply(count_matches, axis=1)

In [9]:
print("Percentage of times answer appears in questions: {}".format(answer_in_question.mean()*100))

Percentage of times answer appears in questions: 6.049325706933587


## Analysis
The answer only appears in the question about 6% of the time. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study.

## Are new questions related to past questions

In [10]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q) > 5]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap

In [11]:
print("Percentage of new questions overlapped with the past questions: {}".format(jeopardy['question_overlap'].mean()*100))

Percentage of new questions overlapped with the past questions: 69.08737315671962


## Analysis
There is about 70% overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions.

## Low value vs high value questions

In [12]:
def high_value(row):
    return int(row['clean_value'] > 800)

In [13]:
jeopardy['high_value'] = jeopardy.apply(high_value, axis=1)

In [14]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row['clean_question'].split(' '):
            high_count += row['high_value']
            low_count += 1 - row['high_value']
    
    return high_count, low_count

In [None]:
observed_expected = []
comparison_terms = list(terms_used)[:5]

for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(0, 1), (0, 2), (2, 0), (3, 6), (1, 0)]

In [None]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.40196284612688399, pvalue=0.52607729857054686),
 Power_divergenceResult(statistic=0.80392569225376798, pvalue=0.36992223780795708),
 Power_divergenceResult(statistic=4.9755842343913503, pvalue=0.025707519787911092),
 Power_divergenceResult(statistic=0.095643501703210843, pvalue=0.75712159875701002),
 Power_divergenceResult(statistic=2.4877921171956752, pvalue=0.11473257634454047)]

## Analysis
None of the terms had a significant difference in usage between high value and low value rows. Additionally, the frequencies were all lower than 5, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies.