In [1]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')
print(jeopardy.head(5))

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  


In [2]:
print(jeopardy.columns)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [3]:
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

In [4]:
print(jeopardy.columns)

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


In [5]:
#Remove punctuations in text and values
import re

def normalize_text(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]", "", string)
    return string

def normalize_val(val):
    val = re.sub("[^A-Za-z0-9\s]", "", val)
    try:
        val = int(val)
    except Exception:
        val = 0
    return val

In [6]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize_text)
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize_text)
jeopardy["clean_value"] = jeopardy["Value"].apply(normalize_val)

In [7]:
print(jeopardy.head(5))

   Show Number    Air Date      Round                         Category Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY  $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES  $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...  $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE  $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES  $200   

                                            Question      Answer  \
0  For the last 8 years of his life, Galileo was ...  Copernicus   
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe   
2  The city of Yuma in this state has a record av...     Arizona   
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's   
4  Signer of the Dec. of Indep., framer of the Co...  John Adams   

                                      clean_question clean_answer  clean_value  
0  for the last 8 years of hi

# How of often can we deduce the answer from the question?

In [9]:
def count_match(row):
    split_ans = row["clean_answer"].split(" ")
    split_qs = row["clean_question"].split(" ")
    #Remove the since it occurs too frequently
    if "the" in split_ans:
        split_ans.remove("the")
    if len(split_ans) == 0:
        return 0
    match_count = 0
    for item in split_ans:
        if item in split_qs:
            match_count += 1
    return match_count / len(split_ans)

jeopardy["answer_in_question"] = jeopardy.apply(count_match, axis=1)

In [10]:
print(jeopardy["answer_in_question"].mean())

0.06049325706933587


Only 6% of answers can be deduced from the question 

# How often are new questions repeat of older ones?

In [11]:
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
        split_qs = row["clean_question"].split(" ")
        split_qs = [q for q in split_qs if len(q) > 5]
        match_count = 0
        for word in split_qs:
            if word in terms_used:
                match_count += 1
        for word in split_qs:
            terms_used.add(word)
        if len(split_qs) > 0:
            match_count /= len(split_qs)
        question_overlap.append(match_count)
jeopardy["question_overlap"] = question_overlap

jeopardy["question_overlap"].mean()

0.6908737315671962

About 70% of words are repeated in questions. These are single words and not necessarily phrases. So we need to study this further. Here we can use a chi squared statistic.

In [12]:
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)

In [13]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(1, 3), (0, 1), (2, 6), (1, 2), (1, 0)]

In [14]:
from scipy.stats import chisquare
import numpy as np

high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []
for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.02636443308440769, pvalue=0.871013484688921),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.05272886616881538, pvalue=0.818381104912348),
 Power_divergenceResult(statistic=0.03188116723440362, pvalue=0.8582887163235293),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047)]