In [13]:
import pandas as pd
import string 
from scipy.stats import chisquare
import numpy as np

jeopardy = pd.read_csv("jeopardy.csv")
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [14]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [15]:
jeopardy.columns = ["Show Number", "Air Date", "Round", "Category", "Value", "Question", "Answer"]
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [16]:
def norm_str(s):
    s = s.lower()
    translator = str.maketrans('', '', string.punctuation)
    return s.translate(translator)

jeopardy["clean_question"] = jeopardy["Question"].apply(norm_str)    
jeopardy["clean_answer"] = jeopardy["Answer"].apply(norm_str)

In [17]:
def norm_val(s):
    translator = str.maketrans('', '', string.punctuation)
    s = s.translate(translator)
    try:
        int_s = int(s)
    except:
        int_s = 0
    return int_s

jeopardy["clean_value"] = jeopardy["Value"].apply(norm_val)
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [18]:
def qa_match_ratio(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    match_count = 0
    
    split_answer = [x for x in split_answer if x != "the"]
        
    if len(split_answer) == 0:
        return 0
    
    for x in split_answer:
        if x in split_question:
            match_count += 1
    
    return match_count / len(split_answer)
        
jeopardy["answer_in_question"] = jeopardy.apply(qa_match_ratio, axis=1)
jeopardy["answer_in_question"].mean()

0.05973712438535679

The mean result shows that the answer appears in the question only 6% of the time, which means that studying the question to look for the answer probably isn't the best strategy for studyng jeopardy. 

In [19]:
jeopardy = jeopardy.sort_values("Air Date")
jeopardy.head(5)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value,answer_in_question
19325,10,1984-09-21,Final Jeopardy!,U.S. PRESIDENTS,,"Adventurous 26th president, he was 1st to ride...",Theodore Roosevelt,adventurous 26th president he was 1st to ride ...,theodore roosevelt,0,0.0
19301,10,1984-09-21,Double Jeopardy!,LABOR UNIONS,$200,Notorious labor leader missing since '75,Jimmy Hoffa,notorious labor leader missing since 75,jimmy hoffa,200,0.0
19302,10,1984-09-21,Double Jeopardy!,1789,$200,"Washington proclaimed Nov. 26, 1789 this first...",Thanksgiving,washington proclaimed nov 26 1789 this first n...,thanksgiving,200,0.0
19303,10,1984-09-21,Double Jeopardy!,TOURIST TRAPS,$200,Both Ferde Grofe' & the Colorado River dug thi...,the Grand Canyon,both ferde grofe the colorado river dug this ...,the grand canyon,200,0.0
19304,10,1984-09-21,Double Jeopardy!,LITERATURE,$200,"Depending on the book, he could be a ""Jones"", ...",Tom,depending on the book he could be a jones a sa...,tom,200,0.0


In [20]:
question_overlap = []
terms_used = set()

for i, row in jeopardy.iterrows():
    split_question = row["clean_question"].split(" ")
    split_question = [x for x in split_question if len(x) > 5]
    
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
            
    for word in split_question:
        terms_used.add(word)
        
    if len(split_question) > 0: 
        match_count /= len(split_question)
    
    question_overlap.append(match_count)

jeopardy["question_overlap"] = question_overlap
jeopardy["question_overlap"].mean()

0.687124288096678

About 69% of terms in newer question appear in older questions. Not completely indicative of the frequency of repeated questions (since the questions might ask something different despite having similar words), but worth looking into nonetheless. 

In [21]:
def high_val(row):
    if row["clean_value"] > 800:
        return 1
    return 0

jeopardy["high_value"] = jeopardy.apply(high_val, axis=1)

In [22]:
def counts(word):
    low_count = 0
    high_count = 0
    
    for i, row in jeopardy.iterrows():
        split_question = row["clean_question"].split(" ")
        if word in split_question:
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

observed_expected = []
terms_used = list(terms_used)
comparison_terms = terms_used[:5]

observed_expected = [counts(word) for word in comparison_terms]
observed_expected

[(2, 1), (0, 2), (0, 1), (1, 0), (0, 1)]

In [25]:
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

chi_squared = []

for lis in observed_expected:
    total = lis[0] + lis[1]
    total_prop = total / jeopardy.shape[0]
    high_val = total_prop * high_value_count
    low_val = total_prop * low_value_count
    
    observed = np.array([lis[0], lis[1]])
    expected = np.array([high_val, low_val])
    stat, p = chisquare(observed, expected)
    chi_squared.append((stat, p))
    
chi_squared

[(2.1177104383031944, 0.14560406868264344),
 (0.803925692253768, 0.3699222378079571),
 (0.401962846126884, 0.5260772985705469),
 (2.487792117195675, 0.11473257634454047),
 (0.401962846126884, 0.5260772985705469)]

The chi-squared test is pretty pointless because frequencies of unique words aren't that common. Significant results weren't found. Words that appear in higher frequencies would be useful to test. 