In [1]:
import pandas as pd
jeopardy = pd.read_csv("jeopardy.csv", parse_dates=[" Air Date"])

In [2]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
jeopardy.columns = [ c.strip() for c in jeopardy.columns]

In [5]:
import re
def normalize(s):
    s = s.lower()
    s = re.sub(r'\W',' ', s)
    return s


In [6]:
jeopardy["clean_question"] = jeopardy["Question"].apply(normalize)

In [7]:
jeopardy["clean_answer"] = jeopardy["Answer"].apply(normalize)

In [8]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was ...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisl...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show th...,mcdonald s
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the co...,john adams


In [20]:
def norm_dollar(s):
    s = re.sub(r'\W','',s)
    if s is None:
        return 0
    try:
        s = int(s)
    except:
        return 0
    return s

jeopardy["clean_value"] = jeopardy["Value"].apply(norm_dollar)

In [26]:
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])

In [28]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was ...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisl...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show th...,mcdonald s,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the co...,john adams,200


In [33]:
def check_words(row):
    split_answer = row['clean_answer'].split(' ')
    split_question = row['clean_question'].split(' ')
    match_count = 0
    if 'the' in split_answer:
        split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for w in split_answer:
        if w in split_question:
            match_count += 1
    return match_count / len(split_answer)


In [34]:
jeopardy['answer_in_question'] = jeopardy.apply(check_words, axis=1)

In [36]:
jeopardy['answer_in_question'].mean()

0.09565366087691443

* Only 9.5% of the answers are found in the question.
* The answer is usually not deducible from the question.

In [41]:
question_overlap = []
terms_used = set()
jeopardy.sort_values(by='Air Date', ascending=True, inplace=True)

In [42]:
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [w for w in split_question if len(w) >= 6]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
        terms_used.add(word)
    if len(split_question) > 0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)
jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())

0.7217290484579508


* 72% of questions terms have been reused
* Might want to focus on studying questions that with previously used terms

In [43]:
def find_value(row):
    if row['clean_value'] > 800:
        return 1
    else:
        return 0

jeopardy['high_value'] = jeopardy.apply(find_value, axis=1)

In [44]:
def find_count(word):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count


In [46]:
import numpy as np
comparison_terms = np.random.choice(list(terms_used), 10)

In [47]:
comparison_terms

array(['diverges', 'increases', 'settlers', 'columbine', 'wearers',
       'connor', 'temperature', 'valour', 'maricopa', 'absorbed'],
      dtype='<U23')

In [48]:
observed_expected = []
for term in comparison_terms:
    observed_expected.append(find_count(term))

In [52]:
high_value_count = jeopardy[jeopardy['high_value']==1].count()[0]

In [54]:
low_value_count = jeopardy[jeopardy['high_value']==0].count()[0]

In [57]:
from scipy.stats import chisquare
chi_squared = []
for l in observed_expected:
    total = sum(l)
    total_prop = total/len(jeopardy)
    expected_high_value_count = total_prop * high_value_count
    expected_low_value_count = total_prop * low_value_count
    obs_high_value_count = l[0]
    obs_low_value_count = l[1]
    chi_squared.append(chisquare(l,[expected_high_value_count,expected_low_value_count]))
    

In [58]:
chi_squared

[Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=4.97558423439135, pvalue=0.025707519787911092),
 Power_divergenceResult(statistic=0.18383953104516373, pvalue=0.6680941623250602),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=3.423170782846152e-05, pvalue=0.9953317740648371),
 Power_divergenceResult(statistic=0.33955667615496254, pvalue=0.5600852286656143),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=2.1177104383031944, pvalue=0.14560406868264344)]

One pvalue showed as statistically significant:  0.0257