# Winning  Jeopardy

In [1]:
import pandas as pd
jeopardy = pd.read_csv('jeopardy.csv')

print(jeopardy.head())
print(jeopardy.columns)

   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype

In [2]:
jeopardy.columns = jeopardy.columns.str.strip()
print(jeopardy.columns)

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')


# Normalizing text

In [3]:
import re

def normalizing_string(string):
    string = string.lower()
    string = re.sub("[^A-Za-z0-9\s]", ' ', string)
    return string

In [4]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalizing_string)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalizing_string)

In [5]:
def normalize_numeric(value):
    value = re.sub("[^A-Za-z0-9\s]", '', value)
    try:
        value = int(value)
    except Exception:
        value = 0
    return value

In [6]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_numeric)

In [7]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer,clean_value
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was ...,copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisl...,jim thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show th...,mcdonald s,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the co...,john adams,200


In [8]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [9]:
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

# Answers in questions

* How often the answer is deducible from the question.
* How often new questions are repeats of older questions.

In [10]:
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)

In [11]:
jeopardy['answer_in_question'].mean()

0.09611447756854266

問題文中に答えが含まれているのは、全体のおよそ9.6％であった。1割弱は問題を聴くだけで正解することができるようになるということだが、これだけのために膨大な過去問を研究するのは効率的ではないと考える。次に、同じ問題がどれだけ再利用されているかを求める

# Recycled questions

In [12]:
question_overlap = []
terms_used = set()

jeopardy = jeopardy.sort_values('Air Date')
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(" ")
    split_question = [q for q in split_question if len(q) > 6]
    match_count = 0
    for word in split_question:
        if word in terms_used:
            match_count += 1
    for word in split_question:
        terms_used.add(word)
        
    if len(split_question) > 0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
    
jeopardy['question_overlap']= question_overlap
jeopardy['question_overlap'].mean()    

0.6590534345509593

新しい問題の単語の約66％が過去の問題の単語と重複していることがわかった。ただし、問題の内容ではなく、文章中の各単語の一致を調べただけなので、この分析から過去の問題を再利用していると断言できないが、さらに検討する価値はあると考える。

# Low value vs high value questions

In [13]:
def determine_value(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value

In [15]:
jeopardy['high_value'] = jeopardy.apply(determine_value, axis=1)

In [16]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(" ")
        if term in split_question:
            if row['high_value'] == 1:
                high_count += 1
            else:
                low_count += 1
                
    return high_count, low_count

In [17]:
observed_expected = []
comparison_terms = list(terms_used)[:5]

In [19]:
for term in comparison_terms:
    high_low_count = count_usage(term)
    observed_expected.append(high_low_count)
    
observed_expected

[(0, 1), (0, 2), (0, 1), (0, 2), (0, 2)]

# Applying the chi-squared test

In [25]:
high_value_count = jeopardy[jeopardy['high_value'] == 1].shape[0]
low_value_count = jeopardy[jeopardy['high_value'] == 0].shape[0]

chi_squared = []  

from scipy.stats import chisquare
import numpy as np

for obs in observed_expected:
    total = sum(obs)
    total_prop = total / jeopardy.shape[0]
    high_value_exp = total_prop * high_value_count
    low_value_exp = total_prop * low_value_count
    
    observed = np.array([obs[0], obs[1]])
    expected = np.array([high_value_exp, low_value_exp])
    
    chi_squared.append(chisquare(observed, expected))
    
chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),
 Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571)]

どの単語も高額賞金と少額賞金の問題の間で使用量にはっきりとした違いはない。度数がいずれも2以下でchi-squaredが有効ではない、使用回数のより多い単語でテストすべきである・