In [1]:
import pandas as pd

In [2]:
from scipy.stats import chisquare

In [3]:
import numpy as np

In [4]:
jeopardy = pd.read_csv('jeopardy.csv')

In [5]:
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [6]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [7]:
jeopardy.columns = jeopardy.columns.str.strip()

In [8]:
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [9]:
import re

In [10]:
def normalize_text(s):
    s = s.lower()
    s = re.sub(r'[^\s\w]', '', s)
    return s

In [11]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_text)

In [12]:
jeopardy[['Question', 'clean_question']].head()

Unnamed: 0,Question,clean_question
0,"For the last 8 years of his life, Galileo was ...",for the last 8 years of his life galileo was u...
1,No. 2: 1912 Olympian; football star at Carlisl...,no 2 1912 olympian football star at carlisle i...
2,The city of Yuma in this state has a record av...,the city of yuma in this state has a record av...
3,"In 1963, live on ""The Art Linkletter Show"", th...",in 1963 live on the art linkletter show this c...
4,"Signer of the Dec. of Indep., framer of the Co...",signer of the dec of indep framer of the const...


In [13]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_text)

In [14]:
jeopardy[['Answer', 'clean_answer']].head()

Unnamed: 0,Answer,clean_answer
0,Copernicus,copernicus
1,Jim Thorpe,jim thorpe
2,Arizona,arizona
3,McDonald's,mcdonalds
4,John Adams,john adams


In [15]:
def normalize_currency(s):
    s = re.sub(r'[^\s\w]', '', s)
    while True:
        try:
            i = int(s)
            return i
            break
        except ValueError:
            i = 0
            return i

In [16]:
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_currency)

In [17]:
jeopardy[['Value', 'clean_value']].head()

Unnamed: 0,Value,clean_value
0,$200,200
1,$200,200
2,$200,200
3,$200,200
4,$200,200


In [18]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

In [19]:
jeopardy['Air Date'].head()

0   2004-12-31
1   2004-12-31
2   2004-12-31
3   2004-12-31
4   2004-12-31
Name: Air Date, dtype: datetime64[ns]

In [20]:
def count_matching_terms(s):
    split_answer = s['clean_answer'][0].split(' ')
    split_question = s['clean_question'][0].split(' ')
    match_count = 0
    if 'the' in split_answer:
        split_answer = split_answer.remove('the')
    if len(split_answer) == 0:
        return 0
    for i in split_answer:
        if i in split_question:
            match_count+=1
    return match_count/len(split_answer)

In [21]:
answer_in_question = jeopardy.apply(count_matching_terms, axis=1)

In [22]:
answer_in_question.value_counts()

0.0    18222
1.0     1777
dtype: int64

In [23]:
answer_in_question.mean()

0.0888544427221361

In [24]:
question_overlap = []

In [25]:
terms_used = set()

In [26]:
jeopardy.sort_values('Air Date', inplace=True)

In [27]:
jeopardy['Air Date'].head()

19325   1984-09-21
19301   1984-09-21
19302   1984-09-21
19303   1984-09-21
19304   1984-09-21
Name: Air Date, dtype: datetime64[ns]

In [28]:
for index, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(' ')
    split_question = [q for q in split_question if len(q)>5]
    match_count = 0
    for w in split_question:
        if w in terms_used:
            match_count+=1
    for w in split_question:
        terms_used.add(w)
    if len(split_question)>0:
        match_count /= len(split_question)
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap
print(jeopardy['question_overlap'].mean())
    
    

0.6876235590919739


In [29]:
def clasify_values(r):
    if r['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value

In [30]:
jeopardy['high_value'] = jeopardy.apply(clasify_values, axis=1)

In [31]:
def count_word_value_types(w):
    low_count = 0
    high_count = 0
    for index, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if w in split_question:
            if row['high_value'] == 1:
                high_count+=1
            else:
                low_count+=1
    return high_count, low_count

In [32]:
observed_expected = []

In [33]:
comparison_terms = list(terms_used)[:5]

In [34]:
for term in comparison_terms:
    result = count_word_value_types(term)
    observed_expected.append(result)

In [35]:
high_value_count = len(jeopardy[jeopardy['high_value']==1])

In [36]:
low_value_count = len(jeopardy[jeopardy['high_value']==0])

In [37]:
chi_squared = []

In [38]:
for l in observed_expected:
    total = l[0] + l[1]
    total_prop = total/len(jeopardy)
    high_values_expected = total_prop*high_value_count
    low_values_expected = total_prop*low_value_count
    observed = np.array([l[0], l[1]])
    expected = np.array([high_values_expected, low_values_expected])
    chi_squared.append(chisquare(observed, expected))

chi_squared

[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),
 Power_divergenceResult(statistic=0.10236523249330269, pvalue=0.7490095154898175),
 Power_divergenceResult(statistic=0.4448774816612795, pvalue=0.5047776487545996)]