In [75]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import scipy.stats as stats
#%matplotibinline

In [76]:
jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [77]:
columns = jeopardy.columns.tolist()
jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']

## Normalizing Text

In [111]:
# Remove punctuation and make string all lowercase
def normazile_text(text):
    text = text.lower()
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    return text

In [112]:
jeopardy['clean_question'] = jeopardy['Question'].apply(normazile_text)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normazile_text)

## Normalizeing Columns

In [113]:
# Convert datatype of  the "value" column to numeric
# and the "air date" column to datetime
def normalize_money(text):
    text = re.sub("[^A-Za-z0-9\s]", "", text)
    try:
        text = int(text)
    except Exception:
        text = 0
    return text

jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_money)
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])

## Answers in questions

In [124]:
def count_matches(row):
    split_answer = row["clean_answer"].split(" ")
    split_question = row["clean_question"].split(" ")
    if "the" in split_answer:
        split_answer.remove("the")
    if len(split_answer) == 0:
        return 0
    match_count = 0
    for item in split_answer:
        if item in split_question:
            match_count += 1
    return match_count / len(split_answer)

jeopardy["answer_in_question"] = jeopardy.apply(count_matches, axis=1)

In [125]:
jeopardy['answer_in_question'].mean()

0.060493257069335872

About 6% of cells have a word from the answer in the question. learning to slove questions from the question itself doesn;t seem like the  best strategy.  Let's examin other strategies.

## Recycling Questions

In [126]:
question_overlap = []
terms_used = set()

for index, row in jeopardy.iterrows():
    split_question =  row['clean_question'].split(' ')
    split_question = [i for i in split_question if len(i) > 5]
    match_count = 0
    for i in split_question:
        if i in terms_used:
            match_count = match_count + 1
    for i in split_question:
        terms_used.add(i)
    if len(split_question) > 0:
        match_count = match_count/len(split_question)
    question_overlap.append(match_count)
    
jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap'].mean()

0.69087373156719623

Almost 70% of questions have been asked before in some fashion. Sudying all previouse question might be a good strategy.

## Low vs High Value Questions

In [127]:
def determine_value(row):
    value = 0
    if row["clean_value"] > 800:
        value = 1
    return value

jeopardy["high_value"] = jeopardy.apply(determine_value, axis=1)

In [129]:
def count_usage(term):
    low_count = 0
    high_count = 0
    for i, row in jeopardy.iterrows():
        if term in row["clean_question"].split(" "):
            if row["high_value"] == 1:
                high_count += 1
            else:
                low_count += 1
    return high_count, low_count

comparison_terms = list(terms_used)[:5]
observed_expected = []
for term in comparison_terms:
    observed_expected.append(count_usage(term))

observed_expected

[(0, 1), (1, 0), (1, 5), (0, 1), (0, 1)]

In [136]:
high_value_count = jeopardy[jeopardy["high_value"] == 1].shape[0]
low_value_count = jeopardy[jeopardy["high_value"] == 0].shape[0]

In [137]:
high_value_count

5734

In [138]:
low_value_count

14265

In [167]:
chi_squared = []
observed_expected

[(0, 1), (1, 0), (1, 5), (0, 1), (0, 1)]

In [168]:
for observed in observed_expected:
    total = sum(observed)
    total_prop =  total / jeopardy.shape[0]
    expected_high = total_prop * high_value_count
    expected_low = total_prop * low_value_count
    
    obs = np.array([observed[0], observed[1]])
    exp = np.array([expected_high, expected_low])
    chi, p =stats.chisquare(obs, exp)
    chi_squared.append([chi,p])

chi_squared

[[0.40196284612688399, 0.52607729857054686],
 [2.4877921171956752, 0.11473257634454047],
 [0.42281054506129573, 0.51553795812945302],
 [0.40196284612688399, 0.52607729857054686],
 [0.40196284612688399, 0.52607729857054686]]

## Next Steps

Here are some potential next steps:

- Find a better way to eliminate non-informative words than just removing words that are less than 6 characters long. Some ideas:
    - Manually create a list of words to remove, like the, than, etc.
    - Find a list of stopwords to remove.
    - Remove words that occur in more than a certain percentage (like 5%) of questions.

- Perform the chi-squared test across more terms to see what terms have larger differences. This is hard to do currently because the code is slow, but here are some ideas:
    - Use the apply method to make the code that calculates frequencies more efficient.
    - Only select terms that have high frequencies across the dataset, and ignore the others.

- Look more into the Category column and see if any interesting analysis can be done with it. Some ideas:
    - See which categories appear the most often.
    - Find the probability of each category appearing in each round.

- Use the whole Jeopardy dataset (available here) instead of the subset we used in this mission.

- Use phrases instead of single words when seeing if there's overlap between questions. Single words don't capture the whole context of the question well.