In [2]:
import pandas as pd

jeopardy = pd.read_csv('jeopardy.csv')
jeopardy.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
jeopardy.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [4]:
jeopardy.columns = jeopardy.columns.str.strip()
jeopardy.columns

Index(['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question',
       'Answer'],
      dtype='object')

In [5]:
import string
def normalize(col):
    col = col.lower()
    col = ''.join(i for i in col if i not in string.punctuation)
    return col

jeopardy['clean_question'] = jeopardy['Question'].apply(normalize)
jeopardy['clean_question']

0        for the last 8 years of his life galileo was u...
1        no 2 1912 olympian football star at carlisle i...
2        the city of yuma in this state has a record av...
3        in 1963 live on the art linkletter show this c...
4        signer of the dec of indep framer of the const...
5        in the title of an aesop fable this insect sha...
6        built in 312 bc to link rome  the south of ita...
7        no 8 30 steals for the birmingham barons 2306 ...
8        in the winter of 197172 a record 1122 inches o...
9        this housewares store was named for the packag...
10                                          and away we go
11       cows regurgitate this from the first stomach t...
12       in 1000 rajaraja i of the cholas battled to ta...
13       no 1 lettered in hoops football  lacrosse at s...
14       on june 28 1994 the natl weather service began...
15       this companys accutron watch introduced in 196...
16       outlaw murdered by a traitor and a coward whos.

In [6]:
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize)
jeopardy['clean_answer']

0                                               copernicus
1                                               jim thorpe
2                                                  arizona
3                                                mcdonalds
4                                               john adams
5                                                  the ant
6                                           the appian way
7                                           michael jordan
8                                               washington
9                                            crate  barrel
10                                          jackie gleason
11                                                 the cud
12                                     ceylon or sri lanka
13                                               jim brown
14                                            the uv index
15                                                  bulova
16                                             jesse jam

In [7]:
jeopardy.head(6)

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,clean_question,clean_answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,for the last 8 years of his life galileo was u...,copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,no 2 1912 olympian football star at carlisle i...,jim thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,the city of yuma in this state has a record av...,arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,in 1963 live on the art linkletter show this c...,mcdonalds
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,signer of the dec of indep framer of the const...,john adams
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect sh...",the ant,in the title of an aesop fable this insect sha...,the ant


In [8]:
# object == string
jeopardy.dtypes

Show Number        int64
Air Date          object
Round             object
Category          object
Value             object
Question          object
Answer            object
clean_question    object
clean_answer      object
dtype: object

In [9]:
def normalize_value(col):
    col = ''.join(i for i in col if i not in string.punctuation)
    try:
        col = int(col)
    except ValueError:
        col = 0
    return col

jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_value)
jeopardy['clean_value']

0         200
1         200
2         200
3         200
4         200
5         200
6         400
7         400
8         400
9         400
10        400
11        400
12        600
13        600
14        600
15        600
16        600
17        600
18        800
19        800
20        800
21        800
22       2000
23        800
24       1000
25       1000
26       1000
27       1000
28       1000
29        400
         ... 
19969    1200
19970    1200
19971    1500
19972    1200
19973    1200
19974    1200
19975    1600
19976    1600
19977    1600
19978    1600
19979    1600
19980    1600
19981    1200
19982    2000
19983    2000
19984    2000
19985    2000
19986    2000
19987       0
19988     100
19989     100
19990     100
19991     100
19992     100
19993     100
19994     200
19995     200
19996     200
19997     200
19998     200
Name: clean_value, dtype: int64

In [10]:
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
jeopardy.dtypes

Show Number                int64
Air Date          datetime64[ns]
Round                     object
Category                  object
Value                     object
Question                  object
Answer                    object
clean_question            object
clean_answer              object
clean_value                int64
dtype: object

In [11]:
def answer_in_question(row):
    split_answer = row['clean_answer'].split(" ")
    split_question = row['clean_question'].split(" ")
    
    match_count = 0
    
    if 'the' in split_answer:
        split_answer.remove('the')
        
    if len(split_answer) == 0:
        return 0
    
    for answer in split_answer:
        if answer in split_question:
            match_count+=1
    
    return match_count / len(split_answer)

jeopardy['answer_in_question'] = jeopardy.apply(answer_in_question, axis=1)
jeopardy['answer_in_question']

0        0.000000
1        0.000000
2        0.000000
3        0.000000
4        0.000000
5        0.000000
6        0.000000
7        0.000000
8        0.000000
9        0.333333
10       0.000000
11       0.000000
12       0.000000
13       0.000000
14       0.500000
15       0.000000
16       0.000000
17       0.000000
18       0.000000
19       0.000000
20       0.000000
21       0.000000
22       0.000000
23       0.000000
24       0.500000
25       0.000000
26       0.000000
27       0.000000
28       0.000000
29       0.000000
           ...   
19969    0.000000
19970    0.000000
19971    0.000000
19972    0.000000
19973    0.000000
19974    0.333333
19975    0.000000
19976    0.000000
19977    0.000000
19978    0.000000
19979    0.000000
19980    0.500000
19981    0.500000
19982    0.000000
19983    0.000000
19984    0.000000
19985    0.000000
19986    0.000000
19987    0.000000
19988    0.000000
19989    0.000000
19990    0.000000
19991    0.000000
19992    0.000000
19993    0

In [12]:
jeopardy['answer_in_question'].mean()

0.060352773854698942

This means that only 6% of the times, we can deduce answers from the questions. 6% isn't a huge number and so we cannot rely on this strategy. So we will have to study for winning.

In [13]:
jeopardy = jeopardy.sort_values('Air Date')
question_overlap = []
terms_used = set()
for i, row in jeopardy.iterrows():
    split_question = row['clean_question'].split(" ")
    
    split_question = [i for i in split_question if len(i) > 5]

    match_count = 0
    
    for word in split_question:
        if word in terms_used:
            match_count+=1
    
    for word in split_question:
        terms_used.add(word)
    
    if len(split_question)>0:
        match_count = match_count / len(split_question)
    question_overlap.append(match_count)

jeopardy['question_overlap'] = question_overlap
jeopardy['question_overlap']    

19325    0.000000
19301    0.000000
19302    0.000000
19303    0.500000
19304    0.000000
19305    0.000000
19306    0.000000
19307    0.000000
19308    0.000000
19309    0.000000
19310    0.000000
19311    0.000000
19312    0.000000
19313    0.000000
19314    0.000000
19315    0.000000
19316    0.000000
19317    0.000000
19318    0.000000
19319    0.000000
19320    0.000000
19321    0.000000
19322    0.000000
19323    0.000000
19300    0.000000
19324    0.000000
19299    0.000000
19297    0.000000
19274    0.000000
19275    0.000000
           ...   
1973     0.333333
1974     1.000000
1959     1.000000
1958     0.800000
1957     1.000000
1956     1.000000
1934     1.000000
1935     0.571429
1936     1.000000
1937     1.000000
1938     0.666667
1939     0.666667
1940     1.000000
1941     0.500000
1942     0.833333
1943     1.000000
1932     1.000000
1944     0.666667
1946     0.833333
1947     0.750000
1948     0.800000
1949     1.000000
1950     1.000000
1951     1.000000
1952     1

In [14]:
jeopardy['question_overlap'].mean()

0.68712428809667803

This means that 69% of the questions(not the questions but important phrases) are repeated.

In [15]:
def every_row_value(row):
    if row['clean_value'] > 800:
        value = 1
    else:
        value = 0
    return value
jeopardy['high_value'] = jeopardy.apply(every_row_value, axis=1)
jeopardy['high_value']

19325    0
19301    0
19302    0
19303    0
19304    0
19305    0
19306    0
19307    0
19308    0
19309    0
19310    0
19311    0
19312    0
19313    0
19314    0
19315    0
19316    0
19317    0
19318    1
19319    0
19320    1
19321    1
19322    1
19323    1
19300    0
19324    1
19299    0
19297    0
19274    0
19275    0
        ..
1973     1
1974     0
1959     1
1958     1
1957     1
1956     1
1934     0
1935     0
1936     0
1937     0
1938     0
1939     1
1940     1
1941     1
1942     1
1943     1
1932     0
1944     1
1946     0
1947     0
1948     0
1949     0
1950     0
1951     0
1952     0
1953     0
1954     0
1955     0
1945     0
1922     0
Name: high_value, dtype: int64

In [16]:
def high_low(word):
    low_count = 0
    high_count = 0
    
    for i, row in jeopardy.iterrows():
        split_question = row['clean_question'].split(' ')
        if word in split_question:
            if row['high_value'] == 1:
                high_count+=1
            else:
                low_count+=1
    
    return high_count, low_count

observed_expected = []
comparison_terms = list(terms_used)[:5]
for term in comparison_terms:
    print (term)
    h,l = high_low(term)
    observed_expected.append([h,l])
observed_expected

grandcamp
targetblankvideo
feathered
hrefhttpwwwjarchivecommedia20110121j27jpg
speakers


[[0, 1], [0, 3], [2, 2], [1, 0], [1, 3]]

In [21]:
from scipy.stats import chisquare

high_value_count = jeopardy[jeopardy['high_value'] == 1].shape[0]
low_value_count = jeopardy[jeopardy['high_value'] == 0].shape[0]

chi_squared = []

for i in observed_expected:
    total = sum(i)
    total_prop = total / jeopardy.shape[0]
    expected_high_value = total_prop * high_value_count
    expected_low_value = total_prop * low_value_count
    
    observed = [i[0], i[1]]
    expected = [expected_high_value, expected_low_value]
    
    chi_squared.append(chisquare(observed, expected))

chi_squared

[(0.40196284612688399, 0.52607729857054686),
 (1.2058885383806519, 0.27214791766902047),
 (0.88975496332255899, 0.34554371914834681),
 (2.4877921171956752, 0.11473257634454047),
 (0.026364433084407689, 0.87101348468892104)]