## LSTM

In [110]:
jeopardy_data = pd.read_csv('JEOPARDY_CSV.csv')
# remove spaces from column names
jeopardy_data.columns = [col.strip() for col in jeopardy_data.columns]
print(jeopardy_data.shape)
jeopardy_data.head()

jeopardy_data_sub = jeopardy_data[jeopardy_data['Round'] == 'Jeopardy!']
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub.Answer.str.isalpha() == True]
jeopardy_data_sub = jeopardy_data_sub[~jeopardy_data_sub.Question.str.contains("<a href=")]
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub['Show Number'] >= 4000]
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub['Question'].str.split().str.len() >= 5]

import random
# get a sample of 1,000 for each Value
jeopardy_data_sub_200 = jeopardy_data_sub[jeopardy_data_sub['Value']=='$200'].sample(2000, random_state=670)
jeopardy_data_sub_1000 = jeopardy_data_sub[jeopardy_data_sub['Value']=='$1000'].sample(2000, random_state=670)
jeopardy_data_sub = pd.concat([jeopardy_data_sub_200, jeopardy_data_sub_1000])


print(jeopardy_data_sub.shape)
jeopardy_data_sub.head(10)

(216930, 7)
(4000, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
166403,4536,2004-05-03,Jeopardy!,"DOUBLE A, B, Cs",$200,"I'm this, you're glue, everything you say boun...",rubber
781,4335,2003-06-06,Jeopardy!,MY PLACE?,$200,"A Norman could say, ""I'm the king of the motte...",castle
119920,5224,2007-05-03,Jeopardy!,POTPOURRI,$200,Shelley & Eliot would be happy to know that Ap...,poetry
33882,5668,2009-04-08,Jeopardy!,IT'S A COUNTRY THING,$200,Hat dance & jumping bean,Mexican
186569,6247,2011-11-15,Jeopardy!,MELTING POTPOURRI,$200,"""Our actors"", says Prospero, ""were all spirits...",air
45283,5687,2009-05-05,Jeopardy!,ARCHAEOLOGY,$200,In 1996 Franck Goddio discovered her palace un...,Cleopatra
184063,5023,2006-06-14,Jeopardy!,& TAKIN' NAMES,$200,"World poverty fighter, Time magazine Person of...",Bono
85474,5139,2007-01-04,Jeopardy!,BEAN,$200,This bean that shares the name of a South Amer...,lima
155694,5853,2010-02-10,Jeopardy!,BE TRUE TO YOUR SCHOOL,$200,"The benefactor for whom this West Lafayette, I...",Purdue
147690,4293,2003-04-09,Jeopardy!,DECODE THE PERSONAL AD,$200,"To start with, S. is for this",single


In [90]:
jeopardy_data_sub['label']= jeopardy_data_sub['Value'].map(lambda x: int(x.replace('$','')))
# turn labels to int
jeopardy_data_sub['label_id'],group_name = pd.factorize(jeopardy_data_sub['label'])
jeopardy_data_sub.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,label,label_id
166403,4536,2004-05-03,Jeopardy!,"DOUBLE A, B, Cs",$200,"I'm this, you're glue, everything you say boun...",rubber,200,0
781,4335,2003-06-06,Jeopardy!,MY PLACE?,$200,"A Norman could say, ""I'm the king of the motte...",castle,200,0
119920,5224,2007-05-03,Jeopardy!,POTPOURRI,$200,Shelley & Eliot would be happy to know that Ap...,poetry,200,0
33882,5668,2009-04-08,Jeopardy!,IT'S A COUNTRY THING,$200,Hat dance & jumping bean,Mexican,200,0
186569,6247,2011-11-15,Jeopardy!,MELTING POTPOURRI,$200,"""Our actors"", says Prospero, ""were all spirits...",air,200,0


In [85]:
X = jeopardy_data_sub.filter(['Question'])

X_train, X_test, y_train, y_test = train_test_split(
    X, jeopardy_data_sub['label_id'], 
                   stratify=jeopardy_data_sub['label_id'],random_state=0)

In [101]:
train_labels = y_train
train_questions = X_train['Question']
test_labels = y_test
test_questions = X_test['Question']

### Tokenize & Pad

In [102]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(jeopardy_data_sub['Question'])

train_sequence = tokenizer.texts_to_sequences(train_questions)
test_sequence = tokenizer.texts_to_sequences(test_questions)

In [104]:
X_train = pad_sequences(train_sequence)
X_test = pad_sequences(test_sequence)

print(X_train.shape)
print(X_test.shape)

(3000, 40)
(1000, 32)


### Encode labels as counts

Unlike Sklearn, Keras requires your labels to be either one-hot-encoded, or encoded using label encoders. For the former, you will need to use a categorical_crossentropy loss when you compile the model, and for the latter you need to use sparse_categorical_crossentropy. We will use the latter for simplicity.

In [105]:
le = LabelEncoder()
le.fit(jeopardy_data_sub['label_id'])

y_train = le.transform(y_train)
y_test = le.transform(y_test)

print(y_train.shape)
print(y_test.shape)

(3000,)
(1000,)


### Building and running the model

In [106]:
num_words = tokenizer.num_words
output_size = len(le.classes_)

In [107]:
model = Sequential([
    Embedding(input_dim=num_words, 
              output_dim=200, 
              mask_zero=True, 
              input_length=50),
    Bidirectional(LSTM(150, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(300, activation='relu'),
    Dropout(0.5),
    Dense(output_size, activation='softmax')
    
])

model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 200)           10000000  
_________________________________________________________________
bidirectional_5 (Bidirection (None, 50, 300)           421200    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 300)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 300)               90300     
_________________________________________________________________
dropout_5 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 602       
Total params: 10,512,102
Trainable params: 10,512,102
Non-trainable params: 0
__________________________________________

### Train the model

In [108]:
model.fit(X_train, y_train, epochs=10, batch_size=1024, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x281a2d1c988>

### Evaluate the model

In [109]:
y_pred = model.predict(X_test, batch_size=1024).argmax(axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.37      0.42       500
           1       0.50      0.64      0.56       500

    accuracy                           0.50      1000
   macro avg       0.50      0.50      0.49      1000
weighted avg       0.50      0.50      0.49      1000



## LSTM with Textstat Features

In [136]:
# read in and preview data

import pandas as pd

jeopardy_data = pd.read_csv('JEOPARDY_CSV.csv')
# remove spaces from column names
jeopardy_data.columns = [col.strip() for col in jeopardy_data.columns]
print(jeopardy_data.shape)
jeopardy_data.head()

(216930, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [137]:
# extract subset of the data using the following criteria:
# (1) only 'Jeopardy!' round
# (2) no answers with numbers or questions containing images/links
# (3) only shows after 4000 (to limit amount of data)
# (4) only questions with 5 or more words
# (5) only questions with a value of $200 or $1000

jeopardy_data_sub = jeopardy_data[jeopardy_data['Round'] == 'Jeopardy!']
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub.Answer.str.isalpha() == True]
jeopardy_data_sub = jeopardy_data_sub[~jeopardy_data_sub.Question.str.contains("<a href=")]
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub['Show Number'] >= 4000]
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub['Question'].str.split().str.len() >= 5]

import random
# get a sample of 1,000 for each Value
jeopardy_data_sub_200 = jeopardy_data_sub[jeopardy_data_sub['Value']=='$200'].sample(2000, random_state=670)
jeopardy_data_sub_1000 = jeopardy_data_sub[jeopardy_data_sub['Value']=='$1000'].sample(2000, random_state=670)
jeopardy_data_sub = pd.concat([jeopardy_data_sub_200, jeopardy_data_sub_1000])


print(jeopardy_data_sub.shape)
jeopardy_data_sub.head(10)

(4000, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
166403,4536,2004-05-03,Jeopardy!,"DOUBLE A, B, Cs",$200,"I'm this, you're glue, everything you say boun...",rubber
781,4335,2003-06-06,Jeopardy!,MY PLACE?,$200,"A Norman could say, ""I'm the king of the motte...",castle
119920,5224,2007-05-03,Jeopardy!,POTPOURRI,$200,Shelley & Eliot would be happy to know that Ap...,poetry
33882,5668,2009-04-08,Jeopardy!,IT'S A COUNTRY THING,$200,Hat dance & jumping bean,Mexican
186569,6247,2011-11-15,Jeopardy!,MELTING POTPOURRI,$200,"""Our actors"", says Prospero, ""were all spirits...",air
45283,5687,2009-05-05,Jeopardy!,ARCHAEOLOGY,$200,In 1996 Franck Goddio discovered her palace un...,Cleopatra
184063,5023,2006-06-14,Jeopardy!,& TAKIN' NAMES,$200,"World poverty fighter, Time magazine Person of...",Bono
85474,5139,2007-01-04,Jeopardy!,BEAN,$200,This bean that shares the name of a South Amer...,lima
155694,5853,2010-02-10,Jeopardy!,BE TRUE TO YOUR SCHOOL,$200,"The benefactor for whom this West Lafayette, I...",Purdue
147690,4293,2003-04-09,Jeopardy!,DECODE THE PERSONAL AD,$200,"To start with, S. is for this",single


In [138]:
# function to count the average word length in the sentence 
import numpy as np

def avg_word_length(text):
    lens = []
    for word in text.split():
        lens.append(len(word))
    return np.mean(lens)

# function to count the max word length in the sentence 
def longest_word(text):
    lens = []
    for word in text.split():
        lens.append(len(word))
    try:
        return np.max(lens)
    except:
        return 0

In [139]:
import textstat as ts # source: https://pypi.org/project/textstat/

# how many words are in the question?
jeopardy_data_sub['Question Len'] = jeopardy_data_sub['Question'].apply(lambda x: ts.lexicon_count(x, removepunct=True))
# what is the readability of the question?
jeopardy_data_sub['Flesch Reading Ease Score'] = jeopardy_data_sub['Question'].apply(lambda text: ts.flesch_reading_ease(text))
# what is the grade level associated the question?
jeopardy_data_sub['Flesch-Kincaid Grade Level'] = jeopardy_data_sub['Question'].apply(lambda text:ts.flesch_kincaid_grade(text))
# longest word in question? 
jeopardy_data_sub['Longest Word (Question)'] = jeopardy_data_sub['Question'].apply(lambda text:longest_word(text))
# longest word in answer?
jeopardy_data_sub['Longest Word (Answer)'] = jeopardy_data_sub['Answer'].apply(lambda text:longest_word(text))


# clean questions/answers by lowercasing and removing stop words

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

jeopardy_data_sub['Question_cleaned'] = jeopardy_data_sub['Question'].apply(lambda x: " ".join([word.lower() for word in x.split() if word.lower() not in stopwords]))
jeopardy_data_sub['Answer_cleaned'] = jeopardy_data_sub['Answer'].apply(lambda x: " ".join([word.lower() for word in x.split() if word.lower() not in stopwords]))
jeopardy_data_sub['Category_cleaned'] = jeopardy_data_sub['Category'].apply(lambda x: " ".join([word.lower() for word in x.split() if word.lower() not in stopwords]))

# what is the average length of a word in the answer?
jeopardy_data_sub['Average Answer Word Length (Cleaned)'] = jeopardy_data_sub['Answer_cleaned'].apply(lambda text: avg_word_length(text))
# how many words are in the answer?
jeopardy_data_sub['Answer Len (Cleaned)'] = jeopardy_data_sub['Answer_cleaned'].apply(lambda x: ts.lexicon_count(x, removepunct=True))


jeopardy_data_sub.groupby('Value')[['Question_cleaned','Answer_cleaned','Category_cleaned','Flesch Reading Ease Score', 
         'Flesch-Kincaid Grade Level', 'Longest Word (Question)', 'Longest Word (Answer)',
         'Average Answer Word Length (Cleaned)', 'Answer Len (Cleaned)']].mean()

  out=out, **kwargs)


Unnamed: 0_level_0,Flesch Reading Ease Score,Flesch-Kincaid Grade Level,Longest Word (Question),Longest Word (Answer),Average Answer Word Length (Cleaned),Answer Len (Cleaned)
Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
$1000,70.390675,6.98215,9.8085,7.0475,7.06015,0.9975
$200,72.825295,6.5186,9.548,6.51,6.533434,0.9945


In [140]:
jeopardy_data_sub['label']= jeopardy_data_sub['Value'].map(lambda x: int(x.replace('$','')))
# turn labels to int
jeopardy_data_sub['label_id'],group_name = pd.factorize(jeopardy_data_sub['label'])

In [146]:
cols_sub = ['Flesch Reading Ease Score', 
         'Flesch-Kincaid Grade Level', 'Longest Word (Question)', 'Longest Word (Answer)',
         'Average Answer Word Length (Cleaned)']

X = jeopardy_data_sub['Question_cleaned']

X_train, X_test, y_train, y_test = train_test_split(
    X, jeopardy_data_sub['label_id'], 
                   stratify=jeopardy_data_sub['label_id'],random_state=0)

In [149]:
train_labels = y_train
train_questions = X_train
test_labels = y_test
test_questions = X_test

### Tokenize & Pad

In [150]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(jeopardy_data_sub['Question_cleaned'])

train_sequence = tokenizer.texts_to_sequences(train_questions)
test_sequence = tokenizer.texts_to_sequences(test_questions)

In [151]:
X_train = pad_sequences(train_sequence)
X_test = pad_sequences(test_sequence)

print(X_train.shape)
print(X_test.shape)

(3000, 30)
(1000, 19)


### Encode labels as counts

In [152]:
le = LabelEncoder()
le.fit(jeopardy_data_sub['label_id'])

y_train = le.transform(y_train)
y_test = le.transform(y_test)

print(y_train.shape)
print(y_test.shape)

(3000,)
(1000,)


### Building and running the model

In [153]:
model = Sequential([
    Embedding(input_dim=num_words, 
              output_dim=200, 
              mask_zero=True, 
              input_length=50),
    Bidirectional(LSTM(150, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(300, activation='relu'),
    Dropout(0.5),
    Dense(output_size, activation='softmax')
    
])

model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 200)           10000000  
_________________________________________________________________
bidirectional_8 (Bidirection (None, 50, 300)           421200    
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 300)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 300)               90300     
_________________________________________________________________
dropout_8 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 2)                 602       
Total params: 10,512,102
Trainable params: 10,512,102
Non-trainable params: 0
__________________________________________

### Train the model

In [154]:
model.fit(X_train, y_train, epochs=10, batch_size=1024, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x281fc2d0408>

### Evaluate the model

In [155]:
y_pred = model.predict(X_test, batch_size=1024).argmax(axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.48      0.49       500
           1       0.51      0.53      0.52       500

    accuracy                           0.51      1000
   macro avg       0.51      0.51      0.51      1000
weighted avg       0.51      0.51      0.51      1000

