In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
%cd "/content/drive/My Drive/Colab Notebooks//"

/content/drive/My Drive/Colab Notebooks


In [3]:
import gensim.downloader as api
model = api.load('word2vec-google-news-300')



In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.metrics import classification_report
import re

In [6]:
jeopardy_data = pd.read_csv('JEOPARDY_CSV.csv')
# remove spaces from column names
jeopardy_data.columns = [col.strip() for col in jeopardy_data.columns]
print(jeopardy_data.shape)
jeopardy_data.head()

jeopardy_data_sub = jeopardy_data[jeopardy_data['Round'] == 'Jeopardy!']
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub.Answer.str.isalpha() == True]
jeopardy_data_sub = jeopardy_data_sub[~jeopardy_data_sub.Question.str.contains("<a href=")]
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub['Show Number'] >= 4000]
jeopardy_data_sub = jeopardy_data_sub[jeopardy_data_sub['Question'].str.split().str.len() >= 5]

import random
# get a sample of 1,000 for each Value
jeopardy_data_sub_200 = jeopardy_data_sub[jeopardy_data_sub['Value']=='$200'].sample(2000, random_state=670)
jeopardy_data_sub_1000 = jeopardy_data_sub[jeopardy_data_sub['Value']=='$1000'].sample(2000, random_state=670)
jeopardy_data_sub = pd.concat([jeopardy_data_sub_200, jeopardy_data_sub_1000])


print(jeopardy_data_sub.shape)
jeopardy_data_sub.head(10)

(216930, 7)
(4000, 7)


Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
166403,4536,2004-05-03,Jeopardy!,"DOUBLE A, B, Cs",$200,"I'm this, you're glue, everything you say boun...",rubber
781,4335,2003-06-06,Jeopardy!,MY PLACE?,$200,"A Norman could say, ""I'm the king of the motte...",castle
119920,5224,2007-05-03,Jeopardy!,POTPOURRI,$200,Shelley & Eliot would be happy to know that Ap...,poetry
33882,5668,2009-04-08,Jeopardy!,IT'S A COUNTRY THING,$200,Hat dance & jumping bean,Mexican
186569,6247,2011-11-15,Jeopardy!,MELTING POTPOURRI,$200,"""Our actors"", says Prospero, ""were all spirits...",air
45283,5687,2009-05-05,Jeopardy!,ARCHAEOLOGY,$200,In 1996 Franck Goddio discovered her palace un...,Cleopatra
184063,5023,2006-06-14,Jeopardy!,& TAKIN' NAMES,$200,"World poverty fighter, Time magazine Person of...",Bono
85474,5139,2007-01-04,Jeopardy!,BEAN,$200,This bean that shares the name of a South Amer...,lima
155694,5853,2010-02-10,Jeopardy!,BE TRUE TO YOUR SCHOOL,$200,"The benefactor for whom this West Lafayette, I...",Purdue
147690,4293,2003-04-09,Jeopardy!,DECODE THE PERSONAL AD,$200,"To start with, S. is for this",single


In [7]:
jeopardy_data_sub['distance'] = jeopardy_data_sub.apply(lambda x: model.wmdistance(str(x['Question']), str(x['Answer'])), axis=1)
jeopardy_data_sub['cat'] = jeopardy_data_sub.apply(lambda x: model.wmdistance(str(x['Question']), str(x['Category'])), axis=1)

In [8]:
jeopardy_data_sub['label']= jeopardy_data_sub['Value'].map(lambda x: int(x.replace('$','')))
# turn labels to int
jeopardy_data_sub['label_id'],group_name = pd.factorize(jeopardy_data_sub['label'])

#jeopardy_data_sub['Show Number_id'],group_name = pd.factorize(jeopardy_data_sub['Show Number'])

In [9]:
jeopardy_data_sub.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,distance,cat,label,label_id,Show Number_id
166403,4536,2004-05-03,Jeopardy!,"DOUBLE A, B, Cs",$200,"I'm this, you're glue, everything you say boun...",rubber,1.892638,2.835478,200,0,0
781,4335,2003-06-06,Jeopardy!,MY PLACE?,$200,"A Norman could say, ""I'm the king of the motte...",castle,1.754963,3.130233,200,0,1
119920,5224,2007-05-03,Jeopardy!,POTPOURRI,$200,Shelley & Eliot would be happy to know that Ap...,poetry,1.629959,3.043004,200,0,2
33882,5668,2009-04-08,Jeopardy!,IT'S A COUNTRY THING,$200,Hat dance & jumping bean,Mexican,1.698437,2.877467,200,0,3
186569,6247,2011-11-15,Jeopardy!,MELTING POTPOURRI,$200,"""Our actors"", says Prospero, ""were all spirits...",air,2.066404,2.869591,200,0,4


In [35]:

X = jeopardy_data_sub.filter(['Question','distance','cat'])

X_train, X_test, y_train, y_test = train_test_split(
    X, jeopardy_data_sub['label_id'], 
                   stratify=jeopardy_data_sub['label_id'],random_state=0)

vectorizer = TfidfVectorizer(stop_words='english',
                             sublinear_tf=True,
                             strip_accents='unicode',
                             analyzer='word',
                             token_pattern=r'\w{2,}',  #vectorize 2-character words or more
                             ngram_range=(1, 2),
                             max_features=1000)



X = vectorizer.fit_transform(X_train['Question']).toarray()
X = pd.DataFrame(X)

X_train['mean'] = X.mean(axis=1).values
X_train = X_train.filter(['mean','cat','distance'])

test = vectorizer.transform(X_test['Question']).toarray()
test = pd.DataFrame(test)
X_test['mean'] = test.mean(axis=1).values
X_test = X_test.filter(['mean','cat','distance'])

X_train

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

clf = MultinomialNB().fit(X_train, y_train)
pred = clf.predict(X_train)
print("Train MAE: ",mean_absolute_error(y_train, pred))
print("Train acc:",np.mean(pred == y_train))

pred_test = clf.predict(X_test)
print("Test MAE: ",mean_absolute_error(y_test, pred_test))
print("Test acc:",np.mean(pred_test == y_test))
print(classification_report(y_test, pred_test))

Train MAE:  0.4726666666666667
Train acc: 0.5273333333333333
Test MAE:  0.463
Test acc: 0.537
              precision    recall  f1-score   support

           0       0.54      0.54      0.54       500
           1       0.54      0.53      0.54       500

    accuracy                           0.54      1000
   macro avg       0.54      0.54      0.54      1000
weighted avg       0.54      0.54      0.54      1000



In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = {
    'max_depth': [1,2,3,4,5],
    'max_features': [2, 3, 4],
    'n_estimators': [100, 200, 500]
}

rf = RandomForestClassifier(random_state=670)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   54.6s finished


{'max_depth': 3, 'max_features': 2, 'n_estimators': 500}

In [39]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier

reg = RandomForestClassifier(max_depth=3,max_features=2,n_estimators= 500,random_state=670
                             ).fit(X_train, y_train)

y_pred_train = reg.predict(X_train)
print('train mae: ')
a = mean_absolute_error(y_train,y_pred_train)
print(a)
print("Train acc:",np.mean(y_pred_train == y_train))


print('test mae: ')
prediction = reg.predict(X_test)
b = mean_absolute_error(y_test, prediction)
print(b)
print('Test accuracy: ',np.mean(prediction == y_test))
print(classification_report(y_test, prediction))

train mae: 
0.43
Train acc: 0.57
test mae: 
0.479
Test accuracy:  0.521
              precision    recall  f1-score   support

           0       0.51      0.77      0.62       500
           1       0.54      0.28      0.37       500

    accuracy                           0.52      1000
   macro avg       0.53      0.52      0.49      1000
weighted avg       0.53      0.52      0.49      1000



In [40]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

parameters = {'kernel':('rbf','linear'), 'C':[0.01,0.1,0.5,1]} #,'gamma':[ 50, 80, 100] 'linear', 
svc = svm.SVC() 
S = GridSearchCV(svc, parameters)
S.fit(X_train, y_train)
print("best: ",S.best_params_)


y_pred_train = S.predict(X_train)
print('train mae: ')
a = mean_absolute_error(y_train,y_pred_train)
print(a)
print("Train acc:",np.mean(y_pred_train == y_train))

print('test mae: ')
prediction = S.predict(X_test)
b = mean_absolute_error(y_test, prediction)
print(b)
print("Test acc:",np.mean(y_test == prediction))
print('best params: ',S.best_params_)
print(classification_report(y_test, prediction))

best:  {'C': 0.1, 'kernel': 'rbf'}
train mae: 
0.45866666666666667
Train acc: 0.5413333333333333
test mae: 
0.476
Test acc: 0.524
best params:  {'C': 0.1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.52      0.75      0.61       500
           1       0.54      0.30      0.38       500

    accuracy                           0.52      1000
   macro avg       0.53      0.52      0.50      1000
weighted avg       0.53      0.52      0.50      1000

