# Harry Potter Houses

In [209]:
import pandas
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import random
import pylab as pl
from imblearn.over_sampling import RandomOverSampler
from nltk import word_tokenize, pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import gensim



In [82]:
path = 'hp-with-text.csv'

hp_data = pandas.read_csv(path, sep='\t',
                           names=['name', "house", "text"], header=1)
hp_data.head()

Unnamed: 0,name,house,text
0,Harry Potter,Gryffindor,"Harry was an extremely brave, loyal, and selfl..."
1,Dolores Umbridge,Slytherin,Dolores Umbridge was nothing short of a sociop...
2,Horace Slughorn,Slytherin,Horace Slughorn was described as having a bumb...
3,Albus Dumbledore,Gryffindor,Considered to be the most powerful wizard of h...
4,Severus Snape,Slytherin,"At times, Snape could appear cold, cynical, ma..."


In [83]:
print(len(hp_data[hp_data.house=='Gryffindor']), len(hp_data[hp_data.house=='Slytherin']), len(hp_data[hp_data.house=='Ravenclaw']), len(hp_data[hp_data.house=='Hufflepuff']))

42 26 15 11


In [84]:
dummyresults = ['Gryffindor' for i in range(len(hp_data))]
'{0:.3f}'.format(accuracy_score(dummyresults, hp_data.house))

'0.447'

In [85]:
train, test, train_answers, test_answers = train_test_split(hp_data.text, hp_data.house, test_size=0.2)

In [166]:
bow = CountVectorizer(stop_words='english', min_df=10, max_df=35)

In [167]:
bowed_train = bow.fit_transform(train)
bowed_test = bow.transform(test)

In [168]:
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(bowed_train.toarray(), train_answers)

In [169]:
naive_model = MultinomialNB()
naive_model.fit(X_resampled, np.array(y_resampled))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [170]:
predicted = naive_model.predict(bowed_test)
print(len(test_answers), len(predicted))
print(classification_report(test_answers, predicted))
print(confusion_matrix(test_answers, predicted))

19 19
             precision    recall  f1-score   support

 Gryffindor       0.71      0.56      0.63         9
 Hufflepuff       0.00      0.00      0.00         2
  Ravenclaw       0.40      0.67      0.50         3
  Slytherin       0.67      0.80      0.73         5

avg / total       0.58      0.58      0.57        19

[[5 1 2 1]
 [0 0 1 1]
 [1 0 2 0]
 [1 0 0 4]]


# Try again

In [200]:
Key_words = {'Gryffindor': 'gryffindor brave honest courage bravery determination worthy phoenix auror'.split(' '),
            'Slytherin': 'slytherin dark leader proud ambitious pure-blood'.split(' '),
            'Ravenclaw': 'ravenclaw clever wit learning wisdom learn'.split(' '),
            'Hufflepuff': 'hyfflepuff good friendly patience loyalty fair loyal friend'.split(' ')}

In [201]:
stop = set(stopwords.words('english'))

def tokenize(text):
    text = text.lower()
    return word_tokenize(text)

lmtzr = WordNetLemmatizer()

def lemmatize(word):
    if word in stop:
        return ''
    noun = lmtzr.lemmatize(word)
    verb = lmtzr.lemmatize(word, 'v')
    if verb != word:
        if verb in stop:
            return ''
        return verb 
    return noun

def lematize(text):
    text = tokenize(text)
    text = ' '.join([lemmatize(word) for word in text])
    return text

hp_data.text = [lematize(sent) for sent in hp_data.text]

In [202]:
vectors = []
for i in hp_data.text:
    vecs = [0, 0, 0, 0]
    for word in i.split(' '):
        if word in Key_words['Gryffindor']:
            vecs[0] += 1
        elif word in Key_words['Hufflepuff']:
            vecs[1] += 1
        if word in Key_words['Slytherin']:
            vecs[2] += 1
        if word in Key_words['Ravenclaw']:
            vecs[3] += 1
    vectors.append(vecs)
hp_data['vecs'] = vectors
hp_data.head()

Unnamed: 0,name,house,text,vecs
0,Harry Potter,Gryffindor,"harry extremely brave , loyal , selfless perso...","[5, 15, 10, 6]"
1,Dolores Umbridge,Slytherin,dolores umbridge nothing short sociopath chara...,"[0, 3, 1, 0]"
2,Horace Slughorn,Slytherin,"horace slughorn describe bumble , jovial sort ...","[2, 3, 4, 0]"
3,Albus Dumbledore,Gryffindor,"consider powerful wizard time , dumbledore ben...","[0, 7, 0, 0]"
4,Severus Snape,Slytherin,"time , snape could appear cold , cynical , mal...","[2, 1, 4, 1]"


In [203]:
train, test, train_answers, test_answers = train_test_split(hp_data.vecs, hp_data.house, test_size=0.2)

In [204]:
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(bowed_train.toarray(), train_answers)

In [205]:
naive_model = MultinomialNB()
naive_model.fit(X_resampled, np.array(y_resampled))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [206]:
predicted = naive_model.predict(bowed_test)
print(len(test_answers), len(predicted))
print(classification_report(test_answers, predicted))
print(confusion_matrix(test_answers, predicted))

19 19
             precision    recall  f1-score   support

 Gryffindor       0.40      0.50      0.44         8
 Hufflepuff       0.00      0.00      0.00         2
  Ravenclaw       0.00      0.00      0.00         4
  Slytherin       0.33      0.20      0.25         5

avg / total       0.26      0.26      0.25        19

[[4 4 0 0]
 [1 0 0 1]
 [1 2 0 1]
 [4 0 0 1]]


  'precision', 'predicted', average, warn_for)


# expand keywords

In [211]:
m = '.\GoogleNews-vectors-negative300.bin.gz'
if m.endswith('.vec.gz'):
    model = gensim.models.Word2Vec.load_word2vec_format(m, binary=False)
elif m.endswith('.bin.gz'):
    model = gensim.models.Word2Vec.load_word2vec_format(m, binary=True)
else:
    model = gensim.models.Word2Vec.load(m)

DeprecationWarning: Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.

In [202]:
vectors = []
for i in hp_data.text:
    vecs = [0, 0, 0, 0]
    for word in i.split(' '):
        if word in Key_words['Gryffindor']:
            vecs[0] += 1
        elif word in Key_words['Hufflepuff']:
            vecs[1] += 1
        if word in Key_words['Slytherin']:
            vecs[2] += 1
        if word in Key_words['Ravenclaw']:
            vecs[3] += 1
    vectors.append(vecs)
hp_data['vecs'] = vectors
hp_data.head()

Unnamed: 0,name,house,text,vecs
0,Harry Potter,Gryffindor,"harry extremely brave , loyal , selfless perso...","[5, 15, 10, 6]"
1,Dolores Umbridge,Slytherin,dolores umbridge nothing short sociopath chara...,"[0, 3, 1, 0]"
2,Horace Slughorn,Slytherin,"horace slughorn describe bumble , jovial sort ...","[2, 3, 4, 0]"
3,Albus Dumbledore,Gryffindor,"consider powerful wizard time , dumbledore ben...","[0, 7, 0, 0]"
4,Severus Snape,Slytherin,"time , snape could appear cold , cynical , mal...","[2, 1, 4, 1]"


In [203]:
train, test, train_answers, test_answers = train_test_split(hp_data.vecs, hp_data.house, test_size=0.2)

In [204]:
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(bowed_train.toarray(), train_answers)

In [205]:
naive_model = MultinomialNB()
naive_model.fit(X_resampled, np.array(y_resampled))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [206]:
predicted = naive_model.predict(bowed_test)
print(len(test_answers), len(predicted))
print(classification_report(test_answers, predicted))
print(confusion_matrix(test_answers, predicted))

19 19
             precision    recall  f1-score   support

 Gryffindor       0.40      0.50      0.44         8
 Hufflepuff       0.00      0.00      0.00         2
  Ravenclaw       0.00      0.00      0.00         4
  Slytherin       0.33      0.20      0.25         5

avg / total       0.26      0.26      0.25        19

[[4 4 0 0]
 [1 0 0 1]
 [1 2 0 1]
 [4 0 0 1]]


  'precision', 'predicted', average, warn_for)
