In [30]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score
import nltk
from keras.layers import LSTM, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
poems = gutenberg.raw('blake-poems.txt')
stories = gutenberg.raw('bryant-stories.txt')
busterbrown = gutenberg.raw('burgess-busterbrown.txt')
alice = gutenberg.raw('carroll-alice.txt')
ball = gutenberg.raw('chesterton-ball.txt')
parents = gutenberg.raw('edgeworth-parents.txt')
moby_dick = gutenberg.raw('melville-moby_dick.txt')
paradise = gutenberg.raw('milton-paradise.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
poems = re.sub(r'CHAPTER .*', '', poems)
stories = re.sub(r'CHAPTER .*', '', stories)
busterbrown = re.sub(r'CHAPTER .*', '', busterbrown)
ball = re.sub(r'CHAPTER .*', '', ball)
parents = re.sub(r'CHAPTER .*', '', parents)
moby_dick = re.sub(r'CHAPTER .*', '', moby_dick)
paradise = re.sub(r'CHAPTER .*', '', paradise)
hamlet = re.sub(r'CHAPTER .*', '', hamlet)

In [4]:
# Decrease size of our text dataset

alice = text_cleaner(alice[:int(len(alice)/10)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/10)])
poems = text_cleaner(poems[:int(len(poems)/10)])
stories = text_cleaner(stories[:int(len(stories)/10)])
busterbrown = text_cleaner(busterbrown[:int(len(busterbrown)/10)])
ball = text_cleaner(ball[:int(len(ball)/10)])
parents = text_cleaner(parents[:int(len(parents)/10)])
moby_dick = text_cleaner(moby_dick[:int(len(moby_dick)/10)])
paradise = text_cleaner(paradise[:int(len(paradise)/10)])
hamlet = text_cleaner(hamlet[:int(len(hamlet)/10)])

In [11]:
nltk.sent_tokenize(alice)

["Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'",
 'So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.',
 "There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!",
 'Oh dear!',
 "I shall be late!'",
 '(when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually TOOK A WATCH OUT OF ITS WAISTCOAT-POCKET, and

In [28]:
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)
poems_doc = nlp(poems)
stories_doc = nlp(stories)
busterbrown_doc = nlp(busterbrown)
ball_doc = nlp(ball)
parents_doc = nlp(parents)
moby_dick_doc = nlp(moby_dick)
paradise_doc = nlp(paradise)
hamlet_doc = nlp(hamlet)

In [5]:
# Convert all the paragraph into text and it's labels

alice_sents = [[sent, "Carroll"] for sent in nltk.sent_tokenize(alice)]
persuasion_sents = [[sent, "Austen"] for sent in nltk.sent_tokenize(persuasion)]
poems_sents = [[sent, "Blake"] for sent in nltk.sent_tokenize(poems)]
stories_sents = [[sent, "Bryant"] for sent in nltk.sent_tokenize(stories)]
busterbrown_sents = [[sent, "Burgess"] for sent in nltk.sent_tokenize(busterbrown)]
ball_sents = [[sent, "Chesterton"] for sent in nltk.sent_tokenize(ball)]
parents_sents = [[sent, "Edgeworth"] for sent in nltk.sent_tokenize(parents)]
moby_dick_sents = [[sent, "Melville"] for sent in nltk.sent_tokenize(moby_dick)]
paradise_sents = [[sent, "Milton"] for sent in nltk.sent_tokenize(paradise)]
hamlet_sents = [[sent, "Shakespeare"] for sent in nltk.sent_tokenize(hamlet)]

In [6]:
sentences = pd.DataFrame(alice_sents + persuasion_sents + poems_sents + stories_sents + busterbrown_sents + ball_sents + parents_sents + moby_dick_sents + paradise_sents + hamlet_sents)
sentences.head()

Unnamed: 0,0,1
0,Alice was beginning to get very tired of sitti...,Carroll
1,So she was considering in her own mind (as wel...,Carroll
2,There was nothing so VERY remarkable in that; ...,Carroll
3,Oh dear!,Carroll
4,I shall be late!',Carroll


In [7]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences[0])
Y = sentences[1]

In [8]:
X.shape

(3588, 9105)

## Creating clusters

In [9]:
kmeans = KMeans(n_clusters=10)
kmeans.fit(X)
y_predict = kmeans.predict(X)

In [10]:
print(silhouette_score(X, kmeans.labels_))
pd.crosstab(Y, y_predict)

0.008471969616531053


col_0,0,1,2,3,4,5,6,7,8,9
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austen,50,48,3,63,10,2,40,6,55,3
Blake,8,3,0,6,4,0,3,0,0,4
Bryant,54,11,26,54,13,7,36,2,34,90
Burgess,23,6,1,11,5,1,39,4,0,18
Carroll,30,23,9,5,4,0,1,4,40,2
Chesterton,58,28,27,132,70,10,91,14,0,9
Edgeworth,102,73,105,130,19,21,152,37,104,35
Melville,358,134,37,330,94,2,98,25,7,5
Milton,34,47,0,98,9,0,16,6,0,0
Shakespeare,121,31,16,16,10,0,5,9,2,0


From cross tab poems from Blake is least identified in our clustering model. So our model performed pretty poorly.

In [11]:
sentences[1].value_counts()

Melville       1090
Edgeworth       778
Chesterton      439
Bryant          327
Austen          280
Milton          210
Shakespeare     210
Carroll         118
Burgess         108
Blake            28
Name: 1, dtype: int64

## Vectorizing methods

We will try TfidfVectorizer and CountVectorizer for converting our data into numerical form. Next we would compare both those models for higher accuracy.

In [None]:
X = sentences[0]
Y = sentences[1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [19]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X_train)

X_train_tfidf, X_test_tfidf, Y_train, Y_test = train_test_split(X_tfidf, Y)

count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(X_train)
X_train_count, X_test_count, Y_train, Y_test = train_test_split(X_count, Y)

# Supervised Modeling

Now let's make some models with labels available to us. We'll try 3 different supervised models, RandomForestClassifier, GradientBoostingClassifier and LogisticRegression and see which works best.

### RandomForestClassifier

In [21]:
# random forest with tfidf vectorizer
rfc_tfidf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=30)
rfc_tfidf.fit(X_tfidf, Y)
print("Tfidf Train score: ", rfc_tfidf.score(X_train_tfidf, Y_train))
print("Tfidf Test score: ", rfc_tfidf.score(X_test_tfidf, Y_test))

# random forest with count vectorizer
rfc_count = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=30)
rfc_count.fit(X_count, Y)
print("Count Train score: ", rfc_count.score(X_train_count, Y_train))
print("Count Test score: ", rfc_count.score(X_test_count, Y_test))

Tfidf Train score:  0.30360460795243405
Tfidf Test score:  0.3032329988851728
Count Train score:  0.3240431066518023
Count Test score:  0.30992196209587514


### GradientBoostingClassifier

In [23]:
# Gradient boosting with tfidf vectorizer
gbc_tfidf = GradientBoostingClassifier()
gbc_tfidf.fit(X_tfidf, Y)
print("Tfidf Train score: ", gbc_tfidf.score(X_train_tfidf, Y_train))
print("Tfidf Test score: ", gbc_tfidf.score(X_test_tfidf, Y_test))

# Gradient boosting with count vectorizer
gbc_count = GradientBoostingClassifier()
gbc_count.fit(X_count, Y)
print("Count Train score: ", gbc_count.score(X_train_count, Y_train))
print("Count Test score: ", gbc_count.score(X_test_count, Y_test))

Tfidf Train score:  0.1958379784466741
Tfidf Test score:  0.20624303232998886
Count Train score:  0.8454106280193237
Count Test score:  0.8472686733556298


### LogisticRegression

In [24]:
# Logistic Regression with tfidf Vectorizer
lr_tfidf = LogisticRegression()
lr_tfidf.fit(X_tfidf, Y)
print("Tfidf Train score: ", lr_tfidf.score(X_tfidf, Y))
print("Tfidf Test score: ", lr_tfidf.score(X_test_tfidf, Y_test))

# Logistic Regression with count vectorizer
lr_count = LogisticRegression()
lr_count.fit(X_count, Y)
print("Count Train score: ", lr_count.score(X_train_count, Y_train))
print("Count Test score: ", lr_count.score(X_test_count, Y_test))



Tfidf Train score:  0.8199554069119287
Tfidf Test score:  0.2129319955406912
Count Train score:  0.9535488665923448
Count Test score:  0.9509476031215162


Best model from supervised learning is Logistic Regression with accuracy of 95% in both train and test dataset. Best performing vectorizer is CountVectorizer in every model. TfidfVectorizer tends to overfitting in LogisticRegression.

# Unsupervise modeling

In [27]:
model = Sequential()
model.add(LSTM((1), input_shape=(X_train_count.shape[0], X_train_count.shape[1]), return_sequences=False))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

W0712 20:55:27.876821 13224 deprecation_wrapper.py:119] From C:\Users\vivek\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0712 20:55:28.099245 13224 deprecation_wrapper.py:119] From C:\Users\vivek\Anaconda3\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0712 20:55:28.113208 13224 deprecation_wrapper.py:119] From C:\Users\vivek\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0712 20:55:28.118165 13224 deprecation.py:323] From C:\Users\vivek\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 1)                 36428     
Total params: 36,428
Trainable params: 36,428
Non-trainable params: 0
_________________________________________________________________


In [32]:
X_train_count = pad_sequences(X_train_count, dtype="float32")

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [28]:
history = model.fit(X_train_count, Y_train, epochs=50, batch_size=50, validation_data=(X_test_count, Y_test))

ValueError: Error when checking input: expected lstm_2_input to have 3 dimensions, but got array with shape (2691, 9105)