In [69]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.cluster import KMeans, MeanShift, SpectralClustering
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score, confusion_matrix
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
import nltk
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
poems = gutenberg.raw('blake-poems.txt')
stories = gutenberg.raw('bryant-stories.txt')
busterbrown = gutenberg.raw('burgess-busterbrown.txt')
alice = gutenberg.raw('carroll-alice.txt')
ball = gutenberg.raw('chesterton-ball.txt')
parents = gutenberg.raw('edgeworth-parents.txt')
moby_dick = gutenberg.raw('melville-moby_dick.txt')
paradise = gutenberg.raw('milton-paradise.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
poems = re.sub(r'CHAPTER .*', '', poems)
stories = re.sub(r'CHAPTER .*', '', stories)
busterbrown = re.sub(r'CHAPTER .*', '', busterbrown)
ball = re.sub(r'CHAPTER .*', '', ball)
parents = re.sub(r'CHAPTER .*', '', parents)
moby_dick = re.sub(r'CHAPTER .*', '', moby_dick)
paradise = re.sub(r'CHAPTER .*', '', paradise)
hamlet = re.sub(r'CHAPTER .*', '', hamlet)

In [4]:
# Decrease size of our text dataset

alice = text_cleaner(alice[:int(len(alice)/10)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/10)])
poems = text_cleaner(poems[:int(len(poems)/10)])
stories = text_cleaner(stories[:int(len(stories)/10)])
busterbrown = text_cleaner(busterbrown[:int(len(busterbrown)/10)])
ball = text_cleaner(ball[:int(len(ball)/10)])
parents = text_cleaner(parents[:int(len(parents)/10)])
moby_dick = text_cleaner(moby_dick[:int(len(moby_dick)/10)])
paradise = text_cleaner(paradise[:int(len(paradise)/10)])
hamlet = text_cleaner(hamlet[:int(len(hamlet)/10)])

In [5]:
nltk.sent_tokenize(alice)

["Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'",
 'So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.',
 "There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!",
 'Oh dear!',
 "I shall be late!'",
 '(when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually TOOK A WATCH OUT OF ITS WAISTCOAT-POCKET, and

In [6]:
# Convert all the paragraph into text and it's labels

alice_sents = [[sent, "Carroll"] for sent in nltk.sent_tokenize(alice)]
persuasion_sents = [[sent, "Austen"] for sent in nltk.sent_tokenize(persuasion)]
poems_sents = [[sent, "Blake"] for sent in nltk.sent_tokenize(poems)]
stories_sents = [[sent, "Bryant"] for sent in nltk.sent_tokenize(stories)]
busterbrown_sents = [[sent, "Burgess"] for sent in nltk.sent_tokenize(busterbrown)]
ball_sents = [[sent, "Chesterton"] for sent in nltk.sent_tokenize(ball)]
parents_sents = [[sent, "Edgeworth"] for sent in nltk.sent_tokenize(parents)]
moby_dick_sents = [[sent, "Melville"] for sent in nltk.sent_tokenize(moby_dick)]
paradise_sents = [[sent, "Milton"] for sent in nltk.sent_tokenize(paradise)]
hamlet_sents = [[sent, "Shakespeare"] for sent in nltk.sent_tokenize(hamlet)]

In [115]:
sentences = pd.DataFrame(alice_sents + persuasion_sents + poems_sents + stories_sents + busterbrown_sents + ball_sents + parents_sents + moby_dick_sents + paradise_sents + hamlet_sents)
sentences.head()

Unnamed: 0,0,1
0,Alice was beginning to get very tired of sitti...,Carroll
1,So she was considering in her own mind (as wel...,Carroll
2,There was nothing so VERY remarkable in that; ...,Carroll
3,Oh dear!,Carroll
4,I shall be late!',Carroll


# Creating clusters

## Decomposition

For decreasing the features size of our data we need to use some technique of decomposition. We can't simply use PCA for text vectors so we'll be comparing two other techniques which are used for reducing text features.
    
    1. TruncatedSVD or LSA (Latent Semantic Analysis)
    2. LDA (Latend Dirichlet Allocation)

In [80]:
X = sentences[0]
Y = sentences[1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=44)

In [9]:
# making a tfidf vector for converting text data into numerical form
cluster_vectorizer = TfidfVectorizer()
X_cluster = cluster_vectorizer.fit_transform(X_train).toarray()
Y_cluster = Y_train

### LDA

In [10]:
lda = LatentDirichletAllocation()
x_lda = lda.fit_transform(X_cluster)

In [11]:
kmeans_lda = KMeans(n_clusters=10)
kmeans_lda.fit(x_lda)
y_predict = kmeans_lda.predict(x_lda)
print(silhouette_score(x_lda, kmeans_lda.labels_))
pd.crosstab(Y_cluster, y_predict)

0.5200726160869298


col_0,0,1,2,3,4,5,6,7,8,9
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austen,3,141,11,5,6,5,9,4,9,5
Blake,1,11,1,1,1,0,0,1,1,1
Bryant,26,110,45,14,7,15,6,23,10,2
Burgess,2,57,11,1,1,0,0,2,4,2
Carroll,1,64,8,0,4,1,6,1,2,3
Chesterton,31,141,45,30,12,20,18,11,17,13
Edgeworth,19,332,151,12,13,10,7,7,15,18
Melville,64,284,111,60,55,55,32,51,49,43
Milton,14,33,31,18,8,13,10,12,9,15
Shakespeare,15,28,40,5,6,12,17,20,6,9


Our silhouette score is not so bad for LDA. Now let's check this data on Mean Shift algorithm to check what number of clusters it chooses.

In [12]:
meanshift_lda = MeanShift()
meanshift_lda.fit(x_lda)
y_pred = meanshift_lda.predict(x_lda)
pd.crosstab(Y_cluster, y_pred)

col_0,0,1
1,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,187,11
Blake,17,1
Bryant,208,50
Burgess,69,11
Carroll,79,11
Chesterton,292,46
Edgeworth,427,157
Melville,684,120
Milton,130,33
Shakespeare,109,49


Mean Shift does not require to specify the number of clusters as it uses automatically chooses appropriate number of clusters and tries to fit our model based on that number. We only use this method when our data is small because it takes a lot of time to train.

For LDA MeanShift worked well but looking at crosstab we see that it only has 2 clusters. So, it makes it less reliable than k-means with 10 clusters as we know the number of authors in our data.

### TruncatedSVD

In [13]:
svd = TruncatedSVD(n_components=100, random_state=40)
x_svd = svd.fit_transform(X_cluster)

In [14]:
kmeans_svd = KMeans(n_clusters=10)
kmeans_svd.fit(x_svd)
y_predict = kmeans_svd.predict(x_svd)
print(silhouette_score(x_svd, kmeans_svd.labels_))
pd.crosstab(Y_cluster, y_predict)

0.013103230087110425


col_0,0,1,2,3,4,5,6,7,8,9
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austen,30,52,9,4,2,25,23,37,4,12
Blake,2,3,2,0,3,0,2,6,0,0
Bryant,46,17,13,2,4,28,37,53,35,23
Burgess,7,7,4,4,1,0,30,22,1,4
Carroll,5,8,3,3,2,32,1,28,4,4
Chesterton,93,32,50,12,9,0,62,45,19,16
Edgeworth,60,73,18,26,13,64,107,92,86,45
Melville,201,108,65,11,40,5,65,256,26,27
Milton,35,75,5,5,1,0,5,26,0,11
Shakespeare,5,23,10,6,20,2,3,77,11,1


In [15]:
meanshift_svd = MeanShift()
meanshift_svd.fit(x_svd)
print(silhouette_score(x_svd, meanshift_svd.labels_))
y_pred = meanshift_svd.predict(x_svd)
pd.crosstab(Y_cluster, y_pred)

KeyboardInterrupt: 

Both these models performed very poorly for TruncatedSVD and as per other machine learning experts LDA tends to be much more reliable compared to LSA which is clearly observed in our scores.

In [42]:
sclustering = SpectralClustering(n_clusters=10, random_state=40)
y_pred = sclustering.fit_predict(x_lda)
print(silhouette_score(x_svd, sclustering.labels_))
pd.crosstab(Y_cluster, y_pred)

-0.12021922695261111


col_0,0,1,2,3,4,5,6,7,8,9
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austen,6,7,2,157,7,5,5,7,8,5
Blake,0,2,0,13,1,0,3,2,1,0
Bryant,5,3,4,172,4,13,13,13,10,8
Burgess,1,3,2,61,0,3,1,2,7,2
Carroll,3,3,1,70,0,0,2,5,2,2
Chesterton,22,17,3,198,18,12,17,25,18,10
Edgeworth,4,5,5,468,14,19,12,12,9,9
Melville,52,48,44,402,43,34,47,52,44,63
Milton,15,10,9,53,6,10,19,12,10,8
Shakespeare,11,24,10,43,6,15,9,26,15,8


SpectralClustering is doing very poor job so we won't be looking into it.

In [43]:
sentences[1].value_counts()

Melville       1090
Edgeworth       778
Chesterton      439
Bryant          327
Austen          280
Shakespeare     210
Milton          210
Carroll         118
Burgess         108
Blake            28
Name: 1, dtype: int64

## Vectorizing methods

We will try TfidfVectorizer and CountVectorizer for converting our data into numerical form. Next we would compare both those models for higher accuracy.

In [101]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, Y_train)

count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(X_train)
X_train_count, X_test_count, y_train, y_test = train_test_split(X_count, Y_train)

# Supervised Modeling

Now let's make some models with labels available to us. We'll try 3 different supervised models, RandomForestClassifier, GradientBoostingClassifier and LogisticRegression and see which works best.

### RandomForestClassifier

In [21]:
# random forest with tfidf vectorizer
rfc_tfidf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=30)
rfc_tfidf.fit(X_tfidf, Y)
print("Tfidf Train score: ", rfc_tfidf.score(X_train_tfidf, Y_train))

# random forest with count vectorizer
rfc_count = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=30)
rfc_count.fit(X_count, Y)
print("Count Train score: ", rfc_count.score(X_train_count, Y_train))

Tfidf Train score:  0.30360460795243405
Tfidf Test score:  0.3032329988851728
Count Train score:  0.3240431066518023
Count Test score:  0.30992196209587514


### GradientBoostingClassifier

In [23]:
# Gradient boosting with tfidf vectorizer
gbc_tfidf = GradientBoostingClassifier()
gbc_tfidf.fit(X_tfidf, Y)
print("Tfidf Train score: ", gbc_tfidf.score(X_train_tfidf, Y_train))

# Gradient boosting with count vectorizer
gbc_count = GradientBoostingClassifier()
gbc_count.fit(X_count, Y)
print("Count Train score: ", gbc_count.score(X_train_count, Y_train))

Tfidf Train score:  0.1958379784466741
Tfidf Test score:  0.20624303232998886
Count Train score:  0.8454106280193237
Count Test score:  0.8472686733556298


### LogisticRegression

In [24]:
# Logistic Regression with tfidf Vectorizer
lr_tfidf = LogisticRegression()
lr_tfidf.fit(X_tfidf, Y)
print("Tfidf Train score: ", lr_tfidf.score(X_tfidf, Y))

# Logistic Regression with count vectorizer
lr_count = LogisticRegression()
lr_count.fit(X_count, Y)
print("Count Train score: ", lr_count.score(X_train_count, Y_train))



Tfidf Train score:  0.8199554069119287
Tfidf Test score:  0.2129319955406912
Count Train score:  0.9535488665923448
Count Test score:  0.9509476031215162


Best model from supervised learning is Logistic Regression with accuracy of 95% in both train and test dataset. Best performing vectorizer is CountVectorizer in every model. TfidfVectorizer tends to overfitting in LogisticRegression.

# Unsupervise modeling

#### Count vectorizer

In [102]:
lda = LatentDirichletAllocation()
x_lda = lda.fit_transform(X_train_count)
x_test_count = lda.transform(X_test_count)

In [103]:
x_lda

array([[0.05      , 0.05      , 0.05000001, ..., 0.05000001, 0.05000001,
        0.05000001],
       [0.00322683, 0.00322628, 0.00322641, ..., 0.00322651, 0.00322581,
        0.00322648],
       [0.00344935, 0.00344868, 0.00344876, ..., 0.00344832, 0.00344828,
        0.00344847],
       ...,
       [0.8499943 , 0.01666818, 0.01666683, ..., 0.01666667, 0.01666693,
        0.01666706],
       [0.01111453, 0.01111244, 0.01111167, ..., 0.01111196, 0.01111111,
        0.01111224],
       [0.45265053, 0.00400032, 0.11696069, ..., 0.00400014, 0.004     ,
        0.00400016]])

In [104]:
X_train_count = pad_sequences(x_lda)

In [105]:
model = Sequential()
model.add(Embedding(4000, 128,input_length = X_train_count.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(256, dropout_U=0.2, dropout_W=0.2, return_sequences=False))
model.add(Dense(9, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  after removing the cwd from sys.path.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 10, 128)           512000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 10, 128)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 256)               394240    
_________________________________________________________________
dense_6 (Dense)              (None, 9)                 2313      
Total params: 908,553
Trainable params: 908,553
Non-trainable params: 0
_________________________________________________________________


In [106]:
y_train.value_counts()

Edgeworth      428
Chesterton     250
Bryant         182
Austen         158
Milton         121
Shakespeare    111
Carroll         75
Burgess         65
Blake           14
Name: 1, dtype: int64

In [107]:
y_train = pd.get_dummies(y_train)

In [108]:
print(y_train)

      Austen  Blake  Bryant  Burgess  Carroll  Chesterton  Edgeworth  Milton  \
2417       0      0       0        0        0           0          0       0   
1791       0      0       0        0        0           0          1       0   
1666       0      0       0        0        0           0          1       0   
49         0      0       0        0        1           0          0       0   
897        0      0       0        0        0           1          0       0   
1899       0      0       0        0        0           0          1       0   
1374       0      0       0        0        0           0          1       0   
1435       0      0       0        0        0           0          1       0   
198        1      0       0        0        0           0          0       0   
1550       0      0       0        0        0           0          1       0   
1          0      0       0        0        1           0          0       0   
2421       0      0       0        0    

In [109]:
y_test = pd.get_dummies(y_test)

In [110]:
print(X_train_count.shape, y_train.shape)
print(x_test_count.shape, y_test.shape)

(1404, 10) (1404, 9)
(469, 10) (469, 9)


In [111]:
history = model.fit(X_train_count, y_train, epochs=10, batch_size=32, validation_data=(x_test_count, y_test), verbose=2)

Train on 1404 samples, validate on 469 samples
Epoch 1/10
 - 3s - loss: 0.3209 - acc: 0.8883 - val_loss: 0.3184 - val_acc: 0.8889
Epoch 2/10
 - 1s - loss: 0.3184 - acc: 0.8889 - val_loss: 0.3181 - val_acc: 0.8889
Epoch 3/10
 - 1s - loss: 0.3180 - acc: 0.8889 - val_loss: 0.3180 - val_acc: 0.8889
Epoch 4/10
 - 1s - loss: 0.3171 - acc: 0.8889 - val_loss: 0.3196 - val_acc: 0.8889
Epoch 5/10
 - 1s - loss: 0.3185 - acc: 0.8889 - val_loss: 0.3193 - val_acc: 0.8889
Epoch 6/10
 - 1s - loss: 0.3180 - acc: 0.8889 - val_loss: 0.3173 - val_acc: 0.8889
Epoch 7/10
 - 1s - loss: 0.3174 - acc: 0.8889 - val_loss: 0.3205 - val_acc: 0.8889
Epoch 8/10
 - 1s - loss: 0.3175 - acc: 0.8889 - val_loss: 0.3174 - val_acc: 0.8889
Epoch 9/10
 - 1s - loss: 0.3174 - acc: 0.8889 - val_loss: 0.3169 - val_acc: 0.8889
Epoch 10/10
 - 1s - loss: 0.3170 - acc: 0.8889 - val_loss: 0.3175 - val_acc: 0.8889


In [112]:
y_predict = model.predict_classes(x_test_count)
y_predict_train = model.predict_classes(X_train_count)
print(y_predict.shape)
print(y_test.shape)

(469,)
(469, 9)


In [113]:
print(y_predict)
print(y_predict_train)

[6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
[6 6 6 ... 6 6 6]


In [114]:
confusion_matrix(y_test, y_predict)

ValueError: Classification metrics can't handle a mix of multilabel-indicator and binary targets

#### Tfidf Vectorizer

In [61]:
lda = LatentDirichletAllocation()
x_lda = lda.fit_transform(X_train_tfidf)
x_test_tfidf = lda.transform(X_test_tfidf)

In [64]:
X_train_tfidf = pad_sequences(x_lda)

In [65]:
print(X_train_tfidf.shape, y_train.shape)
print(x_test_tfidf.shape, y_test.shape)

(2018, 10) (2018, 10)
(673, 10) (673, 10)


In [66]:
history = model.fit(X_train_tfidf, y_train, epochs=10, batch_size=5, validation_data=(x_test_tfidf, y_test), verbose=2)

Train on 2018 samples, validate on 673 samples
Epoch 1/10
 - 8s - loss: 0.2877 - acc: 0.9000 - val_loss: 0.2876 - val_acc: 0.9000
Epoch 2/10
 - 8s - loss: 0.2878 - acc: 0.9000 - val_loss: 0.2867 - val_acc: 0.9000
Epoch 3/10
 - 9s - loss: 0.2879 - acc: 0.9000 - val_loss: 0.2867 - val_acc: 0.9000
Epoch 4/10
 - 12s - loss: 0.2878 - acc: 0.9000 - val_loss: 0.2869 - val_acc: 0.9000
Epoch 5/10
 - 10s - loss: 0.2878 - acc: 0.9000 - val_loss: 0.2869 - val_acc: 0.9000
Epoch 6/10
 - 9s - loss: 0.2877 - acc: 0.9000 - val_loss: 0.2876 - val_acc: 0.9000
Epoch 7/10
 - 9s - loss: 0.2878 - acc: 0.9000 - val_loss: 0.2868 - val_acc: 0.9000
Epoch 8/10
 - 9s - loss: 0.2877 - acc: 0.9000 - val_loss: 0.2866 - val_acc: 0.9000
Epoch 9/10
 - 8s - loss: 0.2878 - acc: 0.9000 - val_loss: 0.2866 - val_acc: 0.9000
Epoch 10/10
 - 8s - loss: 0.2877 - acc: 0.9000 - val_loss: 0.2871 - val_acc: 0.9000


## Test data validation

In [45]:
x_tfidf = cluster_vectorizer.transform(X_test).toarray()
x_test = lda.transform(x_tfidf)
y_test = Y_test

In [46]:
y_predict = kmeans_lda.predict(x_test)
print(silhouette_score(x_test, y_predict))
pd.crosstab(y_test, y_predict)

0.48947500847901887


col_0,0,1,2,3,4,5,6,7,8,9
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austen,2,1,65,0,0,1,0,1,0,1
Blake,0,0,4,0,0,0,0,0,0,2
Bryant,2,2,58,4,1,0,3,7,2,3
Burgess,0,4,19,0,2,0,0,1,0,0
Carroll,0,0,28,0,0,2,0,0,0,0
Chesterton,2,3,86,4,1,0,0,1,2,0
Edgeworth,1,0,211,2,1,0,1,3,2,0
Melville,1,7,201,17,6,10,5,6,3,5
Milton,0,0,54,1,1,0,0,0,0,2
Shakespeare,2,7,19,2,6,3,2,1,0,1


Silhouette score looks good for test data also for clustering algorithms

#### Random Forest Test Score

In [None]:
print("Tfidf Test score: ", rfc_tfidf.score(X_test_tfidf, Y_test))
print("Count Test score: ", rfc_count.score(X_test_count, Y_test))

#### Gradient Boosting Test Score

In [None]:
print("Tfidf Test score: ", gbc_tfidf.score(X_test_tfidf, Y_test))
print("Count Test score: ", gbc_count.score(X_test_count, Y_test))

#### Logistic Regression Test Score

In [None]:
print("Tfidf Test score: ", lr_tfidf.score(X_test_tfidf, Y_test))
print("Count Test score: ", lr_count.score(X_test_count, Y_test))