In [1]:
# Mount Google Drive at /gdrive so files under "My Drive" can be read by path.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, which ties this notebook to that environment — confirm.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [4]:
# Load the book table from the mounted Drive, using the first CSV column as
# the row index, then preview the first rows.
# NOTE(review): the path is hard-coded to one Drive layout; a configurable
# data directory would make the notebook portable.
genre_content_df = pd.read_csv('/gdrive/My Drive/Inbox/Datasets/books_gutenberg.csv', index_col=[0])
genre_content_df.head()

Unnamed: 0,title,genre,rating,rating_count,content_std,word_count
0,The White Feather,School Stories,3.96,125,produced by suzanne l shell charles franks and...,44679
1,463,Historical Fiction,4.67,39,produced by judith boss html version by al hai...,46221
2,The Young Buglers,Children's Fiction,4.12,91,produced by ted garvin suzanne shell william f...,102661
3,The Children's Pilgrimage,Children's Fiction,4.23,13,produced by avinash kothare tom allen charles ...,83173
4,A Houseful of Girls,Children's Fiction,3.33,12,produced by jonathan ingram jacqueline jeremy ...,92847


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# English stop words held in a set, so each membership test in remove_stop
# is a hash lookup rather than a list scan.
stops = set(stopwords.words('english'))
def remove_stop(text, stop_words=None, min_len=3):
    """Drop stop words and very short tokens from a whitespace-delimited string.

    Args:
        text: Text to filter. Tokens come from ``str.split()``, so any run of
            whitespace separates tokens. A non-string value (for example the
            ``NaN`` pandas uses for a missing cell, or ``None``) is treated
            as empty text instead of raising ``AttributeError``.
        stop_words: Container of tokens to remove. When omitted, the
            module-level ``stops`` set is used.
        min_len: Minimum token length to keep; shorter tokens are dropped.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when nothing
        survives.
    """
    # Missing cells arrive as float NaN / None and have no .split().
    if not isinstance(text, str):
        return ''
    # Resolve the default lazily so the global is only needed when used.
    if stop_words is None:
        stop_words = stops
    return ' '.join(
        w for w in text.split()
        if w not in stop_words and len(w) >= min_len
    )

# Add a 'content' column holding the stop-word-filtered version of
# 'content_std', then preview the result.
# NOTE: this mutates genre_content_df in place, so displays of the frame
# produced by earlier cells no longer reflect its columns.
genre_content_df['content'] = genre_content_df['content_std'].apply(remove_stop)
genre_content_df.head()

Unnamed: 0,title,genre,rating,rating_count,content_std,word_count,content
0,The White Feather,School Stories,3.96,125,produced by suzanne l shell charles franks and...,44679,produced suzanne shell charles franks online d...
1,463,Historical Fiction,4.67,39,produced by judith boss html version by al hai...,46221,produced judith boss html version haines red b...
2,The Young Buglers,Children's Fiction,4.12,91,produced by ted garvin suzanne shell william f...,102661,produced ted garvin suzanne shell william flis...
3,The Children's Pilgrimage,Children's Fiction,4.23,13,produced by avinash kothare tom allen charles ...,83173,produced avinash kothare tom allen charles fra...
4,A Houseful of Girls,Children's Fiction,3.33,12,produced by jonathan ingram jacqueline jeremy ...,92847,produced jonathan ingram jacqueline jeremy onl...


In [0]:
def gen_level(x):
  """Bucket a rating into an integer level from 1 to 5.

  The bands are half-open with upper bounds 1.5, 2.5, 3.5 and 4.5: the
  first bound that ``x`` is strictly below decides the level. A value that
  is below none of them falls through to level 5; that includes NaN, for
  which every ``<`` comparison is false.
  """
  upper_bounds = (1.5, 2.5, 3.5, 4.5)
  for level, bound in enumerate(upper_bounds, start=1):
    if x < bound:
      return level
  return 5

# Generating classification labels: map each book's rating to its 1-5
# level bucket and store it in a new 'level' column (in-place mutation).
genre_content_df['level'] = genre_content_df['rating'].apply(gen_level)

In [8]:
# Level labels for lstm: one-hot matrix with one row per book and one
# column per distinct value found in the 'level' column.
y_level = pd.get_dummies(genre_content_df['level']).values
y_level.shape

(925, 5)

In [9]:
# Genre labels for lstm: one-hot matrix with one row per book and one
# column per distinct value found in the 'genre' column.
y_genre = pd.get_dummies(genre_content_df['genre']).values
y_genre.shape

(925, 10)

In [0]:
# Vocabulary cap passed to Tokenizer(num_words=...); the model builders
# reuse the same value as the Embedding input dimension.
max_features = 5000
tokenizer = Tokenizer(num_words=max_features)
# NOTE(review): the tokenizer is fitted on every row, before any train/test
# split, so the vocabulary is influenced by texts later used for evaluation.
tokenizer.fit_on_texts(genre_content_df['content'].values)
# Convert each cleaned text into a list of integer token ids.
X = tokenizer.texts_to_sequences(genre_content_df['content'].values)
# Force every sequence to exactly 1000 ids.
# NOTE(review): per the Keras docs, pad_sequences pads and truncates at the
# front by default, so a long text would keep only its final 1000 ids —
# TODO confirm that the end of each book is the part worth modelling.
X = pad_sequences(X, maxlen=1000)

In [11]:
# Sanity check: one row per book, one column per padded token position.
X.shape

(925, 1000)

In [12]:
# Preview the padded id matrix.
# NOTE(review): in the captured output the trailing ids repeat across rows
# (e.g. 2288, 937, 3585), which suggests shared boilerplate at the end of
# the texts — TODO confirm and consider stripping it before tokenizing.
X

array([[1027,   92,  158, ..., 2288,  131, 4157],
       [ 295, 1354, 1450, ...,  292, 1100, 4211],
       [ 835, 1583,  218, ...,  937, 2288,   69],
       ...,
       [ 711,  497, 1253, ..., 2288,  396, 3585],
       [ 216,  672,  217, ..., 4997,  782, 3585],
       [ 431,   39,  911, ...,  937, 2288, 3585]], dtype=int32)

In [13]:
# Regression targets as single-column 2-D arrays: the 'rating' values in y
# and the 'rating_count' values in y_rc. Selecting with a one-element list
# keeps the column axis, so each array has shape (n_rows, 1).
y = np.array(genre_content_df[['rating']])
y_rc = np.array(genre_content_df[['rating_count']])
y.shape

(925, 1)

In [0]:
def _lstm_backbone(vocab_size=None, seq_len=None, vector_length=128):
    """Build the layer stack shared by genLSTM and genClassLSTM (no output head).

    Stack: Embedding -> LSTM(64, return_sequences=True) -> Dropout(0.15)
    -> LSTM(32) -> Dropout(0.15).

    Args:
        vocab_size: Embedding input dimension. Defaults to the notebook-level
            ``max_features``.
        seq_len: Length of each input sequence. Defaults to ``X.shape[1]``
            of the notebook-level padded matrix ``X``.
        vector_length: Size of each embedding vector.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model without an output layer.
    """
    # Resolve defaults at call time so the notebook globals are read late and
    # are not needed at all when explicit sizes are passed.
    if vocab_size is None:
        vocab_size = max_features
    if seq_len is None:
        seq_len = X.shape[1]

    model = tf.keras.Sequential()
    model.add(layers.Embedding(vocab_size, vector_length, input_length=seq_len))
    # First LSTM returns the full sequence so the second LSTM can consume it.
    model.add(layers.LSTM(units=64, return_sequences=True))
    model.add(layers.Dropout(0.15))
    model.add(layers.LSTM(units=32))
    model.add(layers.Dropout(0.15))
    return model


def genLSTM(vocab_size=None, seq_len=None):
    """Build, compile and summarize the stacked-LSTM regression model.

    The model ends in a single linear unit and is compiled with mean squared
    error as the loss and the Adam optimizer.

    Args:
        vocab_size: Embedding input dimension; defaults to ``max_features``.
        seq_len: Input sequence length; defaults to ``X.shape[1]``.

    Returns:
        The compiled ``tf.keras.Sequential`` model (its summary is printed
        as a side effect).
    """
    model = _lstm_backbone(vocab_size, seq_len)
    model.add(layers.Dense(units=1))
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        # Same quantity as the loss, so evaluate() reports the value twice.
        metrics=['mse'],
    )
    model.summary()
    return model


def genClassLSTM(n_classes, vocab_size=None, seq_len=None):
    """Build, compile and summarize the stacked-LSTM classification model.

    The model ends in a softmax layer with ``n_classes`` units and is
    compiled with categorical cross-entropy, the Adam optimizer and an
    accuracy metric, so it expects one-hot encoded labels.

    Args:
        n_classes: Number of output classes (width of the one-hot labels).
        vocab_size: Embedding input dimension; defaults to ``max_features``.
        seq_len: Input sequence length; defaults to ``X.shape[1]``.

    Returns:
        The compiled ``tf.keras.Sequential`` model (its summary is printed
        as a side effect).
    """
    model = _lstm_backbone(vocab_size, seq_len)
    model.add(layers.Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [0]:
from sklearn.model_selection import train_test_split

In [16]:
# Rating regression: hold out 30% of the rows with a fixed seed, build the
# regression LSTM and train it for 50 epochs in batches of 64.
# NOTE(review): the held-out split is also passed as validation_data, so the
# evaluate() call that follows scores data already monitored during training;
# there is no separate, untouched test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=73)
rating_model = genLSTM()
%time rating_model.fit(X_train, y_train, shuffle=True, validation_data=(X_test, y_test), verbose=2, batch_size=64, epochs=50)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 128)         640000    
_________________________________________________________________
lstm (LSTM)                  (None, 1000, 64)          49408     
_________________________________________________________________
dropout (Dropout)            (None, 1000, 64)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 701,857
Trainable params: 701,857
Non-trainable params: 0
__________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f5c2e79dd68>

In [17]:
# Returns [loss, mse] on the held-out rows; the two numbers agree because
# the compiled metric is the same mean squared error as the loss.
rating_model.evaluate(X_test, y_test)



[0.13762274384498596, 0.13762274384498596]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y_level, test_size=0.3, random_state=73)
level_model = genClassLSTM(5)
%time level_model.fit(X_train, y_train, shuffle=True, validation_data=(X_test, y_test), verbose=2, batch_size=64, epochs=50)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 128)         640000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 1000, 64)          49408     
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000, 64)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 165       
Total params: 701,989
Trainable params: 701,989
Non-trainable params: 0
________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f5c2ca3a550>

In [19]:
# Returns [categorical cross-entropy loss, accuracy] on the held-out rows.
level_model.evaluate(X_test, y_test)



[1.285625696182251, 0.6582733988761902]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y_genre, test_size=0.3, random_state=73)
genre_model = genClassLSTM(10)
%time genre_model.fit(X_train, y_train, shuffle=True, validation_data=(X_test, y_test), verbose=2, batch_size=64, epochs=50)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 128)         640000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 1000, 64)          49408     
_________________________________________________________________
dropout_4 (Dropout)          (None, 1000, 64)          0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                330       
Total params: 702,154
Trainable params: 702,154
Non-trainable params: 0
________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f5c2c039240>

In [21]:
# Returns [categorical cross-entropy loss, accuracy] on the held-out rows.
# NOTE(review): the captured result (loss ~3.5, accuracy ~0.24) is weak for
# a model trained 50 epochs; presumably it overfits the small training set —
# compare against the per-epoch validation log to confirm.
genre_model.evaluate(X_test, y_test)



[3.500155448913574, 0.2374100685119629]