In [1]:
import numpy as np
import pandas as pd
import os  
import matplotlib.pyplot as plt                                             
import tensorflow as tf                
import zipfile
import helper_functions

In [2]:
!sudo apt-get install unrar

Reading package lists... Done
Building dependency tree       
Reading state information... Done
unrar is already the newest version (1:5.6.6-2build1).
0 upgraded, 0 newly installed, 0 to remove and 22 not upgraded.


In [3]:
!unrar x "/content/drive/MyDrive/Amazon.rar"


UNRAR 5.61 beta 1 freeware      Copyright (c) 1993-2018 Alexander Roshal


Extracting from /content/drive/MyDrive/Amazon.rar

Extracting  train.ft.txt                                                   0%  1%  2%  3%  4%  5%  6%  7%  8%  9% 10% 11% 12% 13% 14% 15% 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31% 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47% 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63% 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79% 80% 81% 82% 83% 84% 85% 86% 87% 88% 89%  OK 
Extracting  test.ft.txt                                              

In [4]:
def get_lines(filename):
  with open(filename, "r") as f:
    return f.readlines()

In [5]:
train_lines = get_lines("/content/train.ft.txt")
train_lines[:3]

['__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n',
 "__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n",
 '__label__2 Amazing!: This s

In [6]:
test_lines = get_lines("/content/test.ft.txt")
test_lines[:3]

['__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"\n',
 "__label__2 One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too 

In [7]:
def make_data_ready(lines):
  labels = []
  texts = []

  label = ""
  text = ""
  for line in lines:
    label, text = line.split(' ', 1)
    if label=="__label__2":
      labels.append(1)
    else:
      labels.append(0)
    texts.append(text)
  alldata = pd.DataFrame({"texts":texts,"label":labels})
  return alldata

In [8]:
train_data = make_data_ready(train_lines)
train_data[:3]

Unnamed: 0,texts,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1


In [9]:
test_data = make_data_ready(test_lines)
test_data[:3]


Unnamed: 0,texts,label
0,Great CD: My lovely Pat has one of the GREAT v...,1
1,One of the best game music soundtracks - for a...,1
2,Batteries died within a year ...: I bought thi...,0


In [10]:
X_train = train_data["texts"]
y_train = train_data["label"]

X_train[:3], y_train[:3]

(0    Stuning even for the non-gamer: This sound tra...
 1    The best soundtrack ever to anything.: I'm rea...
 2    Amazing!: This soundtrack is my favorite music...
 Name: texts, dtype: object, 0    1
 1    1
 2    1
 Name: label, dtype: int64)

In [11]:
X_train = X_train[:int(len(X_train)/10)]
y_train = y_train[:int(len(y_train)/10)]

In [12]:
X_test = test_data["texts"]
y_test = test_data["label"]

X_test[:3], y_test[:3]

(0    Great CD: My lovely Pat has one of the GREAT v...
 1    One of the best game music soundtracks - for a...
 2    Batteries died within a year ...: I bought thi...
 Name: texts, dtype: object, 0    1
 1    1
 2    0
 Name: label, dtype: int64)

In [13]:
max_tokens = 25000

In [14]:
sent_lens = [len(sentence.split()) for sentence in X_train]
avg_sent_lens = np.mean(sent_lens)
avg_sent_lens

79.90115277777778

In [15]:
output_seq_length = int(np.percentile(sent_lens,75))
output_seq_length

110

In [16]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens = max_tokens,
                                    output_sequence_length = output_seq_length,
                                    standardize='lower_and_strip_punctuation')

In [17]:
text_vectorizer.adapt(X_train)

In [19]:
vocab = text_vectorizer.get_vocabulary()
len(vocab)

25000

In [20]:
token_embed = tf.keras.layers.Embedding(input_dim = len(vocab),
                               output_dim = 128,
                               mask_zero = True,
                               name="token_embedding")

In [21]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [22]:
train_dataset = train_dataset.batch(16).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(16).prefetch(tf.data.AUTOTUNE)

In [23]:
inputs = tf.keras.layers.Input(shape=(1,),dtype = tf.string)

text_vectors = text_vectorizer(inputs)
token_embedding = token_embed(text_vectors)
x = tf.keras.layers.GlobalAveragePooling1D()(token_embedding)
outputs = tf.keras.layers.Dense(1,activation="sigmoid", name="output")(x)

model = tf.keras.Model(inputs,outputs)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(train_dataset,
          steps_per_epoch = int(0.1*len(train_dataset)),
          epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe37c2494f0>

In [24]:
model.evaluate(test_dataset)



[0.26612573862075806, 0.8935700058937073]

In [25]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 110)              0         
 torization)                                                     
                                                                 
 token_embedding (Embedding)  (None, 110, 128)         3200000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 output (Dense)              (None, 1)                 129       
                                                                 
Total params: 3,200,129
Trainable params: 3,200,129
Non-train

In [38]:
from keras.regularizers import l2
from keras.constraints import maxnorm


inputs2 = tf.keras.layers.Input(shape=(1,),dtype=tf.string)
text_vectors2 = text_vectorizer(inputs2)
x2 = token_embed(text_vectors2)
x2 = tf.keras.layers.Dropout(0.1)(x2)
x2 = tf.keras.layers.Conv1D(100,kernel_size=3)(x2)
prefilt = tf.keras.layers.Conv1D(100,kernel_size=3)(x2)
x2 = prefilt

for strides in [1,1,2]:
  x2 = tf.keras.layers.Conv1D(100,strides = strides, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_size=3, kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x2)
x_f = tf.keras.layers.LSTM(128, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x2)  
x_b = tf.keras.layers.LSTM(128, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))(x2)

x2 = tf.keras.layers.concatenate([x_f, x_b])
x2 = tf.keras.layers.Dense(64, activation="relu")(x2)
x2 = tf.keras.layers.Dense(1, activation="sigmoid")(x2)

model2 = tf.keras.Model(inputs=inputs2, outputs=x2)

model2.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])

model2.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, 110)         0           ['input_8[0][0]']                
 ization)                                                                                         
                                                                                                  
 token_embedding (Embedding)    (None, 110, 128)     3200000     ['text_vectorization[6][0]']     
                                                                                                  
 dropout_10 (Dropout)           (None, 110, 128)     0           ['token_embedding[6][0]']  

In [39]:
model2.fit(train_dataset,
          epochs=5,
          steps_per_epoch = int(0.1*len(train_dataset)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe2f3f55940>

In [41]:
model2.evaluate(test_dataset)



[0.2713643014431, 0.892002522945404]