In [1]:
!pip install -q keras
!pip install -U -q PyDrive
!pip install -q h5py
!pip install -q hyperas
import pandas as pd
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPooling1D, CuDNNGRU, Dropout, BatchNormalization, Activation
from sklearn.model_selection import train_test_split
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
import keras.optimizers
import os
from google.colab import files
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from html import unescape

Using TensorFlow backend.


In [2]:
filepath = 'weights_best.hdf5'
model_name = 'rnn_model_twitter.h5'

def authenticate_drive():
  # Authenticate and create the PyDrive client.
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)
  
  return drive

dl_weight = input("Do you want to download the weights from google drive? ")
if dl_weight == 'y' or dl_weight == 'Y':
  drive = authenticate_drive()
    
  #download weights from google drive
  weight_id = '15N9WaKK5PBet_0MX7EnyxQ8UuXOPuTSW'
  weight_drive_file = drive.CreateFile({'id': weight_id})
  weight_drive_file.GetContentFile(filepath)

Do you want to download the weights from google drive? y


In [0]:
!mkdir data
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip && unzip glove.twitter.27B.zip -d data/
!wget http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip && unzip Sentiment-Analysis-Dataset.zip -d data/

In [0]:
#load data to the working environment
data_path = os.path.join(os.path.expanduser('~'), 'data', 'Sentiment Analysis Dataset.csv')
dataset = pd.read_csv(data_path, error_bad_lines=False, encoding='utf-8')
dataset.dropna(axis=0, inplace=True)
dataset = dataset.rename(index=str, columns={"SentimentText": "text", "Sentiment": "sentiment"})

embedding_path = os.path.join(os.path.expanduser('~'), 'data', 'glove.twitter.27B.100d.txt')
dim_size = 100

#take sample of it
dataset = dataset.sample(frac=0.25, random_state=42)

In [0]:
#preprocess dataset
eyes = r"[8:=;]"
nose = r"['`\-]?"

#decode html entities
dataset.text = dataset.text.apply(lambda x: unescape(x))

#fix this
# ã?Ÿã?„ã?“ã‚Œã‚“ã?—ã‚…ã?† at index 1502

dataset['text'] = dataset['text']\
.str.replace(r'https?:\/\/\S+\b|www\.(\w+\.)+\S*', '<url>')\
  .str.replace(r'@\w+', '<user>')\
  .str.replace(r'{}{}[)dD]+|[)dD]+{}{}'.format(eyes, nose, nose, eyes), '<smile>')\
  .str.replace(r'{}{}p+'.format(eyes, nose), '<lolface>')\
  .str.replace(r'{}{}\(+|\)+{}{}'.format(eyes, nose, nose, eyes), '<sadface>')\
  .str.replace(r'{}{}[\/|l*]'.format(eyes, nose), '<neutralface>')\
  .str.replace(r'/',' / ')\
  .str.replace(r'<3','<heart>')\
  .str.replace(r'[-+]?[.\d]*[\d]+[:,.\d]*', '<number>')\
  .str.replace(r'#\S+', '<hashtag>')\
  .str.replace(r'([!?.]){2,}', r'\1 <repeat>')\
  .str.replace(r'\b(\S*?)(.)\2{2,}\b', r'\1\2 <elong>')
  
#source: https://gist.github.com/tokestermw/cb87a97113da12acb388

In [0]:
#prep the dataset
#filter these things from the text
token = text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\]^_`{|}~\t\n')
max_len = dataset['text'].str.len().max()

#learn the vocabulary from all the text
token.fit_on_texts(list(dataset['text']))
vocab_size = len(token.word_index) + 1

#this might produce some error, test these 2 lines
x_train, x_test, y_train, y_test = train_test_split(dataset['text'], dataset['sentiment'], test_size=0.015, shuffle=False, random_state=42)

y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)

#encode
x_train_enc = token.texts_to_sequences(x_train)
x_test_enc = token.texts_to_sequences(x_test)

#add zero padding
x_train_enc_pad = sequence.pad_sequences(x_train_enc, maxlen=max_len)
x_test_enc_pad = sequence.pad_sequences(x_test_enc, maxlen=max_len)

In [0]:
#create embedding dictionary
embeddings_index = dict()
f = open(embedding_path)
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()

#map the vocabulary to it's word embedding
embedding_matrix = np.zeros((vocab_size, dim_size))
for word, i in token.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector


In [8]:
#checking if data works correctly
index = random.randint(1, x_train.shape[0])

print('Preprocessed sentence')
print(dataset.iloc[index, 3], end='\n\n')
print('Encoded text')
print(x_train_enc[index], end='\n\n')


res = dict((v,k) for k,v in token.word_index.items())
for num in x_train_enc[index]:
  print(res.get(num), end=' ')

print()
print('Zero padding')
print(x_train_enc_pad[index])

Preprocessed sentence
<user> Those kebabs were rather good. Thanks heaps 

Encoded text
[1, 284, 28479, 160, 773, 33, 85, 3599]

<user> those kebabs were rather good thanks heaps 
Zero padding
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
    

In [11]:
#deep learning
finished_training = False
model = Sequential()
model.add(Embedding(vocab_size, dim_size, input_length=max_len, weights=[embedding_matrix], trainable=False))

model.add(CuDNNGRU(64, return_sequences=True))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(GlobalMaxPooling1D())

model.add(Dense(32))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))

#gru 64 dense 32 underfit loss: 0.3738 - val_loss: 0.4187

model.add(Dense(2, activation='sigmoid'))
use_existing_weights = True
if use_existing_weights:
  print('Training network using existing weights')
  model.load_weights(filepath)
else:
  print('Training network from scratch')

#hyperparameter is a mess, tune it
#lr, batch_size, epoch, dropout, maybe some decay, hidden units is maybe too small
opt = keras.optimizers.Adam(lr=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

#am i even using mini batch right now, i'm not sure lmao

#grid search here, try lr value, dropout, layer size, do it for 20-30 epoch
#glove dimension also affects overfitting by the way
earlystop = EarlyStopping(monitor='val_loss', patience=25, verbose=1, mode='auto')
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callback_lists = [earlystop, checkpoint]

model.fit(x_train_enc_pad, y_train, epochs=300, batch_size=512, callbacks=callback_lists, verbose=1, validation_split=0.015)
scores = model.evaluate(x_test_enc_pad, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

model.save(model_name)
finished_training = True

Training network using existing weights
Train on 382902 samples, validate on 5831 samples
Epoch 1/300


Epoch 00001: val_loss improved from inf to 0.42178, saving model to weights_best.hdf5
Epoch 2/300


Epoch 00002: val_loss improved from 0.42178 to 0.41091, saving model to weights_best.hdf5
Epoch 3/300


Epoch 00003: val_loss did not improve
Epoch 4/300


Epoch 00004: val_loss did not improve
Epoch 5/300


Epoch 00005: val_loss improved from 0.41091 to 0.40872, saving model to weights_best.hdf5
Epoch 6/300
 82944/382902 [=====>........................] - ETA: 1:15 - loss: 0.3843 - acc: 0.8268


Epoch 00006: val_loss did not improve
Epoch 7/300


Epoch 00007: val_loss did not improve
Epoch 8/300


Epoch 00008: val_loss did not improve
Epoch 9/300


Epoch 00009: val_loss did not improve
Epoch 10/300


Epoch 00010: val_loss did not improve
Epoch 11/300


Epoch 00011: val_loss did not improve
Epoch 12/300


Epoch 00012: val_loss did not improve
Epoch 13/300


Epoch 00013: val_loss did not improve
Epoch 14/300


Epoch 00014: val_loss did not improve
Epoch 15/300


Epoch 00015: val_loss did not improve
Epoch 16/300


Epoch 00016: val_loss did not improve
Epoch 17/300


Epoch 00017: val_loss did not improve
Epoch 18/300


Epoch 00018: val_loss did not improve
Epoch 19/300


Epoch 00019: val_loss did not improve
Epoch 20/300


Epoch 00020: val_loss did not improve
Epoch 21/300


Epoch 00021: val_loss did not improve
Epoch 22/300


Epoch 00022: val_loss did not improve
Epoch 23/300


Epoch 00023: val_loss did not improve
Epoch 24/300


Epoch 00024: val_loss did not improve
Epoch 25/300


Epoch 00025: val_loss did not improve
Epoch 26/300


Epoch 00026: val_loss did not improve
Epoch 27/300


Epoch 00027: val_loss did not improve
Epoch 28/300


Epoch 00028: val_loss did not improve
Epoch 29/300


Epoch 00029: val_loss did not improve
Epoch 30/300


Epoch 00030: val_loss did not improve
Epoch 00030: early stopping

acc: 79.26%


In [0]:
weight_choice = input('Do you want to upload the weight into google drive? ')
if weight_choice == 'y' or weight_choice == 'Y':
  drive = authenticate_drive()

  #upload process error
  weight_upload = drive.CreateFile({'title': filepath, 'id': '15N9WaKK5PBet_0MX7EnyxQ8UuXOPuTSW'})
  weight_upload.SetContentFile(filepath)
  weight_upload.Upload()
  print('Uploaded file with ID {}'.format(weight_upload.get('id')))
  print('Done uploading weights')

model_choice = input('Do you want to upload the model into google drive? ')
if (model_choice == 'y' or model_choice == 'Y') and finished_training:
  model_upload = drive.CreateFile({'title': model_name})
  model_upload.SetContentFile(model_name)
  model_upload.Upload()
  print('Uploaded file with ID {}'.format(model_upload.get('id')))
  print('Done uploading model')

In [0]:
#using the model

done = False
while done == False:
  sentence = input('Input sentence: ')
  
  sentence_encoding = token.texts_to_sequences([sentence])
  padded_sentence = sequence.pad_sequences(sentence_encoding, maxlen=max_len)
  
  prediction = model.predict(np.array(padded_sentence))
  if prediction[0] == 1:
    print(prediction[0])
    print('Positive')
  elif prediction[0] == 0:
    print(prediction[0])
    print('Negative')
  
  finish = input('Do you still want to input another text? [y/n]')
  if finish == 'N' or finish == 'n':
    done = True