In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.metrics import (recall_score,accuracy_score, 
precision_score, confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
import seaborn as sb
from mlxtend.plotting import plot_learning_curves
from mlxtend.data import mnist_data
from mlxtend.preprocessing import shuffle_arrays_unison
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
# Fetches every NLTK corpus/model. The cells visible here only read the English
# stop-word list via nltk.corpus.stopwords.
# NOTE(review): if no other cell needs more resources, a narrower download
# would be much faster - confirm before changing.
nltk.download('all')
import string
from nltk.corpus import stopwords

In [None]:
# Load the raw reviews table; later cells read its 'Text' and 'Score' columns.
df = pd.read_csv(
    filepath_or_buffer='/content/sample_data/Reviews_1.csv',
)

In [None]:
# Keep only the first occurrence of each distinct review text so repeated
# reviews are not counted more than once downstream.
# `subset` is passed as a list: pandas documents it as a column label or a
# sequence of labels, and an unordered, non-indexable set is not guaranteed
# to be accepted by every pandas version.
df = df.drop_duplicates(subset=["Text"], keep='first')

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of once per review.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _clean_review(text, stop_words):
  """Normalise one raw review string.

  Steps, in order: drop digit characters, lowercase, remove URLs, delete
  ASCII punctuation, then drop stop words.

  Parameters
  ----------
  text : str
      Raw review text.
  stop_words : collection of str
      Words to discard; a set gives constant-time membership tests.

  Returns
  -------
  str
      The remaining words joined by single spaces.
  """
  text = ''.join(ch for ch in text if not ch.isdigit())
  text = text.lower()
  # `https?\S+` consumes the whole URL. The previous pattern `http\S` matched
  # only "http" plus one character and left the rest of the URL in the text.
  text = re.sub(r"https?\S+|www\S+", "", text)
  text = text.translate(_PUNCTUATION_TABLE)
  return ' '.join(word for word in text.split() if word not in stop_words)


cachedStopWords = stopwords.words("english")
# Set copy of the stop-word list for fast lookups; the list itself is kept
# under its original name for any other cell that uses it.
_stop_word_set = set(cachedStopWords)

# One cleaned string per row of df['Text'], in the same row order.
sentences = [_clean_review(review, _stop_word_set) for review in df['Text']]

sentences_df = pd.DataFrame(sentences, columns=['reviews'])

In [None]:
# Vocabulary size: every distinct whitespace-separated token that appears in
# the cleaned reviews.
words = {
  token
  for review in sentences_df['reviews']
  for token in review.split()
}

num_words = len(words)
print(len(words))
print(num_words)

In [None]:
# Hold out 20% of the cleaned reviews (with their 'Score' labels) for testing;
# random_state is fixed so the split is the same on every run.
(training_sentences, testing_sentences,
 training_labels, testing_labels) = train_test_split(
  sentences_df['reviews'].values,
  df['Score'].values,
  test_size=.20,
  random_state=0,
)
print(training_sentences.shape)
print(testing_sentences.shape)

In [None]:
# tokenizer and glove embedding
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the word -> integer index, with "<OOV>" standing in for unknown words.
# The tokenizer is fitted on both splits, so word_index covers the training
# and the testing sentences.
# NOTE(review): the vocabulary therefore includes words seen only in the
# held-out split - confirm this is intended.
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
for corpus in (training_sentences, testing_sentences):
  tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
print(len(word_index))

# Encode each sentence as a list of word indices ...
traning_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# ... then bring every sequence to a common length (maxlen=500) for the model.
glove_train = pad_sequences(traning_sequences, maxlen=500)
glove_test = pad_sequences(testing_sequences, maxlen=500)

In [None]:
# Parse the pre-trained GloVe text file into a {word: float32 vector} lookup.
# Each line is expected to hold a word followed by its vector components,
# separated by whitespace.
embedding_dict = {}
# The encoding is given explicitly because the default used by open() depends
# on the platform locale.
# NOTE(review): assumes the GloVe file is UTF-8 - confirm.
with open('/content/sample_data/glove.twitter.27B.25d.txt', 'r', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    if not values:
      # Blank line: nothing to parse (values[0] would raise IndexError).
      continue
    word = values[0]
    vectors = np.asarray(values[1:], 'float32')
    embedding_dict[word] = vectors
# No explicit f.close(): the with-statement has already closed the file.

In [None]:
# Number of rows in the embedding table: one more than the number of distinct
# words. It is recomputed from `words` rather than incremented in place
# (`num_words += 1`), so re-running this cell always yields the same size
# instead of growing the table by one row on every execution.
num_words = len(words) + 1
embedding_matrix = np.zeros((num_words, 25))

# Copy the GloVe vector of every indexed word into its row. Words without a
# GloVe entry keep the all-zero row they were initialised with.
for word, i in word_index.items():
  # word_index may hold indices beyond the table size; those are skipped.
  if i < num_words:
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
      embedding_matrix[i] = emb_vec

In [None]:
# Display the assembled embedding matrix as the cell output.
embedding_matrix

In [None]:
# Spot check: the integer index the tokenizer assigned to the word 'separate'
# (raises KeyError if the word is not in word_index).
word_index['separate']

In [None]:
# Spot check: the GloVe vector loaded for 'separate' (None if the word has no
# entry in embedding_dict).
embedding_dict.get('separate')

In [None]:
#LSTM with glove embedding
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

# Take the layer dimensions from the matrix that initialises it, so the
# Embedding layer can never disagree with embedding_matrix.
vocab_rows, embedding_dim = embedding_matrix.shape

model = Sequential()
# Frozen embedding layer (trainable=False) initialised with the GloVe matrix.
model.add(Embedding(vocab_rows, embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=500, trainable=False))
model.add(LSTM(100, dropout=0.1))
# Single sigmoid unit: the model emits one value in (0, 1) per review.
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(learning_rate=3e-4)
# - binary_crossentropy is the loss that matches a one-unit sigmoid head;
#   categorical_crossentropy over a single output is degenerate.
# - The configured Adam instance is passed in; the string 'adam' would
#   silently ignore the learning rate set above.
# NOTE(review): this assumes the labels are 0/1. If 'Score' has more than two
# values, binarise the labels or switch to a softmax head - confirm.
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() also printed "None".
model.summary()

In [None]:
# to_categorical is imported from the public keras.utils namespace; the
# keras.utils.np_utils module path is not available in newer Keras releases.
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
epochs = 5
batch_size = 1000

# Train on the padded training sequences. batch_size is now passed through;
# previously it was defined above but never given to fit(), so Keras fell back
# to its default batch size.
# Validation metrics are computed on the test split (glove_test / testing_labels).
history = model.fit(glove_train, training_labels,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(glove_test, testing_labels))
