In [1]:
import pprint
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import yaml
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from spacy.lang.en.stop_words import STOP_WORDS
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Dropout, BatchNormalization
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Shared spaCy pipeline used by `preprocess`. Only `token.lemma_` is read in
# this notebook, so the dependency parser and the named-entity recogniser are
# disabled: they are not needed for lemmatisation and skipping them makes
# processing every review noticeably cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [2]:
# Step up to the parent directory; presumably this is the project root, since
# the cells below use relative paths such as 'params.yaml' and 'data/...'.
# NOTE(review): not idempotent - running this cell again moves up one more level.
%cd ..

c:\Users\wonde\Desktop\Projects\Speech-Notetaking\dvc_stt


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
# Read config
# 'params.yaml' is resolved against the current working directory.
# YAML is UTF-8 by specification, so the encoding is given explicitly instead
# of falling back to the platform default codec (e.g. cp1252 on Windows).
with open('params.yaml', encoding='utf-8') as conf_file:
    config = yaml.safe_load(conf_file)

# Show the parsed configuration so the keys used by later cells are visible.
pprint.pprint(config)

{'base': {'log_level': 'INFO', 'random_state': 42},
 'data': {'path': 'data/review.csv',
          'processed_path': 'data/processed.csv',
          'test_path': 'data/test.csv',
          'train_path': 'data/train.csv'},
 'evaluate': {'metrics_file': 'reports/metrics.json'},
 'train': {'model_accuracy_path': 'reports/model_accuracy.png',
           'model_checkpoint': 'model/best_model.h5',
           'model_loss_path': 'reports/model_loss.png'}}


### Data: Prepare Data

In [4]:
# Load the raw reviews from the CSV location given in the config (data.path).
raw_csv_path = config['data']['path']
data = pd.read_csv(raw_csv_path)

### Model: Prepare base model and run training

In [5]:
# Remove the rows whose 'reviewText' is missing; all other columns may still
# contain missing values.
data = data.dropna(subset=['reviewText'], axis='index')

In [6]:
def preprocess(string):
    """Lemmatise ``string`` and drop stop words and non-alphabetic tokens.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; an empty string when no
        token survives the filter.
    """
    doc = nlp(string)
    # A token is kept only when BOTH conditions hold. The previous `or` let
    # every alphabetic stop word through (first condition true) as well as
    # every number / punctuation token (second condition true), so almost
    # nothing was filtered. The lemma is lower-cased for the lookup because
    # the entries of STOP_WORDS are lower-case.
    lemmas = [
        token.lemma_
        for token in doc
        if token.lemma_.isalpha() and token.lemma_.lower() not in STOP_WORDS
    ]
    return ' '.join(lemmas)

In [7]:
# Cleaned text for every review, in the same order as the rows of `data`.
X = list(map(preprocess, data['reviewText']))


# Binary target: ratings 1-2 become class 0, ratings 3-5 become class 1.
# A rating outside this table is mapped to NaN by `Series.map`.
rating_to_label = {1: 0, 2: 0, 3: 1, 4: 1, 5: 1}
y = np.array(data['overall'].map(rating_to_label))

In [9]:
# Hold out 10% of the reviews for testing. The seed is read from the config
# (base.random_state) instead of being repeated here as a literal, so the
# notebook follows whatever seed params.yaml defines.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=config['base']['random_state']
)

In [10]:
# Build the vocabulary from the training texts only, then convert both splits
# from strings to lists of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train, X_test = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

In [11]:
# Fixed sequence length fed to the network.
maxlen = 200

# Number of distinct words the tokenizer has indexed.
vocab_size = len(tokenizer.word_index)

# Bring every sequence to `maxlen` entries; padding is appended at the end.
X_train, X_test = [
    pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (X_train, X_test)
]

In [None]:
# Store the cleaned text next to the original columns and write the frame out.
data['preprocessed'] = X
# 'processed_path' lives under the 'data' section of params.yaml; reading it
# from the top level of `config` raised a KeyError.
data.to_csv(config['data']['processed_path'], index=False)

In [None]:
# Binary sentiment classifier: embedding -> dense -> LSTM -> GRU -> dense head.
model = Sequential()
# input_dim is vocab_size + 1 because the tokenizer's word indices start at 1
# and index 0 is used for padding. The embedding size keyword is `output_dim`;
# the previous `output=300` is not an argument of the Embedding layer.
model.add(Embedding(input_dim=vocab_size+1, output_dim=300, input_length=maxlen, trainable=True, name="Input"))
# NOTE(review): the hidden Dense layers are created without an `activation`
# argument, i.e. with the Keras default - confirm that this is intended.
model.add(Dense(300, name="Dense1"))
model.add(Dropout(rate=0.25, name="Dropout1"))
model.add(Dense(128, name="Dense2"))
# The LSTM returns the full sequence so the GRU below receives one vector per
# time step; the GRU returns only its final state.
model.add(LSTM(128, return_sequences=True, dropout=0.15, recurrent_dropout=0.15, name="LSTM"))
model.add(GRU(64, return_sequences=False, dropout=0.15, name="GRU"))
model.add(Dense(64, name="Dense3"))
model.add(Dropout(rate=0.15, name="Dropout2"))
model.add(Dense(32, name="Dense4"))
# Single sigmoid unit: output in [0, 1], paired with binary cross-entropy.
model.add(Dense(1, activation="sigmoid", name="Output"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
# Implement callbacks to handle overfitting
# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# The checkpoint location comes from the config (train.model_checkpoint)
# instead of a hard-coded file name in the working directory. The parent
# directory is created up front so saving does not depend on it existing.
checkpoint_path = Path(config['train']['model_checkpoint'])
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
model_save = ModelCheckpoint(str(checkpoint_path), monitor='val_loss', save_best_only=True)

# 20% of the training data is held back by Keras for validation.
# NOTE(review): EarlyStopping is used without restore_best_weights, so the
# in-memory `model` keeps the weights of the last epoch; the best epoch is
# only available from the checkpoint file.
history = model.fit(X_train, y_train, batch_size=64, epochs=20, validation_split=0.2, callbacks=[early_stopping, model_save])

### Visualization

In [None]:
# Training vs. validation accuracy per epoch, taken from the fit history.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Train')
ax.plot(history.history['val_accuracy'], label='Validate')
# Title, axis labels and legend
ax.set(title='Model accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch, taken from the fit history.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Train')
ax.plot(history.history['val_loss'], label='Validate')
# Title, axis labels and legend
ax.set(title='Model loss', xlabel='Epoch', ylabel='Loss')
ax.legend()
plt.show()

### Evaluate

In [None]:
# Score the model on the held-out test split; with the compile settings used
# in this notebook the result is the loss followed by the accuracy.
model.evaluate(X_test, y_test)

In [None]:
# Free-text sample to classify (left empty here); it goes through the same
# cleaning, tokenising and padding steps as the training data.
sample_text = ""

tes = pad_sequences(
    tokenizer.texts_to_sequences([preprocess(sample_text)]),
    padding="post",
    maxlen=maxlen,
)

In [None]:
# Sigmoid output for the prepared sample: a value in [0, 1], where class 1
# corresponds to ratings 3-5 in the label mapping used for training.
model.predict(tes)