# Text Classifiers
## This notebook contains various classifiers that can be used on a provided text dataset.

## Additions for transfer learning and model coefficient examination are present in some of the models.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import nltk
import tensorflow as tf
import keras
import sklearn
import csv
import itertools
import matplotlib.pyplot as plt
from sklearn import svm, datasets
import operator
import seaborn as sns
import pickle

### Define the function to generate a detailed confusion matrix:

In [None]:
## This function is called by the classifiers to provide a visualization of the results
class_names = ['False', 'True']
def plot_confusion_matrix(cm, classes, Y_test,
                          predictions,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print and draw a confusion matrix, then print a classification report.

    cm: 2-D array of integer counts; rows are labelled 'True label' and
        columns 'Predicted label' on the plot.
    classes: display names used for the axis ticks and the report.
    Y_test, predictions: true and predicted labels for the report.
    title, cmap: plot title and matplotlib colour map.
    """
    print("Confusion Matrix:")
    print(cm)

    # Start from a clean figure and render the matrix as a heat map.
    plt.clf()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Write each count into its cell; cells above half the maximum count
    # get white text, the rest black.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            text_colour = "white" if count > midpoint else "black"
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

    report = classification_report(Y_test, predictions, target_names = classes)
    print(report)

### (OPTIONAL): If using Google Colab, mount your drive so you can reference a file system for reading in and storing datasets.

In [None]:
#### Set up the Google Drive connection if needed
# Colab-only step: mounts Drive at /content/gdrive, the prefix used by the
# file paths in this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

### Indicate the csv file that you want to read in:
#### Note: The file_in2 variable determines the dataset that will be used for transfer learning (data will be used exclusively for testing).

In [None]:
### Enter filename below:
file_in = "/content/gdrive/My Drive/Colab Notebooks/various_data/praise/praise_data_multisemester_cleaned.csv"
data_col = "REVIEW"
label_col = "TAG"

### For testing on other datasets (cross-domain)
file_in2 = "/content/gdrive/My Drive/Colab Notebooks/amazon_sentiment_reviews_cleaned.csv"
data_col2 = "REVIEW"
label_col2 = "TAG"

# Drop every row with a missing value, then renumber the remaining rows from 0.
# reset_index(drop=True) discards the old index directly instead of inserting
# it as an 'index' column and deleting it again, which also avoids a failure
# when the CSV itself contains a column named 'index'.
df = pd.read_csv(file_in, engine = 'python').dropna().reset_index(drop=True)
df2 = pd.read_csv(file_in2, engine = 'python').dropna().reset_index(drop=True) # Other
print(file_in)
print(df.dtypes)
print("Sample size:", len(df))
df.head(5)

### Setting up the data.

In [None]:
### Define data and label columns
# X/Y come from the main dataset and are what the train/test split uses.
# X2/Y2 come from the second (cross-domain) dataset; in this notebook they are
# only ever passed to predict() and to the confusion-matrix report.
X = df[data_col] # Main dataset text
Y = df[label_col] # Main dataset labels
X2 = df2[data_col2] # Transfer text
Y2 = df2[label_col2] # Transfer labels

In [None]:
### Train and Test splitting
# Hold out 15% of the rows for testing. stratify keeps the label proportions
# the same in both parts; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
 X, Y, test_size=0.15, random_state=42, stratify=df[label_col])
print("Train data amount:", len(X_train))
print("Test data amount:", len(X_test))

## Logistic Regression Classifier:

In [None]:
#### Logistic Regression
# Pipeline: unigram + bigram counts -> TF-IDF weighting -> logistic regression.
# The step names ('vect', 'tfidf', 'clf-log') are the prefixes used by the
# grid-search parameter keys such as 'clf-log__C'.
text_clf_log = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
 ('tfidf', TfidfTransformer()),
 ('clf-log', LogisticRegression(solver='liblinear')),
])

In [None]:
text_clf_log = text_clf_log.fit(X_train,Y_train)

In [None]:
### Function that shows most important features by class (binary)
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the n lowest and n highest feature weights side by side.

    vectorizer: fitted vectorizer exposing get_feature_names_out() (or, on
        older scikit-learn, get_feature_names()).
    clf: fitted classifier exposing coef_ (linear models) or, failing that,
        feature_log_prob_ (naive Bayes models, which no longer expose coef_
        in recent scikit-learn releases).
    n: number of features to print from each end of the ranking.

    Each printed row holds one low-weight feature (left columns) and one
    high-weight feature (right columns).
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on installations that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    if hasattr(clf, 'coef_'):
        weights = clf.coef_[0]
    else:
        # Mirrors what the removed naive Bayes coef_[0] returned: the second
        # class's row for a two-class model, otherwise the first row.
        log_prob = clf.feature_log_prob_
        weights = log_prob[1] if len(log_prob) == 2 else log_prob[0]

    coefs_with_fns = sorted(zip(weights, feature_names))
    # Pair the i-th smallest weight with the i-th largest weight.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [None]:
show_most_informative_features(text_clf_log['vect'], text_clf_log['clf-log'], n=10)

In [None]:
# Score the held-out test split: accuracy is the fraction of test rows whose
# predicted label equals the true label.
predicted_log = text_clf_log.predict(X_test)
accuracy = np.mean(predicted_log == Y_test)
print("Accuracy:", accuracy)
cm = confusion_matrix(Y_test, predicted_log)
print(cm)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_log)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_log,
                      title='Confusion matrix')

### Gridsearch: Logistic Regression

In [None]:
#### Gridsearch
# Keys have the form '<pipeline step name>__<parameter name>'.
# This grid has 2 n-gram ranges x 3 values of C x 3 solvers = 18 combinations.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              #'tfidf__use_idf': (True, False),
              'clf-log__C': (10, 1, 1e-1),
              'clf-log__solver': ('liblinear', 'newton-cg', 'lbfgs'),
    }

In [None]:
text_clf_log.get_params().keys()

In [None]:
## Cross-validation and fit
# 5-fold cross-validation over every combination in `parameters`, using all
# CPU cores (n_jobs=-1). predict() on the fitted search uses the best
# combination, which GridSearchCV refits on the full training split by default.
gs_clf_log = GridSearchCV(text_clf_log, parameters, cv=5, n_jobs=-1)
gs_clf_log = gs_clf_log.fit(X_train,Y_train)
predicted_gs = gs_clf_log.predict(X_test)
accuracy = np.mean(predicted_gs == Y_test)

In [None]:
print("Grid search best score:", gs_clf_log.best_score_)
print(gs_clf_log.best_params_)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gs)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gs,
                      title='Confusion matrix')

In [None]:
###### Transfer Learning Results ######
# Apply the logistic-regression search fitted on the main dataset to the
# cross-domain dataset. The predictions are computed once and shared by the
# confusion matrix and the classification report.
predicted_transfer = gs_clf_log.predict(X2)
cnf_matrix = confusion_matrix(Y2, predicted_transfer)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y2, predictions = predicted_transfer,
                      title='Confusion matrix')

## Random Forest Classifier

In [None]:
#### Random Forest
# Pipeline: unigram counts -> TF-IDF weighting -> random forest.
# NOTE(review): the active configuration sets no random_state, so the fitted
# forest (and its scores) can differ from run to run; the commented-out
# alternative below pinned random_state=42.
text_clf_rfc = Pipeline([('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
#  ('clf-rfc', RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42)),
 ('clf-rfc', RandomForestClassifier(n_estimators=300, max_depth=100)),
])

In [None]:
text_clf_rfc = text_clf_rfc.fit(X_train,Y_train)

In [None]:
predicted_rfc = text_clf_rfc.predict(X_test)
accuracy = np.mean(predicted_rfc == Y_test)
print("Accuracy:", accuracy)
cm = confusion_matrix(Y_test, predicted_rfc)
print(cm)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_rfc)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_rfc,
                      title='Confusion matrix')

### Gridsearch: Random Forest

In [None]:
#### Gridsearch
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              #'tfidf__use_idf': (True, False),
#               'clf-rfc__max_depth': (2, 3, 4),
              'clf-rfc__criterion': ("gini", "entropy"),
#               'clf-rfc__n_estimators': (100, 200, 300),
    }

In [None]:
text_clf_rfc.get_params().keys()

In [None]:
## Cross-validation and fit
gs_clf_rfc = GridSearchCV(text_clf_rfc, parameters, cv=5, n_jobs=-1)
gs_clf_rfc = gs_clf_rfc.fit(X_train,Y_train)
predicted_gs = gs_clf_rfc.predict(X_test)
accuracy = np.mean(predicted_gs == Y_test)

In [None]:
print("Grid search best score:", gs_clf_rfc.best_score_)
print(gs_clf_rfc.best_params_)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gs)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gs,
                      title='Confusion matrix')

In [None]:
###### Transfer Learning Results ######
# Apply the random-forest search fitted on the main dataset to the
# cross-domain dataset. The predictions are computed once and shared by the
# confusion matrix and the classification report.
predicted_transfer = gs_clf_rfc.predict(X2)
cnf_matrix = confusion_matrix(Y2, predicted_transfer)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y2, predictions = predicted_transfer,
                      title='Confusion matrix')

## Multinomial Naive Bayes classifier:

In [None]:
#### Multinomial Naive Bayes
text_clf_mnb = Pipeline([('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf-mnb', MultinomialNB()),
])

In [None]:
text_clf_mnb = text_clf_mnb.fit(X_train,Y_train)

In [None]:
mod = text_clf_mnb['clf-mnb']
count_vect = text_clf_mnb['vect']
# MultinomialNB no longer exposes coef_ in current scikit-learn. For a
# two-class model coef_[0] was the second row of feature_log_prob_, so that
# row is read directly (assumes the labels are binary, as class_names implies).
positive_log_prob = mod.feature_log_prob_[1]
print(max(positive_log_prob))
print(min(positive_log_prob))
# Feature indices ordered from lowest to highest log-probability, per class.
print(mod.feature_log_prob_[0, :].argsort())
print(mod.feature_log_prob_[1, :].argsort())

In [None]:
neg_class_prob_sorted = mod.feature_log_prob_[0, :].argsort()
pos_class_prob_sorted = mod.feature_log_prob_[1, :].argsort()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on installations that predate it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# argsort() is ascending, so [:10] selects the ten terms with the LOWEST
# log-probability in each class.
# NOTE(review): if the most likely terms were intended, use [-10:] instead.
print(np.take(feature_names, neg_class_prob_sorted[:10]))
print(np.take(feature_names, pos_class_prob_sorted[:10]))

In [None]:
show_most_informative_features(text_clf_mnb['vect'], text_clf_mnb['clf-mnb'], n=20)

In [None]:
predicted_mnb = text_clf_mnb.predict(X_test)
accuracy = np.mean(predicted_mnb == Y_test)
print("Accuracy:", accuracy)
cm = confusion_matrix(Y_test, predicted_mnb)
print(cm)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_mnb)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_mnb,
                      title='Confusion matrix')

### Gridsearch: Multinomial Naive Bayes

In [None]:
#### Gridsearch
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'tfidf__use_idf': (True, False),
              'clf-mnb__alpha': (0, 1e-1, 1e-2),
    }

In [None]:
text_clf_mnb.get_params().keys()

In [None]:
## Cross-validation and fit
gs_clf_mnb = GridSearchCV(text_clf_mnb, parameters, cv=5)
gs_clf_mnb = gs_clf_mnb.fit(X_train,Y_train)
predicted_gs = gs_clf_mnb.predict(X_test)
accuracy = np.mean(predicted_gs == Y_test)

In [None]:
print("Grid search best score:", gs_clf_mnb.best_score_)
print(gs_clf_mnb.best_params_)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gs)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gs,
                      title='Confusion matrix')

In [None]:
###### Transfer Learning Results ######
# Apply the multinomial naive Bayes search fitted on the main dataset to the
# cross-domain dataset. The predictions are computed once and shared by the
# confusion matrix and the classification report.
predicted_transfer = gs_clf_mnb.predict(X2)
cnf_matrix = confusion_matrix(Y2, predicted_transfer)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y2, predictions = predicted_transfer,
                      title='Confusion matrix')

## Complement Naive Bayes classifier:

In [None]:
#### Complement Naive Bayes
text_clf_cnb = Pipeline([('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf-cnb', ComplementNB()),
])

In [None]:
text_clf_cnb = text_clf_cnb.fit(X_train,Y_train)

In [None]:
predicted_cnb = text_clf_cnb.predict(X_test)
accuracy = np.mean(predicted_cnb == Y_test)
print("Accuracy:", accuracy)
cm = confusion_matrix(Y_test, predicted_cnb)
print(cm)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_cnb)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_cnb,
                      title='Confusion matrix')

### Gridsearch: Complement Naive Bayes

In [None]:
#### Gridsearch
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'tfidf__use_idf': (True, False),
              'clf-cnb__alpha': (0, 1e-1, 1e-2),
    }

In [None]:
text_clf_cnb.get_params().keys()

In [None]:
## Cross-validation and fit
gs_clf_cnb = GridSearchCV(text_clf_cnb, parameters, cv=5)
gs_clf_cnb = gs_clf_cnb.fit(X_train,Y_train)
predicted_gs = gs_clf_cnb.predict(X_test)
accuracy = np.mean(predicted_gs == Y_test)

In [None]:
print("Grid search best score:", gs_clf_cnb.best_score_)
print(gs_clf_cnb.best_params_)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gs)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gs,
                      title='Confusion matrix')

## AdaBoost Classifier:

In [None]:
#### AdaBoost
text_clf_adb = Pipeline([('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf-adb', AdaBoostClassifier(learning_rate = 0.8, n_estimators = 170)),
])

In [None]:
text_clf_adb = text_clf_adb.fit(X_train,Y_train)

In [None]:
predicted_adb = text_clf_adb.predict(X_test)
accuracy = np.mean(predicted_adb == Y_test)
print("Accuracy:", accuracy)
cm = confusion_matrix(Y_test, predicted_adb)
print(cm)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_adb)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_adb,
                      title='Confusion matrix')

### Gridsearch: AdaBoost

In [None]:
#### Gridsearch
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'tfidf__use_idf': (True, False),
              'clf-adb__n_estimators': (140, 160),
#               'clf-adb__learning_rate': (0.6, 0.8),
    }

In [None]:
text_clf_adb.get_params().keys()

In [None]:
## Cross-validation and fit
gs_clf_adb = GridSearchCV(text_clf_adb, parameters, cv=5)
gs_clf_adb = gs_clf_adb.fit(X_train,Y_train)
predicted_gs = gs_clf_adb.predict(X_test)
accuracy = np.mean(predicted_gs == Y_test)

In [None]:
print("Grid search best score:", gs_clf_adb.best_score_)
print(gs_clf_adb.best_params_)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gs)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gs,
                      title='Confusion matrix')

## Gradient Boosting Classifier:

In [None]:
#### Gradient Boosting
# Pipeline: unigram counts -> TF-IDF weighting -> gradient boosting.
# loss is left at its default on purpose: the default is the same log-loss
# objective that the older spelling loss="deviance" selected, and "deviance"
# is rejected by scikit-learn 1.3+, whereas the default works on every version.
text_clf_gdb = Pipeline([('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf-gdb', GradientBoostingClassifier(learning_rate = 0.3, n_estimators = 150)),
])

In [None]:
text_clf_gdb = text_clf_gdb.fit(X_train,Y_train)

In [None]:
predicted_gdb = text_clf_gdb.predict(X_test)
accuracy = np.mean(predicted_gdb == Y_test)
print("Accuracy:", accuracy)
cm = confusion_matrix(Y_test, predicted_gdb)
print(cm)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gdb)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gdb,
                      title='Confusion matrix')

### Gridsearch: Gradient Boosting

In [None]:
#### Gridsearch
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'tfidf__use_idf': (True, False),
#               'clf-gdb__n_estimators': (150, 170),
              'clf-gdb__learning_rate': (0.2, 0.4),
    }

In [None]:
text_clf_gdb.get_params().keys()

In [None]:
## Cross-validation and fit
gs_clf_gdb = GridSearchCV(text_clf_gdb, parameters, cv=5, n_jobs=-1)
gs_clf_gdb = gs_clf_gdb.fit(X_train,Y_train)
predicted_gs = gs_clf_gdb.predict(X_test)
accuracy = np.mean(predicted_gs == Y_test)

In [None]:
print("Grid search best score:", gs_clf_gdb.best_score_)
print(gs_clf_gdb.best_params_)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gs)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gs,
                      title='Confusion matrix')

## Support Vector Machine classifier:

In [None]:
#### Support Vector Machine
text_clf_svm = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SVC(C=1.0, kernel='linear')), 
                    ])

In [None]:
text_clf_svm = text_clf_svm.fit(X_train,Y_train)

In [None]:
predicted_svm = text_clf_svm.predict(X_test)
accuracy = np.mean(predicted_svm == Y_test)
print("Accuracy:", accuracy)
cm = confusion_matrix(Y_test, predicted_svm)
print(cm)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_svm)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_svm,
                      title='Confusion matrix')

### Gridsearch: Support Vector Machine

In [None]:
#### Gridsearch
## To save time, do sigmoid with degree separately from other kernels
parameters = {
    # 'vect__ngram_range': [(1, 1), (1, 2)],
              #'tfidf__use_idf': (True, False),
              'clf-svm__C': (10, 1, 1e-1),
    }

In [None]:
text_clf_svm.get_params().keys()

In [None]:
## Cross-validation and fit
gs_clf_svm = GridSearchCV(text_clf_svm, parameters, cv=5, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train,Y_train)
predicted_gs = gs_clf_svm.predict(X_test)
accuracy = np.mean(predicted_gs == Y_test)

In [None]:
print("Grid search best score:", gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gs)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gs,
                      title='Confusion matrix')

In [None]:
###### Transfer Learning Results ######
# Apply the SVM search fitted on the main dataset to the cross-domain
# dataset. The predictions are computed once and shared by the confusion
# matrix and the classification report.
predicted_transfer = gs_clf_svm.predict(X2)
cnf_matrix = confusion_matrix(Y2, predicted_transfer)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y2, predictions = predicted_transfer,
                      title='Confusion matrix')

## Stochastic Gradient Descent (SVM) classifier:

In [None]:
#### Stochastic Gradient Descent (SVM hinge loss)
text_clf_sgd = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-sgd', SGDClassifier(loss='hinge', # modified_huber
                                                  penalty='l2',
                                                  max_iter=1000)), 
                    ])

In [None]:
_ = text_clf_sgd.fit(X_train,Y_train)

In [None]:
show_most_informative_features(text_clf_sgd['vect'], text_clf_sgd['clf-sgd'], n=10)

In [None]:
predicted_sgd = text_clf_sgd.predict(X_test)
accuracy = np.mean(predicted_sgd == Y_test)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_sgd)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_sgd,
                      title='Confusion matrix')

### Gridsearch: Stochastic Gradient Descent

In [None]:
#### Gridsearch
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'tfidf__use_idf': (True, False),
              'clf-sgd__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
              'clf-sgd__l1_ratio': (0, 0.15, 1)
              #'clf-sgd__learning_rate': (1, 1e-1, 1e-2, 1e-3, 1e-4),
    }

In [None]:
text_clf_sgd.get_params().keys()

In [None]:
## Cross-validation and fit
gs_clf_sgd = GridSearchCV(text_clf_sgd, parameters, cv=5)
gs_clf_sgd = gs_clf_sgd.fit(X_train,Y_train)
predicted_gs = gs_clf_sgd.predict(X_test)
accuracy = np.mean(predicted_gs == Y_test)

In [None]:
print("Grid search best score:", gs_clf_sgd.best_score_)
print(gs_clf_sgd.best_params_)
print("Accuracy:", accuracy)

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_gs)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_gs,
                      title='Confusion matrix')

## Neural Network Models 3D

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
X = df[data_col]
Y = df[label_col]
train_data, test_data, train_labels, test_labels = train_test_split(
 X, Y, test_size=0.15, random_state=42, stratify=df[label_col])
print("Train data amount:", len(train_data))
print("Test data amount:", len(test_data))

In [None]:
## Initialize train and test data, train and test labels
# The tokenizer is fitted on the training text only; the "<OOV>" token stands
# in for words it has not seen.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_data)
word_index = tokenizer.word_index
vocab_size = len(word_index)
# From here on train_data/test_data hold integer sequences instead of raw text.
sequences = tokenizer.texts_to_sequences(train_data)
test_data = tokenizer.texts_to_sequences(test_data)
train_portion = sequences # for maxlength reference later
# Carve 11% of the training sequences off as a validation set (stratified by label).
train_data, x_val, train_labels, y_val = train_test_split(
 sequences, train_labels, test_size=0.11, random_state=42, stratify=train_labels)
print("Train data amount:", len(train_data))
print("Validation data amount:", len(x_val))
print("Test data amount:", len(test_data))

In [None]:
# pickle.dump(tokenizer, open("/content/gdrive/My Drive/Colab Notebooks/localization_tokenizer", 'wb')) # Use this to save the tokenizer if you want to save the actual model later

In [None]:
max_length = 400 # This number determines the maximum length a comment will be (trimming longer comments and padding shorter comments).
# Count the training sequences that are longer than max_length, i.e. the ones
# that padding with truncation will shorten.
numabove = sum(1 for sequence in train_portion if len(sequence) > max_length)
print(len(train_portion))
print(numabove)

In [None]:
## Pad train-val-test data
# Bring every sequence to exactly max_length tokens: shorter ones are padded
# at the end and longer ones are cut at the end (padding/truncating='post').
train_data = pad_sequences(train_data, maxlen=max_length, padding='post', truncating='post')
x_val = pad_sequences(x_val, maxlen=max_length, padding='post', truncating='post')
test_data = pad_sequences(test_data, maxlen=max_length, padding='post', truncating='post')

In [None]:
## Accommodates Tensorflow version 2
# Convert the padded data and the labels to NumPy arrays before they are
# passed to model.fit / model.evaluate.
train_data = np.array(train_data)
train_labels = np.array(train_labels)
x_val = np.array(x_val)
y_val = np.array(y_val)
test_data = np.array(test_data)
test_labels = np.array(test_labels)

In [None]:
### Load the embeddings file.
# Maps each word to its float32 vector. Every line of the file is read as a
# word followed by whitespace-separated coefficients.
embeddings_index = {}
embeddings_file = "/content/gdrive/My Drive/Colab Notebooks/glove.6B.300d.txt"
# The with-block closes the file even if reading fails part-way through.
with open(embeddings_file, encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # blank line: nothing to parse
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Malformed line: skip it rather than store the previous word's
            # vector under this word.
            continue
        embeddings_index[word] = coefs

In [None]:
### Create the embedding matrix mapping every index in the corpus to its respective embedding vector.
embedding_dim = 300 # The number on the glove file: xxxd
# One row per tokenizer index plus one extra row; rows that never receive a
# pretrained vector stay all zeros.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue  # word has no pretrained vector
    embedding_matrix[row] = vector

In [None]:
### Construct and compile neural network models:

# Alternative architecture (1-D convolution), kept for reference:
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False),
#     tf.keras.layers.Dropout(0.4),
#     tf.keras.layers.Conv1D(64, 5, activation='relu'),
#     tf.keras.layers.GlobalMaxPool1D(),
#     tf.keras.layers.Dense(64, activation=tf.nn.relu),
#     tf.keras.layers.Dropout(0.4),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])

# Active architecture: frozen pretrained embeddings -> bidirectional LSTM ->
# single sigmoid unit (one probability per comment).
model = tf.keras.Sequential([
    # The embedding layer is initialised from embedding_matrix and not trained.
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,recurrent_dropout=0.4)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output.
model.compile(optimizer = 'adam',
             loss = 'binary_crossentropy',
             metrics=['accuracy'])

model.summary()

In [None]:
## Train the model
# history = model.fit(training_sequences, training_labels, epochs=num_epochs, validation_data=(test_sequences, test_labels), verbose=2)

# Train for 4 epochs in batches of 75; the validation split is scored after
# each epoch and the per-epoch metrics are kept in `history`.
history = model.fit(train_data,
                   train_labels,
                   epochs=4,
                   batch_size=75,
                   validation_data=(x_val, y_val),
                   verbose=1)

In [None]:
## Evaluate the model
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output it was equivalent to thresholding the predicted probability
# at 0.5, which is done explicitly here (result shape and dtype are unchanged).
predicted_nn = (model.predict(test_data) > 0.5).astype("int32")
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
results = model.evaluate(test_data, test_labels)
print("Test loss:", results[0])
print("Test Accuracy:", results[1])

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(test_labels, predicted_nn)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = test_labels, predictions = predicted_nn,
                      title='Confusion matrix')

In [None]:
## Optional save
# model.save("/content/gdrive/My Drive/Colab Notebooks/localized_bilstm_model.h5")

### Full load and data transformation.
#### Use the following to load in an already constructed and saved neural network model.
#### Note: The following also contains text cleaning code, meaning this can be used on any new review comment in a working text system.

In [None]:
### If you haven't imported the required files from the start of the notebook, run this.
import keras
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
!pip install pyspellchecker # If not installed
from spellchecker import SpellChecker # The above pyspellchecker refers to this library
import re
import string
import nltk
import pandas as pd
spell = SpellChecker()
nltk.download('words') # If not downloaded
from nltk.corpus import words

In [None]:
### Similar to the cleaning function in other code files, except this only cleans and returns a single comment.
def preprocess_review(reviews):
  """Clean the first usable comment in *reviews* and return it as a string.

  reviews: sequence of comments. Entries that are not strings, or that are
      "" or " ", are ignored.

  Returns the cleaned text of the first remaining entry. The input sequence
  itself is not modified.

  Raises IndexError when no entry is usable.

  The cleaning rules are intentionally left exactly as they were, so that new
  comments are normalised the same way as before.
  """
  raw = next((entry for entry in reviews
              if isinstance(entry, str) and entry != "" and entry != " "), None)
  if raw is None:
    raise IndexError("no non-empty string comment to preprocess")

  text = re.sub(r'[!?]', '.', raw) # Sentence-ending marks all become periods
  text = re.sub(r'[^.a-zA-Z0-9\s]', ' ', text) # Removing special characters
  text = re.sub('\'', ' ', text) # Removing quotes
  text = re.sub('#', '', text) # Removing hash signs
  text = re.sub(r'\d', ' ', text) # Replacing digits by space
  # NOTE(review): inside [...] the '$' is a literal dollar sign, not
  # end-of-string, and this runs before lower-casing, so only lower-case
  # single letters followed by whitespace are removed. Kept as-is.
  text = re.sub(r'\s+[a-z][\s$]', ' ', text) # Removing single characters and spaces alongside
  text = re.sub(r'\s+', ' ', text) # Replacing more than one space with a single space
  if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
    text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
  text = text.lower()
  text = text.rstrip()

  # Fix lone periods in comment: "word ." becomes "word."
  spot = text.find(' .')
  while spot != -1:
    text = text[:spot] + '.' + text[spot + 2:]
    spot = text.find(' .')

  strip_punctuation = str.maketrans('', '', string.punctuation)
  vocabulary = None # NLTK word list as a set; built at most once, on first use
  for word in text.split():
    if word == '.':
      continue
    word_base = word.translate(strip_punctuation)
    if spell.unknown([word_base]):
      recommended = spell.correction(word_base)
      if vocabulary is None:
        vocabulary = set(words.words())
      if recommended in vocabulary:
        # Accept the correction only when it is a known English word.
        text = text.replace(word, recommended, 1)
      else:
        text = text.replace(word, '')
        text = re.sub(r'\s+', ' ', text) # Replacing more than one space with a single space

  text = text.replace('..', '.')
  if text.find('.') == 0:
    # Drop a leading period and the first space in the text.
    text = text.replace('.', '', 1)
    text = text.replace(' ', '', 1)
  return text

In [None]:
def load_items(filepath_model, filepath_tokenizer):
  """Load a saved Keras model and its pickled tokenizer.

  filepath_model: path passed to load_model().
  filepath_tokenizer: path of the pickle file holding the tokenizer.

  Returns the tuple (model, tokenizer).
  """
  model = load_model(filepath_model) # May need to alter filepath
  # NOTE: unpickling can execute code embedded in the file; only load
  # tokenizer files that you created or trust.
  with open(filepath_tokenizer, 'rb') as handle: # May need to alter filepath
    tokenizer = pickle.load(handle)
  return model, tokenizer

In [None]:
def predict_class(new_data, model, tokenizer, maxlength):
  """Clean one raw comment and return the model's predicted class as an int.

  new_data: the raw comment text.
  model: fitted Keras model exposing predict().
  tokenizer: fitted tokenizer exposing texts_to_sequences().
  maxlength: sequence length to pad or truncate the comment to.

  For a single-output model the result is 1 when the predicted probability
  exceeds 0.5 and 0 otherwise; for a multi-output model it is the index of
  the largest output.
  """
  cleaned = preprocess_review([new_data])
  sequences = tokenizer.texts_to_sequences([cleaned])
  padded = pad_sequences(sequences, maxlen=maxlength, padding='post', truncating='post')
  # Sequential.predict_classes() was removed in TensorFlow 2.6; the two
  # branches below reproduce what it returned.
  probabilities = model.predict(padded)
  if probabilities.shape[-1] > 1:
    return int(probabilities.argmax(axis=-1)[0])
  return int(probabilities[0][0] > 0.5)

In [None]:
## Load indicated model and tokenizer
# Example of the full path from raw text to a prediction: load a saved model
# and its tokenizer, then classify one new comment.
filepath_model = "/content/gdrive/My Drive/Colab Notebooks/problems_cnn_model.h5"
filepath_tokenizer = "/content/gdrive/My Drive/Colab Notebooks/problems_tokenizer"
model, tokenizer = load_items(filepath_model, filepath_tokenizer)
new_data = "This code sample could use more comments." # New comment to be transformed and predicted (this is an example).
predicted_comment = predict_class(new_data, model, tokenizer, 400) # 400 required for Problem's model.
print(predicted_comment) # Final prediction of the single comment; can store this instead and use elsewhere.

## TF-IDF Deep Neural Network

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(
 X, Y, test_size=0.1, random_state=35, stratify=df[label_col])
print("Train data amount:", len(X_train))
print("Test data amount:", len(X_test))

In [None]:
X_train, X_val, Y_train, Y_val = train_test_split(
 X_train, Y_train, test_size=0.11, random_state=35, stratify=Y_train)
print("Train data amount:", len(X_train))
print("Val data amount:", len(X_val))

In [None]:
# Build the dense-network inputs in three stages. Each stage is fitted on the
# training split only and then applied unchanged to validation and test data.
# Stage 1: word counts.
vectorizer = CountVectorizer()
x_train_2 = vectorizer.fit_transform(X_train)
x_val_2 = vectorizer.transform(X_val)
x_test_2 = vectorizer.transform(X_test)
# Stage 2: TF-IDF weighting with L2-normalised rows.
transformer = TfidfTransformer(norm = 'l2')
x_train_2 = transformer.fit_transform(x_train_2)
x_val_2 = transformer.transform(x_val_2)
x_test_2 = transformer.transform(x_test_2)
# Stage 3: scale each feature by its maximum absolute training value.
# Despite the variable name, this is a MaxAbsScaler, not a MinMaxScaler.
min_max_scaler = preprocessing.MaxAbsScaler() # test MaxAbsScaler
x_train_2 = min_max_scaler.fit_transform(x_train_2)
x_val_2 = min_max_scaler.transform(x_val_2)
x_test_2 = min_max_scaler.transform(x_test_2)

In [None]:
# Number of distinct terms learned by the vectorizer; used as the input width
# of the dense network. get_feature_names() was removed in scikit-learn 1.2,
# whereas the fitted vocabulary_ mapping is available on every version.
num_features = len(vectorizer.vocabulary_)
print(num_features)

In [None]:
# One hidden layer of 100 ReLU units with an L2 weight penalty, followed by a
# single sigmoid unit (one probability per comment).
model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, input_shape = (num_features,), activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    # tf.keras.layers.Dense(100, input_shape = (num_features,), activation=tf.nn.relu,
    # tf.keras.layers.Dropout(0.50),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output.
model.compile(optimizer = 'adam',
             loss = 'binary_crossentropy',
             metrics=['accuracy'])

model.summary()

In [None]:
## Train the model
history = model.fit(x_train_2,
                   Y_train,
                   epochs=5,
                   batch_size=50,
                   validation_data=(x_val_2, Y_val),
                   verbose=1)

In [None]:
## Evaluate the model
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output it was equivalent to thresholding the predicted probability
# at 0.5, which is done explicitly here (result shape and dtype are unchanged).
predicted_nn = (model.predict(x_test_2) > 0.5).astype("int32")
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
results = model.evaluate(x_test_2, Y_test)
print("Test loss:", results[0])
print("Test Accuracy:", results[1])

In [None]:
## Graphical visualization
cnf_matrix = confusion_matrix(Y_test, predicted_nn)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      Y_test = Y_test, predictions = predicted_nn,
                      title='Confusion matrix')

In [None]:
## Visualize train and validation loss
# [1:] leaves the first epoch out of both curves.
# NOTE(review): the x axis is then numbered from 1, so the point labelled
# "1" is actually the second epoch - confirm this is intended.
loss = history.history['loss'][1:]
val_loss = history.history['val_loss'][1:]

epochs = range(1, len(loss) + 1)

plt.clf()   # clear figure
# Dots: training loss; solid line: validation loss.
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
## Visualize train and validation accuracy
plt.clf()   # clear figure
# The model is compiled with metrics=['accuracy'], which TensorFlow 2 records
# under 'accuracy'/'val_accuracy'; older Keras releases used 'acc'/'val_acc'.
# Use whichever key is present so the cell works on both.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
# [1:] leaves the first epoch out of both curves.
acc = history.history[acc_key][1:]
val_acc = history.history['val_' + acc_key][1:]

epochs = range(1, len(acc) + 1)

# Dots: training accuracy; solid line: validation accuracy.
plt.plot(epochs, acc, 'bo', label='Training Accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()