# Instructions to run:
1. Install all the libraries given in the imports.
2. Download the Stack Overflow questions dataset (2016-2020) from Kaggle (https://www.kaggle.com/imoore/60k-stack-overflow-questions-with-quality-rate).
3. Open this notebook in an environment that supports .ipynb (Python notebook) files.
4. Run all the cells and wait for them to finish executing.

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from tensorflow.keras import *
import matplotlib.pyplot as plt
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from plotly import graph_objs as go
from sklearn.metrics import classification_report, plot_confusion_matrix
import random
import numpy as np
import seaborn as sns

In [None]:
def visualize_confusion_matrix(accuracy, labels, val_pred_labels, val_true_labels):
    '''
    Plots a row-normalised confusion matrix as an annotated heatmap.

    Rows are indexed by the predicted class and columns by the true class.
    Every row is divided by its own total, so a cell holds the fraction of
    the samples predicted as the row's class that truly belong to the
    column's class.

    Parameters:
        accuracy: Value interpolated into the plot title.
        labels: Class names in class-index order; their count sets the
            size of the matrix and they are used as tick labels on both axes.
        val_pred_labels: Iterable of predicted integer class indices.
        val_true_labels: Iterable of true integer class indices, paired
            element-wise with val_pred_labels.
    '''
    num_classes = len(labels)
    # int64 keeps per-cell counts from overflowing on large evaluation sets.
    counts = np.zeros((num_classes, num_classes), dtype=np.int64)
    for pred, true in zip(val_pred_labels, val_true_labels):
        counts[pred, true] += 1

    # A class that is never predicted has an all-zero row; leave that row at 0
    # rather than dividing by zero and filling the heatmap with NaN.
    row_totals = counts.sum(axis=1, keepdims=True)
    confusion_matrix = np.divide(
        counts,
        row_totals,
        out=np.zeros(counts.shape, dtype=np.float64),
        where=row_totals != 0,
    )

    plt.figure(figsize=(20, 20))
    sns.set(font_scale=1.5)
    ax = sns.heatmap(confusion_matrix, annot=True, square=True, cmap="YlGnBu", cbar_kws={'label': 'Scale'})
    ax.set_title(f'Confusion Matrix (Accuracy = {accuracy})', fontsize=50)
    ax.set_xticklabels(labels, fontsize=15)
    ax.set_yticklabels(labels, fontsize=15)
    # Columns are the true classes and rows the predicted ones (see indexing above).
    ax.set_xlabel("True Label", fontsize=25)
    ax.set_ylabel("Predicted Label", fontsize=25)
    plt.show()

In [None]:
# Read the two CSV files of the Kaggle dataset. Note that valid.csv is the one
# loaded into `test_data`.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv',
        r'/kaggle/input/60k-stack-overflow-questions-with-quality-rate/valid.csv',
    )
)

# Stack both frames into one; the original row labels are kept as they are.
data = pd.concat([train_data, test_data])

print(
    f'The size of training dataset is: {len(train_data)} samples',
    f'The size of testing dataset is: {len(test_data)} samples',
    f'The combined size of the dataset is: {len(data)} samples',
    sep='\n',
)

In [None]:
# Preview the first rows of the combined frame.
data.head()

In [None]:
# Bar chart of how many samples each class in column Y has.
data['Y'].value_counts().plot(kind='bar')
plt.gca().set(xlabel='Classes', ylabel='Number of samples', title='Dataset size')
plt.show()

In [None]:
# Parse each question body as HTML and keep only its text content in a new
# `Text` column.
data['Text'] = data['Body'].map(lambda html: BeautifulSoup(html, 'html.parser').get_text())
data.head()

In [None]:
# Per-class word counts of the cleaned text, capped at 500 so that very long
# questions all land in the last bin.
HQ, LQ_EDIT, LQ_CLOSE = (
    data.loc[data['Y'] == quality, 'Text'].str.split().map(lambda words: min(len(words), 500))
    for quality in ('HQ', 'LQ_EDIT', 'LQ_CLOSE')
)

# One semi-transparent histogram per class, drawn on top of each other.
fig = go.Figure()
fig.add_traces([
    go.Histogram(x=word_counts, histfunc='avg', name=trace_name, opacity=0.6, histnorm='probability density')
    for trace_name, word_counts in (('HQ', HQ), ('LQ_EDIT', LQ_EDIT), ('LQ_CLOSE', LQ_CLOSE))
])

fig.update_layout(
    barmode='overlay',
    bargap=0.2,
    bargroupgap=0.1,
    title_text='Question word count frequency',
    xaxis_title_text='Word count',
    yaxis_title_text='Frequency',
)
fig.show()

In [None]:
# Hyper-parameters shared by the tokenisation step and both models.
SEQ_LEN = 360         # length every token sequence is padded / truncated to
VOCAB_SIZE = 100_000  # tokenizer word cap and embedding input dimension
EPOCHS = 25           # maximum number of training epochs
BATCH_SIZE = 32       # mini-batch size passed to model.fit
OOV_TOKEN = "<UNK>"   # token the tokenizer substitutes for out-of-vocabulary words

In [None]:
# Two-stage split with a fixed seed: first hold out 20% of the data, then cut
# that held-out part in half to obtain the validation and test sets.
train, remain = train_test_split(data, test_size=0.2, random_state=0)
validation, test = train_test_split(remain, test_size=0.5, random_state=0)

print(
    f'The size of training dataset is: {len(train)} samples',
    f'The size of validation dataset is: {len(validation)} samples',
    f'The size of test dataset is: {len(test)} samples',
    sep='\n',
)

In [None]:
# Map the string class names in column Y to integer ids. The encoder is fitted
# on the labels of the full data set so every split shares one id mapping.
encoder = LabelEncoder()
encoder.fit(data.Y.values)

encoded_train_Y = encoder.transform(train.Y.values)
encoded_valid_Y = encoder.transform(validation.Y.values)
encoded_test_Y = encoder.transform(test.Y.values)

# Model inputs: the HTML-stripped question text of each split.
train_X = train.Text.values
valid_X = validation.Text.values
test_X = test.Text.values

# One-hot encode the targets. The width is pinned to the number of classes the
# encoder knows, because to_categorical otherwise infers it from the largest id
# present and a split missing the highest class would get fewer columns.
train_Y = np_utils.to_categorical(encoded_train_Y, num_classes=len(encoder.classes_))
valid_Y = np_utils.to_categorical(encoded_valid_Y, num_classes=len(encoder.classes_))
test_Y = np_utils.to_categorical(encoded_test_Y, num_classes=len(encoder.classes_))

In [None]:
# Build the vocabulary from the training texts only; words outside it are
# replaced by OOV_TOKEN.
tokens = preprocessing.text.Tokenizer(oov_token=OOV_TOKEN, num_words=VOCAB_SIZE)
tokens.fit_on_texts(list(train_X))

# Turn every text of every split into a list of integer token ids.
train_X, valid_X, test_X = (
    tokens.texts_to_sequences(texts)
    for texts in (train_X, valid_X, test_X)
)

# 95th percentile of the training sequence lengths (before padding).
print(np.quantile(list(map(len, train_X)), 0.95))

# Bring all sequences to exactly SEQ_LEN ids: shorter ones are padded at the
# end, longer ones are cut at the end.
train_X, valid_X, test_X = (
    preprocessing.sequence.pad_sequences(sequences, maxlen=SEQ_LEN, padding='post', truncating='post')
    for sequences in (train_X, valid_X, test_X)
)

In [None]:
# Bidirectional-LSTM classifier: token ids -> 128-d embeddings -> two stacked
# bidirectional LSTM layers -> softmax over the 3 classes.
inputs = Input(shape=(None,), dtype="int32")

x = layers.Embedding(VOCAB_SIZE, 128)(inputs)
# The first recurrent layer returns the whole sequence so the second one can
# consume it; the second returns only its final state.
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)

outputs = layers.Dense(3, activation="softmax")(x)

model = Model(inputs, outputs)

# categorical_crossentropy is the loss that matches one-hot targets with a
# softmax output over mutually exclusive classes. On Keras versions that pick
# the accuracy variant from the loss, it also keeps 'accuracy' from being
# resolved to binary accuracy.
# `learning_rate` is the current name of the optimizer argument (`lr` is deprecated).
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.Adam(learning_rate=1e-4),
              metrics=['accuracy'],)

model.summary()

In [None]:
# All three callbacks watch the validation loss.
my_callbacks = [
    # Write the model to lstm.h5 whenever val_loss reaches a new best.
    callbacks.ModelCheckpoint(filepath='lstm.h5', monitor='val_loss', verbose=2, save_best_only=True),
    # Stop training after 5 epochs without improvement.
    callbacks.EarlyStopping(monitor='val_loss', verbose=2, patience=5),
    # Halve the learning rate after 2 epochs without improvement.
    callbacks.ReduceLROnPlateau(monitor='val_loss', verbose=2, patience=2, factor=0.5),
]

# `lstm` keeps the object returned by fit; its .history is plotted further down.
lstm = model.fit(
    x=train_X,
    y=train_Y,
    validation_data=(valid_X, valid_Y),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=my_callbacks,
    verbose=1,
)

In [None]:
# Score the in-memory model on the test split.
loss, accuracy = model.evaluate(test_X, test_Y)

# Reduce the per-class scores and the one-hot targets to class indices.
pred_Y = model.predict(test_X).argmax(axis=1)
true_Y = test_Y.argmax(axis=1)

# Class names in the encoder's id order, used to label the report rows.
CLASSES = [*encoder.classes_]
print(classification_report(y_true=true_Y, y_pred=pred_Y, target_names=CLASSES))

In [None]:
# Heatmap of the predictions just computed; the accuracy shown in the title is
# rounded to 4 decimals.
visualize_confusion_matrix(
    accuracy=round(accuracy, 4),
    labels=CLASSES,
    val_pred_labels=pred_Y,
    val_true_labels=true_Y,
)

In [None]:
# 1-D convolutional classifier: token ids -> 128-d embeddings -> Conv1D ->
# max-pooling -> flatten -> dense layer -> softmax over the 3 classes.
# The input length is fixed to SEQ_LEN because Flatten followed by Dense needs
# a known sequence length.
inputs = Input(shape=(SEQ_LEN,), dtype="int32")

x = layers.Embedding(VOCAB_SIZE, 128)(inputs)
x = layers.Conv1D(32, 3, padding='same', activation='relu')(x)
x = layers.MaxPooling1D()(x)
x = layers.Flatten()(x)
x = layers.Dense(128, activation='relu')(x)

outputs = layers.Dense(3, activation="softmax")(x)

# Rebinds `model`; from here on it refers to the CNN.
model = Model(inputs, outputs)

# categorical_crossentropy is the loss that matches one-hot targets with a
# softmax output over mutually exclusive classes. On Keras versions that pick
# the accuracy variant from the loss, it also keeps 'accuracy' from being
# resolved to binary accuracy.
# `learning_rate` is the current name of the optimizer argument (`lr` is deprecated).
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.Adam(learning_rate=1e-4),
              metrics=['accuracy'],)

model.summary()

In [None]:
# All three callbacks watch the validation loss.
my_callbacks = [
    # Write the model to cnn.h5 whenever val_loss reaches a new best.
    callbacks.ModelCheckpoint(filepath='cnn.h5', monitor='val_loss', verbose=2, save_best_only=True),
    # Stop training after 5 epochs without improvement.
    callbacks.EarlyStopping(monitor='val_loss', verbose=2, patience=5),
    # Halve the learning rate after 2 epochs without improvement.
    callbacks.ReduceLROnPlateau(monitor='val_loss', verbose=2, patience=2, factor=0.5),
]

# `cnn` keeps the object returned by fit; its .history is plotted further down.
cnn = model.fit(
    x=train_X,
    y=train_Y,
    validation_data=(valid_X, valid_Y),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=my_callbacks,
    verbose=1,
)

In [None]:
# Score the in-memory model on the test split.
loss, accuracy = model.evaluate(test_X, test_Y)

# Reduce the per-class scores and the one-hot targets to class indices.
pred_Y = model.predict(test_X).argmax(axis=1)
true_Y = test_Y.argmax(axis=1)

# Class names in the encoder's id order, used to label the report rows.
CLASSES = [*encoder.classes_]
print(classification_report(y_true=true_Y, y_pred=pred_Y, target_names=CLASSES))

In [None]:
# Heatmap of the predictions just computed; the accuracy shown in the title is
# rounded to 4 decimals.
visualize_confusion_matrix(
    accuracy=round(accuracy, 4),
    labels=CLASSES,
    val_pred_labels=pred_Y,
    val_true_labels=true_Y,
)

In [None]:
# Per-epoch accuracy of both runs on one figure: blue = LSTM, green = CNN,
# solid = training, dotted = validation.
plt.figure(figsize=(20, 10))
plt.plot(lstm.history['accuracy'], 'b-', label='LSTM train')
plt.plot(lstm.history['val_accuracy'], 'b:', label='LSTM validation')
plt.plot(cnn.history['accuracy'], 'g-', label='CNN train')
plt.plot(cnn.history['val_accuracy'], 'g:', label='CNN validation')
plt.gca().set(
    title='Training and validation accuracy',
    xlabel='Number of epochs',
    ylabel='Accuracy',
)
plt.legend()
plt.show()

In [None]:
# Per-epoch loss of both runs on one figure: blue = LSTM, green = CNN,
# solid = training, dotted = validation.
plt.figure(figsize=(20, 10))
plt.plot(lstm.history['loss'], 'b-', label='LSTM train')
plt.plot(lstm.history['val_loss'], 'b:', label='LSTM validation')
plt.plot(cnn.history['loss'], 'g-', label='CNN train')
plt.plot(cnn.history['val_loss'], 'g:', label='CNN validation')
plt.gca().set(
    title='Training and validation loss',
    xlabel='Number of epochs',
    ylabel='Loss',
)
plt.legend()
plt.show()