In [25]:
# Required imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import pickle
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE
from collections import Counter

# English stop words, held in a set so the per-token `not in stop_words`
# membership test in clean_post is cheap.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download('stopwords')) - confirm in the setup notes.
stop_words = set(stopwords.words('english'))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D, GlobalAveragePooling2D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model, load_model
from keras import regularizers

import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

In [26]:
# cleaning data
def clean_post(post):
    """Normalize a raw post into a space-separated string of content words.

    Steps, applied in order:
      1. lowercase the text;
      2. replace newlines with spaces;
      3. drop anything enclosed between ``<`` or ``[`` and the next ``>`` or
         ``]`` (non-greedy, so markup/bracket tags are removed one at a time);
      4. replace every character other than ``a-z`` and space with a space;
      5. drop words of one to three characters;
      6. drop tokens found in the module-level ``stop_words`` set.

    Parameters
    ----------
    post : str
        Raw post text. Any non-string value (for example ``None`` or the
        float NaN that pandas uses for a missing cell) is treated as an
        empty post instead of raising ``AttributeError``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; may be empty.
    """
    if not isinstance(post, str):
        return ""

    post = post.lower()
    post = re.sub(r"\n", " ", post)
    # Raw string on purpose: "\<", "\[", "\>" and "\]" are invalid escape
    # sequences in a plain string literal and trigger a warning on import.
    post = re.sub(r"[\<\[].*?[\>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join(word for word in post.split() if word not in stop_words)

In [27]:
def get_metrics(ytrue, ypred):
    """Print the classification report and accuracy for a set of predictions.

    Both inputs are collapsed to class indices with ``np.argmax(..., axis=1)``
    before being compared, so each must be 2-D with one row per sample
    (e.g. one-hot targets and per-class scores).

    Parameters
    ----------
    ytrue : array-like
        Reference targets, one row per sample.
    ypred : array-like
        Predicted scores, one row per sample.

    Returns
    -------
    None
        The metrics are only printed.
    """
    true_idx, pred_idx = (np.argmax(rows, axis=1) for rows in (ytrue, ypred))

    print('Classification Report: ', classification_report(true_idx, pred_idx))
    print('Accuracy: ', accuracy_score(true_idx, pred_idx), "\n\n")

In [28]:
def evaluate_on_test_data(test_csv='../split_data/test.csv',
                          tokenizer_path='../models/tokenizer.pkl',
                          model_dir='../models',
                          sampling_types=(1, 3, 5, 6, -1, 0, 2),
                          max_len=200):
    """Evaluate every saved CNN model variant on the held-out test CSV.

    Reads the test set, cleans the ``post`` column with ``clean_post``,
    one-hot encodes the ``mental_disorder`` column, tokenizes and pads the
    posts, then loads each ``CNN_model_<type>`` in turn and prints its
    classification report and accuracy via ``get_metrics``.

    Parameters
    ----------
    test_csv : str
        Path to a CSV containing ``post`` and ``mental_disorder`` columns.
    tokenizer_path : str
        Path to the pickled tokenizer.
    model_dir : str
        Directory holding the saved models named ``CNN_model_<type>``.
    sampling_types : iterable of int
        Model suffixes to evaluate, in order. ``-1`` is reported as
        "Without any oversampling/undersampling"; any other value is
        reported as "With sampling type: <value>".
    max_len : int
        Length every token sequence is padded/truncated to.

    Returns
    -------
    None
        All results are printed.
    """
    # Load data. No shuffling: the report and accuracy are aggregates over
    # all rows, so row order does not affect what is printed.
    data = pd.read_csv(test_csv)

    # Class split stats
    print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())

    posts = data['post'].apply(clean_post)

    # NOTE(review): the encoder is fitted on the test labels here. The integer
    # ids only line up with the models' output columns if training produced
    # the same class set and ordering - confirm, or persist and reuse the
    # encoder fitted at training time.
    label_encoder = LabelEncoder()
    label_ids = label_encoder.fit_transform(np.array(data['mental_disorder']))
    y_test = to_categorical(label_ids)

    # Load tokenizer
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # tokenizer_path at files produced by this project.
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)

    # Convert texts to sequences of integers, then pad/limit to max_len
    sequences_test = tokenizer.texts_to_sequences(posts)
    X_test = pad_sequences(sequences_test, maxlen=max_len)
    print(f'Shape of X test tensor: {X_test.shape}')
    print(f'Shape of y test tensor: {y_test.shape}')

    # Evaluating
    banner = '#' * 110
    for _type in sampling_types:
        model = load_model(f'{model_dir}/CNN_model_{_type}')
        print(banner)
        print()
        if _type == -1:
            print('Without any oversampling/undersampling')
        else:
            print(f'With sampling type: {_type}')
        print()
        print()
        # Predict on test dataset
        pred_test = model.predict(X_test)
        get_metrics(y_test, pred_test)
        print()
        print()
        print(banner)

In [29]:
def get_text_label(text, tokenizer=None, model=None, max_len=200):
    """Predict the class index for a single piece of text.

    The text is cleaned with ``clean_post``, converted to an integer sequence
    by the tokenizer, padded/truncated to ``max_len`` and passed to the model.

    Parameters
    ----------
    text : str
        Raw text to classify.
    tokenizer : object, optional
        An already-loaded tokenizer exposing ``texts_to_sequences``. When
        omitted, it is unpickled from ``../models/tokenizer.pkl``.
    model : object, optional
        An already-loaded model exposing ``predict``. When omitted,
        ``../models/CNN_model_6`` is loaded from disk.
    max_len : int
        Length the token sequence is padded/truncated to.

    Returns
    -------
    numpy integer
        Index of the highest-scoring output column for ``text``. This is a
        class index, not a label name; the index-to-name mapping is not
        defined in this function.

    Notes
    -----
    Loading the tokenizer and model from disk happens on every call unless
    they are supplied, so pass them in when classifying many texts.
    """
    if tokenizer is None:
        # NOTE(security): pickle.load can execute arbitrary code; the file
        # must come from a trusted source.
        with open('../models/tokenizer.pkl', 'rb') as f:
            tokenizer = pickle.load(f)
    if model is None:
        model = load_model('../models/CNN_model_6')

    cleaned = np.array([clean_post(text)])

    # Convert text to a sequence of integers, then pad/limit to max_len
    sequences = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(sequences, maxlen=max_len)

    scores = model.predict(padded)
    return np.argmax(scores, axis=1)[0]

In [None]:
# Run the full test-set evaluation: prints the per-class statistics, the
# tensor shapes, and one metrics section per saved model variant.
evaluate_on_test_data()