In [None]:
# Required imports
import matplotlib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, accuracy_score
import re
# from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from numpy import loadtxt
from xgboost import XGBClassifier
import joblib

In [None]:
# cleaning data
def clean_post(post):
    """Normalise a raw post into a space-separated string of lowercase tokens.

    Steps, in order: lowercase; turn newlines into spaces; blank out any
    span that opens with ``<`` or ``[`` and closes with ``>`` or ``]``
    (non-greedy; the opening and closing delimiters need not be the same
    kind); replace every character other than ``a-z`` and space with a
    space; drop words of one to three letters; drop words contained in the
    module-level ``stop_words`` set.

    Args:
        post: Raw post text. Any non-string value (for example the float NaN
            that pandas yields for an empty CSV cell, or ``None``) is treated
            as an empty post.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` if none
        survive.
    """
    if not isinstance(post, str):
        # Missing values used to crash on ``.lower()``; treat them as empty.
        return ""
    post = post.lower()
    post = re.sub(r"\n", " ", post)
    # Raw string: the previous non-raw literal depended on invalid escape
    # sequences (backslash before '<', '[', '>' and ']'), which newer Python
    # versions report as a SyntaxWarning. The pattern matches the same text.
    post = re.sub(r"[<\[].*?[>\]]", " ", post)
    post = re.sub(r"[^a-z ]", " ", post)
    # Only a-z and spaces remain here, so \w{1,3} means "1-3 letter word".
    post = re.sub(r"\b\w{1,3}\b", " ", post)
    return " ".join(word for word in post.split() if word not in stop_words)

In [None]:
def get_metrics(y_true, y_pred):
    """Print the classification report and the accuracy for a prediction run.

    Args:
        y_true: Ground-truth labels, forwarded unchanged to the metric
            functions.
        y_pred: Predicted labels, forwarded unchanged to the metric functions.

    Returns:
        None. The two results are written to standard output only.
    """
    # Report first, accuracy second; each metric is computed right before
    # it is printed.
    print('Classification Report: ', classification_report(y_true, y_pred))
    print('Accuracy: ', accuracy_score(y_true, y_pred), "\n\n")

In [None]:
def evaluate_on_test_data(
    test_csv_path='/home/starc52/split_reddit_data/test.csv',
    model_path="/home/starc52/models/XGBoost.sav",
    count_vect=None,
    tfidf_transformer=None,
):
    """Score the saved classifier on the test CSV and return its predictions.

    Reads the CSV, prints a per-class summary of the ``mental_disorder``
    column, cleans the ``post`` column with ``clean_post``, converts it to
    TF-IDF features, label-encodes ``mental_disorder``, loads the model with
    ``joblib.load``, predicts, and prints the metrics via ``get_metrics``.

    Args:
        test_csv_path: CSV file containing ``post`` and ``mental_disorder``
            columns.
        model_path: Path of the serialized model handed to ``joblib.load``.
        count_vect: Optional, already fitted ``CountVectorizer``. When given
            it is applied with ``transform`` so the feature columns come from
            its existing vocabulary. When omitted a new one is fitted on the
            test posts (the previous behaviour) and a warning is emitted.
        tfidf_transformer: Optional, already fitted ``TfidfTransformer``,
            handled the same way as ``count_vect``.

    Returns:
        1-D array holding one predicted class per CSV row, in file order.
    """
    import warnings  # local import so this cell does not depend on another edit

    # Load data. The rows are deliberately not shuffled: the metrics do not
    # depend on row order, and an unseeded shuffle made it impossible to map
    # the returned predictions back to rows of the file.
    data = pd.read_csv(test_csv_path)

    # Class split stats
    print(data.groupby(['mental_disorder'])[['mental_disorder']].describe())
    posts = data['post'].apply(clean_post)

    # Vectorizing text data
    if count_vect is None or tfidf_transformer is None:
        warnings.warn(
            "No fitted CountVectorizer/TfidfTransformer supplied; fitting new "
            "ones on the evaluation data. Their vocabulary and IDF weights "
            "will generally not match the features the model was trained on.",
            stacklevel=2,
        )
    if count_vect is None:
        X_counts = CountVectorizer().fit_transform(posts)
    else:
        X_counts = count_vect.transform(posts)
    if tfidf_transformer is None:
        X = TfidfTransformer().fit_transform(X_counts)
    else:
        X = tfidf_transformer.transform(X_counts)

    # NOTE(review): the encoder is re-fitted on the test labels; this presumably
    # reproduces the training encoding only if the same set of classes occurs
    # in both splits - confirm against the training notebook.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(np.array(data['mental_disorder']))

    # joblib.load unpickles the file, which can execute arbitrary code: only
    # point model_path at files from a trusted source.
    model = joblib.load(model_path)

    # Predict on test dataset
    pred_test = model.predict(X)
    if pred_test.ndim > 1:
        # Per-class scores: collapse to class indices before scoring.
        pred_test = np.argmax(pred_test, axis=1)
    # Previously argmax(axis=1) was applied unconditionally after the metrics
    # call, which raises on the 1-D label array that get_metrics requires.
    get_metrics(y, pred_test)
    print()
    print()
    print('#'*110)
    return pred_test

In [None]:
def get_text_label(
    text,
    count_vect=None,
    tfidf_transformer=None,
    model_path="/home/starc52/models/XGBoost.pickle.dat",
):
    """Predict the class of a single piece of text with the saved model.

    The text is cleaned with ``clean_post``, converted to TF-IDF features,
    and passed to the model loaded from ``model_path`` with ``joblib.load``.

    Args:
        text: Raw text to classify.
        count_vect: Optional, already fitted ``CountVectorizer``. When given
            it is applied with ``transform``. When omitted a new one is
            fitted on this single document (the previous behaviour), so the
            vocabulary consists only of the words in ``text``; a warning is
            emitted in that case.
        tfidf_transformer: Optional, already fitted ``TfidfTransformer``,
            handled the same way as ``count_vect``.
        model_path: Path of the serialized model handed to ``joblib.load``.
            NOTE(review): this default differs from the ``XGBoost.sav`` file
            loaded by ``evaluate_on_test_data`` - confirm which is intended.

    Returns:
        The predicted class for ``text`` (first element of the model's
        prediction array).
    """
    import warnings  # local import so this cell does not depend on another edit

    cleaned = [clean_post(text)]

    # Vectorizing text data
    if count_vect is None or tfidf_transformer is None:
        warnings.warn(
            "No fitted CountVectorizer/TfidfTransformer supplied; fitting new "
            "ones on the single input text. The resulting features will "
            "generally not match the features the model was trained on.",
            stacklevel=2,
        )
    if count_vect is None:
        X_counts = CountVectorizer().fit_transform(cleaned)
    else:
        X_counts = count_vect.transform(cleaned)
    if tfidf_transformer is None:
        X_test = TfidfTransformer().fit_transform(X_counts)
    else:
        X_test = tfidf_transformer.transform(X_counts)

    # Evaluating
    # joblib.load unpickles the file, which can execute arbitrary code: only
    # point model_path at files from a trusted source.
    model = joblib.load(model_path)
    pred_test = model.predict(X_test)
    if pred_test.ndim > 1:
        # Per-class scores: pick the highest-scoring class index per row.
        pred_test = np.argmax(pred_test, axis=1)
    # A 1-D result already holds class labels; the former unconditional
    # argmax(axis=1) raised an axis error on such an array.
    return pred_test[0]

In [None]:
# Run the evaluation on the test CSV (prints the metrics) and keep the array
# of predictions it returns.
pred_labels = evaluate_on_test_data()