In [None]:
import pandas as pd
import numpy as np
import re
from time import time

import gensim
from nltk.corpus import stopwords
from scipy.sparse import hstack, csr_matrix

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import StratifiedKFold

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc, f1_score, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import fbeta_score, make_scorer

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import FunctionTransformer

import nlp_utils as utils
from nlp_utils import get_vectorizer

pd.options.display.max_colwidth = 100

# Pretty plots
%matplotlib inline
plt.style.use('seaborn-ticks')
sns.set_style('ticks')
plt.rcParams['figure.figsize'] = (6, 4)
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16

# Display wide columns
pd.options.display.max_colwidth = 100

In [None]:
# Feature-extraction settings handed to `get_vectorizer` in the cells below.
vectorizer_mode = "select features"
params = dict(
    analyzer="word",
    ngram_range=(1, 2),
    use_idf=True,
    mode="select by pvalue",
    thresh=0.001,
)

In [None]:
df = pd.read_csv("./data/rmh_data_prepared.csv")

class_names = ("Controls", "Suicidal ideation", "Self harm")

# Stratified 80/20 hold-out split on the label column `y`.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df.y)

# Down-sample the rows with y == 0 in the training split to a fixed size;
# every other training row is kept. The test split is left untouched.
n_controls = 10000
is_control = df_train.y == 0
controls_sample = df_train[is_control].sample(n_controls, random_state=42)
df_train = pd.concat([controls_sample, df_train[~is_control]], axis=0)

for split in (df_train, df_test):
    print(split.y.value_counts())

**Built-in CV with pipeline**

In [None]:
# Raw text documents (`entities` column) and labels (`y` column) of the
# down-sampled training split.
X = df_train.entities.values
y = df_train.y.values

# Feature extractor built by the project helper from the settings cell.
vectorizer = get_vectorizer(vectorizer_mode, params)

clf = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class="ovr", class_weight="balanced")
# clf = MultinomialNB()

# Vectorizer and classifier are chained, so cross-validation fits both on the
# training part of each fold only.
pipe = make_pipeline(vectorizer, clf)

# scores = cross_val_score(pipe, X, y, cv=10, scoring="f1_macro")

# print("Average score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Out-of-fold predictions for every training sample (10 folds), scored once
# over the whole training set.
y_pred = cross_val_predict(pipe, X, y, cv=10)
print("\nPerformance evaluation:")
print("F1 score:", f1_score(y, y_pred, average="macro"))
print("Classification report:\n", classification_report(y, y_pred, target_names=class_names))

**TESTING**

In [None]:
# Fit the pipeline on the whole (down-sampled) training split.
# NOTE(review): labels are read from the `SISH` column here, while the CV cell
# reads `y` — confirm both columns hold the same encoding.
X_train = df_train.entities.values
y_train = df_train.SISH.values
pipe.fit(X_train, y_train)

# Evaluate on the full, un-resampled hold-out split using class probabilities.
X_test = df_test.entities.values
y_test = df_test.SISH.values
y_proba = pipe.predict_proba(X_test)
utils.evaluate_model(y_test, y_proba, class_names, "full testing")
# print("F1 score: %0.2f" % f1_score(y_test, y_pred, average="macro"))
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [None]:
# Reduced test set: 2500 randomly sampled rows with SISH == 0 plus every row
# with SISH != 0.
df_test_small = pd.concat([df_test[df_test.SISH == 0].sample(2500, random_state=42), 
                           df_test[df_test.SISH != 0]
                          ], axis=0)

print(df_test_small.SISH.value_counts())

# Re-use the already fitted pipeline; note that X_test / y_test / y_proba are
# overwritten and now refer to the reduced test set.
X_test = df_test_small.entities.values
y_test = df_test_small.SISH.values
y_proba = pipe.predict_proba(X_test)
utils.evaluate_model(y_test, y_proba, class_names, "small testing")
# print("F1 score: %0.2f" % f1_score(y_test, y_pred, average="macro"))
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [None]:
# Predict on the full test split first so that the boolean masks below line up
# row-for-row with df_test. (Previously the mask was built from y_test of the
# reduced test set and a y_pred left over from an earlier cell, whose lengths
# do not match df_test.)
y_test_full = df_test.SISH.values
y_pred_full = pipe.predict(df_test.entities.values)

# Remove rows whose true label is 0 but which the model assigns to class 2,
# then re-score the remaining rows.
confused = (y_test_full == 0) & (y_pred_full == 2)
df_test_ = df_test.drop(df_test[confused].index)
X_test = df_test_.entities.values
y_test = df_test_.SISH.values
y_pred = pipe.predict(X_test)
f1_score(y_test, y_pred, average="macro")

In [None]:
confusion_matrix(y_test, y_pred)

**WHY?? Expects the same distribution?**

___
**OTHER CV IMPLEMENTATIONS**

**Vanilla**

In [None]:
X = df_train.entities.values
y = df_train.SISH.values

# Whitespace-delimited tokens, uni- and bigrams, English stop words removed,
# terms appearing in fewer than 2 documents dropped.
vectorizer = TfidfVectorizer(analyzer="word", 
                             stop_words=stopwords.words('english'), 
                             token_pattern=r'\S+',
                             ngram_range=(1,2),
                             min_df=2, 
                             use_idf=True)

# The vectorizer is fitted on ALL training documents before cross-validation,
# so vocabulary and IDF weights also see each fold's validation documents.
X_train = vectorizer.fit_transform(X)

clf = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class="ovr", class_weight="balanced")

# Only the classifier is re-fitted per fold.
scores = cross_val_score(clf, X_train, y, cv=10, scoring="f1_macro")

print("Average score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

**Explicit CV**

In [None]:
X = df_train.entities.values
y = df_train.SISH.values

vectorizer = TfidfVectorizer(analyzer="word", 
                             stop_words=stopwords.words('english'), 
                             token_pattern=r'\S+',
                             ngram_range=(1,2),
                             min_df=2, 
                             use_idf=True)

# As in the "Vanilla" cell, features are extracted once from all training
# documents, before the folds are formed.
X_train = vectorizer.fit_transform(X)

clf = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class="ovr", class_weight="balanced")

cv = StratifiedKFold(n_splits=10)

# Macro-F1 of each fold.
scores = []

# Hand-written equivalent of cross_val_score: fit on the training indices,
# predict the validation indices, record the fold score.
for train_index, val_index in cv.split(X_train, y):
    clf.fit(X_train[train_index], y[train_index])
    y_pred = clf.predict(X_train[val_index])
    scores.append(f1_score(y[val_index], y_pred, average="macro"))

print("Average score: %0.2f (+/- %0.2f)" % (np.asarray(scores).mean(), np.asarray(scores).std() * 2))

**Feature extraction inside CV loop**

In [None]:
X = df_train.entities.values
y = df_train.y.values

# One vectorizer object is created up front and re-fitted in every fold.
# NOTE(review): whether re-fitting fully resets state from the previous fold
# depends on get_vectorizer's implementation, which is not shown here.
vectorizer = get_vectorizer(vectorizer_mode, params)

clf = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class="ovr", class_weight="balanced")

cv = StratifiedKFold(n_splits=10)

# Macro-F1 of each fold.
scores = []

for train_index, val_index in cv.split(X, y):
    # Features are learned from the fold's training documents only ...
    X_train = vectorizer.fit_transform(X[train_index], y[train_index])
    clf.fit(X_train, y[train_index])
    # ... and merely applied to the validation documents.
    X_val = vectorizer.transform(X[val_index])
    y_pred = clf.predict(X_val)
    scores.append(f1_score(y[val_index], y_pred, average="macro"))

print("Average score: %0.4f (+/- %0.2f)" % (np.asarray(scores).mean(), np.asarray(scores).std() * 2))

**Initialise vectorizer inside CV loop**

In [None]:
X = df_train.entities.values
y = df_train.y.values

clf = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class="ovr", class_weight="balanced")

cv = StratifiedKFold(n_splits=10)

# Macro-F1 of each fold.
scores = []

for train_index, val_index in cv.split(X, y):
    # A brand-new vectorizer per fold, so nothing can carry over between folds.
    vectorizer = get_vectorizer(vectorizer_mode, params)
    
    X_train = vectorizer.fit_transform(X[train_index], y[train_index])
    clf.fit(X_train, y[train_index])
    X_val = vectorizer.transform(X[val_index])
    y_pred = clf.predict(X_val)
    scores.append(f1_score(y[val_index], y_pred, average="macro"))

print("Average score: %0.4f (+/- %0.2f)" % (np.asarray(scores).mean(), np.asarray(scores).std() * 2))

**Explicit CV with pipeline**

In [None]:
X = df_train.entities.values
y = df_train.y.values

vectorizer = get_vectorizer(vectorizer_mode, params)

clf = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class="ovr", class_weight="balanced")

# Same steps as the manual loops, but bundled in one estimator.
pipe = make_pipeline(vectorizer, clf)

cv = StratifiedKFold(n_splits=10)

# Macro-F1 of each fold.
scores = []

# The same pipeline object is re-fitted in place on every fold (it is not
# cloned), then used to predict that fold's validation documents.
for train_index, val_index in cv.split(X, y):
    pipe.fit(X[train_index], y[train_index])
    y_pred = pipe.predict(X[val_index])
    scores.append(f1_score(y[val_index], y_pred, average="macro"))

print("Average score: %0.4f (+/- %0.2f)" % (np.asarray(scores).mean(), np.asarray(scores).std() * 2))

**Built-in CV and feature extraction with pipeline**

In [None]:
X = df_train.entities.values
y = df_train.y.values

vectorizer = get_vectorizer(vectorizer_mode, params)

clf = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class="ovr", class_weight="balanced")

pipe = make_pipeline(vectorizer, clf)

# 10-fold cross-validation of the whole pipeline (feature extraction included),
# scored with macro-averaged F1.
scores = cross_val_score(pipe, X, y, cv=10, scoring="f1_macro")

# Reports the mean fold score and twice the standard deviation across folds.
print("Average score: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

___
**CALIBRATION**

**Uncalibrated**

In [3]:
from numpy import mean
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
import nlp_utils as utils

# Synthetic, heavily imbalanced two-feature problem (weights=[0.99], no label
# noise). Overwrites the notebook-level X / y used by the text experiments.
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# scaler = StandardScaler()
# No random_state is set on the tree, so repeated runs may differ.
clf = DecisionTreeClassifier()
# pipe = make_pipeline(scaler, clf)

# NOTE(review): `cv` is created but not used — the call below passes cv=10.
cv = StratifiedKFold(n_splits=10)

scores = cross_validate(clf, X_train, y_train, n_jobs=-1, cv=10, scoring="roc_auc")

print('Mean ROC AUC: %.3f' % mean(scores["test_score"]))


Mean ROC AUC: 0.826


**Calibrated**

In [8]:
%%time

# Same synthetic imbalanced data set and split as in the uncalibrated cell.
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

scaler = StandardScaler()
clf = DecisionTreeClassifier()
# Sigmoid calibration with 3 internal folds, wrapped around the tree.
calibrated = CalibratedClassifierCV(clf, method='sigmoid', cv=3, ensemble=False)
# Scaling and calibration sit inside the pipeline, so the outer CV re-fits both
# on each fold's training part.
pipe = make_pipeline(scaler, calibrated)

scores = cross_validate(pipe, X_train, y_train, n_jobs=-1, cv=10, scoring="roc_auc")

print('Mean ROC AUC: %.3f' % mean(scores["test_score"]))

Mean ROC AUC: 0.826
CPU times: user 26.9 ms, sys: 1.1 ms, total: 28 ms
Wall time: 265 ms


**Calibration curves**

In [None]:
# SVM reliability diagram
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve
from matplotlib import pyplot
# generate 2 class dataset
X, y = make_classification(n_samples=1000, n_classes=2, weights=[1,1], random_state=1)
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)
# fit a model
model = SVC(probability=True)
model.fit(trainX, trainy)
# predict probabilities
probs = model.predict_proba(testX)
# reliability diagram
# predict_proba already returns values in [0, 1], so no rescaling is needed.
# The `normalize` argument of calibration_curve is deprecated/removed in newer
# scikit-learn releases and would also distort genuine probabilities.
fop, mpv = calibration_curve(testy, probs[:,1], n_bins=10)
# plot perfectly calibrated
pyplot.plot([0, 1], [0, 1], linestyle='--')
# plot model reliability (x: mean predicted value, y: fraction of positives)
pyplot.plot(mpv, fop, marker='.')
pyplot.show()

In [None]:
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
%matplotlib inline

# Synthetic, heavily imbalanced two-feature problem (weights=[0.99]).
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=4)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

scaler = StandardScaler()
clf = SVC()
pipe = make_pipeline(scaler, clf)

pipe.fit(X_train, y_train)

# SVC() without probability=True only exposes signed decision values.
y_score = pipe.decision_function(X_test)
# y_proba = pipe.predict_proba(X_test)[:,1]

# calibration_curve needs inputs in [0, 1]. Min-max scale the decision values
# explicitly instead of passing normalize=True, which is deprecated/removed in
# newer scikit-learn releases and performed this same rescaling.
y_proba = (y_score - y_score.min()) / (y_score.max() - y_score.min())

fop, mpv = calibration_curve(y_test, y_proba, n_bins=10)

# Dashed diagonal = perfect calibration; solid line = this model.
plt.plot([0, 1], [0, 1], linestyle='--');
plt.plot(mpv, fop, marker='.');

___
**PROJECTION**

In [None]:
# TF-IDF features for every document in `df` (not only the training split).
vectorizer = TfidfVectorizer(analyzer="word", 
                             stop_words=stopwords.words('english'), 
                             token_pattern=r'\S+',
                             ngram_range=(1,2),
                             min_df=2, 
                             use_idf=True)
                             
X = vectorizer.fit_transform(df.entities)

# Reduce the sparse TF-IDF matrix to 2 components for plotting.
# No random_state is set, so the projection may vary between runs.
svd = TruncatedSVD(n_components=2)
proj = svd.fit_transform(X)

In [None]:
df["proj1"] = proj[:, 0]
df["proj2"] = proj[:, 1]

In [None]:
sns.scatterplot(x="proj1", y="proj2", hue="SISH", data=df[df.SISH != 0]);

___
**TF-IDF VECTORIZER**

In [None]:
# Toy corpus for the vectorizer demos below. The pasted doctest "..."
# continuation prompts were removed so the cell is valid plain Python.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [None]:
# Plain term counts with default settings on the toy corpus.
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus)

# Learned vocabulary (column order of X).
# NOTE(review): get_feature_names() is not available in recent scikit-learn
# releases (get_feature_names_out() replaces it) — confirm the pinned version.
vectorizer.get_feature_names()

In [None]:
X.todense()

In [None]:
np.linalg.norm(X[0, :].todense())

In [None]:
X[0, :].todense() / np.linalg.norm(X[0, :].todense())

In [None]:
# TF-IDF with row normalisation switched off (norm=None), so the matrix shows
# un-normalised tf * idf values.
vectorizer = TfidfVectorizer(
    norm=None,
    use_idf=True)

X = vectorizer.fit_transform(corpus)

# Learned vocabulary (column order of X).
vectorizer.get_feature_names()

In [None]:
X.todense()

In [None]:
# Term counts again, now keeping only terms found in at least 2 documents.
vectorizer = CountVectorizer(
#     stop_words=stopwords.words('english'), 
    min_df=2
)

X = vectorizer.fit_transform(corpus)

# Vocabulary that survives the min_df filter.
vectorizer.get_feature_names()

In [None]:
X.todense()

In [None]:
# Un-normalised TF-IDF restricted to terms found in at least 2 documents.
vectorizer = TfidfVectorizer(
#     stop_words=stopwords.words('english'), 
    min_df=2,
#     ngram_range=(1,2),
    norm=None,
    use_idf=True)

X = vectorizer.fit_transform(corpus)

# Vocabulary that survives the min_df filter.
vectorizer.get_feature_names()

In [None]:
X.todense()

**FEATURE SELECTION**

In [None]:
# Un-normalised TF-IDF (uni- and bigrams, English stop words removed,
# min_df=2) on the first 10 documents only — a small sample for inspection.
vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'), 
    min_df=2,
    ngram_range=(1,2),
    norm=None,
    use_idf=True)

X = vectorizer.fit_transform(df[:10].entities)

vectorizer.get_feature_names()

In [None]:
# Labels of the same 10 documents that were vectorized above.
y = df[:10].y.values

# Keep the 5 features with the highest chi-squared score against y.
selector = SelectKBest(chi2, k=5)

selector.fit(X, y)

# Boolean mask over the columns of X: True for the selected features.
selector.get_support()

In [None]:
feature_names = vectorizer.get_feature_names()

np.asarray(feature_names)[selector.get_support()]

In [None]:
# Route 1: slice the already computed matrix down to the selected columns.
X1 = selector.transform(X)

# Route 2: restrict the vectorizer's vocabulary to the selected feature names
# and fit it again on the same documents. The following cells compare X, X1
# and X2.
vectorizer.set_params(vocabulary=np.asarray(feature_names)[selector.get_support()])

X2 = vectorizer.fit_transform(df[:10].entities)

In [None]:
X

In [None]:
X1

In [None]:
X2

In [None]:
X[:, selector.get_support()].data

In [None]:
X1.data

In [None]:
X2.data

In [None]:
dir(selector)

**FEATURE SELECTOR CLASS**

In [None]:
# Settings for the FeatureSelector class below. This rebinds the notebook-level
# `params`, so earlier cells re-run after this one will see these values.
params = dict(
    word_emb=False,
    model_path="./models/rmh_cleaned_w2v_model.bin",
    analyzer="word",
    ngram_range=(1, 2),
    use_idf=True,
    select_features=False,
    mode="select k best",
    thresh=5,
)

In [None]:
class FeatureSelector(object):
    """TF-IDF vectorizer whose vocabulary is restricted by supervised feature selection.

    ``fit`` vectorizes the documents with the full vocabulary, asks one of the
    ``nlp_utils`` selection helpers which features to keep, and then re-fits
    the vectorizer on exactly those features so that ``transform`` returns only
    the selected columns.

    Parameters
    ----------
    params : dict
        Must contain ``'analyzer'``, ``'ngram_range'`` and ``'use_idf'``
        (forwarded to ``TfidfVectorizer``), ``'mode'`` (``"select k best"`` or
        ``"select by pvalue"``) and ``'thresh'`` (passed as ``k`` or ``alpha``
        to the corresponding helper). Other keys are ignored.

    Attributes
    ----------
    vectorizer : TfidfVectorizer
        The underlying vectorizer.
    df_features : pandas.DataFrame
        Result of the last selection step; its ``feature`` column lists the
        selected feature names. Empty until ``fit`` has been called.
    """

    # Selection modes understood by fit().
    _MODES = ("select k best", "select by pvalue")

    def __init__(self, params):
        self.vectorizer = TfidfVectorizer(analyzer=params['analyzer'],
                                          stop_words=stopwords.words('english'),
                                          token_pattern=r'\S+',
                                          ngram_range=params['ngram_range'],
                                          min_df=2,
                                          use_idf=params['use_idf']
                                         )
        self.mode = params['mode']
        self.thresh = params['thresh']
        self.df_features = pd.DataFrame()

    def _feature_names(self):
        """Return the fitted vocabulary as a list, on old and new scikit-learn."""
        getter = getattr(self.vectorizer, "get_feature_names_out", None)
        if getter is None:
            # scikit-learn < 1.0 only provides the legacy accessor.
            getter = self.vectorizer.get_feature_names
        return list(getter())

    def fit(self, X, y):
        """Select features on ``(X, y)`` and fit the vectorizer on the selection.

        Parameters
        ----------
        X : iterable of str
            Raw documents.
        y : array-like
            Labels aligned with ``X``; forwarded to the selection helper.

        Returns
        -------
        FeatureSelector
            ``self``.

        Raises
        ------
        ValueError
            If ``mode`` is not one of the supported selection modes. The
            underlying vectorizer may also raise if no feature is selected.
        """
        if self.mode not in self._MODES:
            raise ValueError("Unknown feature selection mode %r; expected one of %r"
                             % (self.mode, self._MODES))

        # Start from an unrestricted vocabulary so that re-fitting (e.g. on the
        # next CV fold) does not inherit the selection of a previous fit.
        self.vectorizer.set_params(vocabulary=None)
        X_ = self.vectorizer.fit_transform(X)
        feature_names = self._feature_names()

        if self.mode == "select k best":
            self.df_features = utils.select_k_best(X_, y,
                                                   feature_names,
                                                   k=self.thresh)
        else:  # "select by pvalue"
            self.df_features = utils.select_by_pvalue(X_, y,
                                                      feature_names,
                                                      alpha=self.thresh,
                                                      verbose=False)

        self.vectorizer.set_params(vocabulary=self.df_features.feature.unique())
        # set_params() only changes the constructor argument; the fitted
        # vocabulary_ / IDF weights still describe the full feature space, and
        # transform() would keep using them. Re-fit so the restriction applies.
        self.vectorizer.fit(X)

        return self

    def transform(self, X):
        """Vectorize raw documents ``X`` using only the selected features."""
        return self.vectorizer.transform(X)

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the transformed ``X``."""
        return self.fit(X, y).transform(X)

In [None]:
selector = FeatureSelector(params)
selector

In [None]:
y = df[:10].y.values

selector.fit(df[:10].entities, y)

In [None]:
selector.vectorizer

In [None]:
selector.df_features

In [None]:
# fit_transform requires the labels as well; `y` holds the labels of the same
# 10 documents (defined in the fit cell above).
selector.fit_transform(df[:10].entities, y)