In [1]:
#!pip install xgboost
#!pip install ftfy
#!pip install plotly

In [2]:
# Standard Imports
import nltk
import pandas                        as pd
import numpy                         as np
import seaborn                       as sns
import matplotlib.pyplot             as plt
import re
import ftfy
from IPython.display                 import display_html
from IPython.core.display            import display, HTML

# Proprocessing, Modeling, & Evaluation
from nltk.corpus                     import stopwords
from nltk.stem                       import WordNetLemmatizer
from nltk.tokenize                   import RegexpTokenizer 
from sklearn.ensemble                import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model            import LogisticRegression
from sklearn.model_selection         import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics                 import accuracy_score, recall_score, roc_auc_score, matthews_corrcoef
from sklearn.pipeline                import Pipeline
from sklearn.svm                     import SVC
from sklearn.tree                    import DecisionTreeClassifier
from xgboost                         import XGBClassifier

# Custom Modules
import graphs
import metrics


# Notebook settings & styles
sns.set(style = "white", palette = "deep")
display(HTML("<style>.container { width:95% !important; }</style>"))
%matplotlib inline

In [3]:
Annotated_Sample = pd.read_csv('./Reddit Data/All/Reddit Eczema_All_20201201_Sample_Annotated.csv', encoding = "iso-8859-1" )


In [4]:
#Annotated_Sample["body"] =  Annotated_Sample["body"].apply(lambda x: re.sub(r'[^a-z]',' ', str(x)))

In [5]:
#Annotated_Sample ["body"] = Annotated_Sample ["body"].map(ftfy.fix_encoding)

In [6]:
# Downloading the default stopwords

nltk.download("stopwords");

# Adding our stopwords to the English set

#new_stopwords = ["like", "just", "make", "cook","use", "chicken", "recipe", "sauce"]

stopwords     = stopwords.words('english')

#stopwords.extend(new_stopwords)

# Instantiating the lemmatizier and tokenizer
# The tokenizer will only keep text
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
tokenizer  = RegexpTokenizer(r'\w+')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ying.Ying_Zenbook\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ying.Ying_Zenbook\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def cleaning_data(dataframe):
   
    dataframe["body"] =  dataframe["body"].apply(lambda x: re.sub(r'[^a-z]',' ',str(x)))
    
    dataframe ["body"] = dataframe ["body"].map(ftfy.fix_encoding)
    
    if 'Annotation' in dataframe :
        dataframe["label"] = dataframe["Annotation"].apply(lambda x: 1 if x == "R" else 0)
    
    #Annotated_Sample.drop(columns=['Annotation'],inplace=True)
    dataframe.rename(columns={'Author':'author'},inplace=True)
    
    # Setting up the lemmatizer

    lemmatized_posts = []

    for post in dataframe["body"]:
        tokens = tokenizer.tokenize(post)
        post   = [lemmatizer.lemmatize(post) for post in tokens]
        lemmatized_posts.append(" ".join(post))

    # Appending the lemmatized posts to the dataframe

    dataframe["lemmatized_text"] = lemmatized_posts

    #remove URL
    dataframe["lemmatized_text"] = dataframe["lemmatized_text"].str.replace("http\S+", "")
    
    #lower case
    dataframe["lemmatized_text"] = dataframe["lemmatized_text"].str.lower()
    
     #remove none letters
   


    # Checking the head of the dataframe
    #dataframe.head()

    return dataframe


In [8]:
Annotated_Sample = cleaning_data(Annotated_Sample)
Annotated_Sample.head()

Unnamed: 0,author,body,Annotation,label,lemmatized_text
0,Hanniballbearings,t s just one of those things hate that it ...,R,1,t s just one of those thing hate that it doe t...
1,cleverleper,feel you ve thought the same during awful ...,I,0,feel you ve thought the same during awful flar...
2,sd_red_lobster,you can also try asking on r ecze s there h...,R,1,you can also try asking on r ecze s there have...
3,Rumanaaa,ve never heard of this hank you for your i...,I,0,ve never heard of this hank you for your input...
4,touchyfeelies,hanks,I,0,hank


In [9]:
"""
# Generating a list of text lengths

lengths = [len(text) for text in Annotated_Sample["body"]]

# Plotting the text lengths

plt.figure(figsize = (16,6), facecolor = "white")
sns.distplot(lengths, kde = False, bins = 100, color = "black")
plt.axvline(np.mean(lengths), color = "red")
plt.title("Distribution Of Text Length", size = 18)
plt.xlabel("Words", size = 16)
plt.ylabel("Frequency", size = 16)
plt.xticks(np.arange(0,23500,1500), size = 14)
plt.yticks(size = 14);

# The red line marks the mean length
"""

'\n# Generating a list of text lengths\n\nlengths = [len(text) for text in Annotated_Sample["body"]]\n\n# Plotting the text lengths\n\nplt.figure(figsize = (16,6), facecolor = "white")\nsns.distplot(lengths, kde = False, bins = 100, color = "black")\nplt.axvline(np.mean(lengths), color = "red")\nplt.title("Distribution Of Text Length", size = 18)\nplt.xlabel("Words", size = 16)\nplt.ylabel("Frequency", size = 16)\nplt.xticks(np.arange(0,23500,1500), size = 14)\nplt.yticks(size = 14);\n\n# The red line marks the mean length\n'

In [10]:
"""
# Saving the vectorized dfs to a new dataframe
vec = CountVectorizer(stop_words = "english")

# Fit-transforming the vectorizer
vec_sample     = vec.fit_transform(Annotated_Sample["body"])

sample_vectorized     = pd.DataFrame(vec_sample.toarray(), columns = vec.get_feature_names())
sample_vectorized.sum().sort_values(ascending=False)

"""

'\n# Saving the vectorized dfs to a new dataframe\nvec = CountVectorizer(stop_words = "english")\n\n# Fit-transforming the vectorizer\nvec_sample     = vec.fit_transform(Annotated_Sample["body"])\n\nsample_vectorized     = pd.DataFrame(vec_sample.toarray(), columns = vec.get_feature_names())\nsample_vectorized.sum().sort_values(ascending=False)\n\n'

### CountVectorizer

Convert a collection of text documents to a matrix of token counts

<https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>

In [11]:
vec_lem = CountVectorizer(ngram_range =(1,2),stop_words = stopwords)

vec_sample_lem     = vec_lem.fit_transform(Annotated_Sample["lemmatized_text"])

sample_vectorized_lem     = pd.DataFrame(vec_sample_lem.toarray(), columns = vec_lem.get_feature_names())

sample_vectorized_lem.sum().sort_values(ascending=False)





eczema           475
skin             376
wa               355
like             284
get              256
                ... 
peace since        1
peace skin         1
peak               1
peak summer        1
literally day      1
Length: 32432, dtype: int64

### TfidfVectorizer

TF-IDF (term frequency-inverse document frequency) was invented for document search and information retrieval. It works by increasing proportionally to the number of times a word appears in a document, but is offset by the number of documents that contain the word. 


<https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>

In [12]:
tvec_lem = TfidfVectorizer(ngram_range =(1,2),stop_words = stopwords)

tvec_sample_lem     = tvec_lem.fit_transform(Annotated_Sample["lemmatized_text"])

tsample_vectorized_lem     = pd.DataFrame(tvec_sample_lem.toarray(),columns = tvec_lem.get_feature_names())

tsample_vectorized_lem.T[0].sort_values(ascending=False)

fault             0.183883
hile fun          0.183883
high hile         0.183883
nothing person    0.183883
conscious know    0.183883
                    ...   
preying people    0.000000
price             0.000000
price aybe        0.000000
price better      0.000000
aaf               0.000000
Name: 0, Length: 32432, dtype: float64

# Modelling

__Create Test and Training data split__

In [13]:

X = Annotated_Sample["lemmatized_text"]
y = Annotated_Sample["label"]

# The random state ensures reproducability
# The stratify argument preserves the distribution of classes

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 42,
                                                    stratify     = y)

## __Logistic Regression__

The logistic regression is very similar to the linear regression, but it uses a logit function to bend the line so that it can predict either 0 or 1.

The gridsearch will be searching hyperparameters for the vectorizers, not the logistic regression.

__Count Vectorizer__


In [14]:
# Setting up the pipeline

cvec_lr_pipe = Pipeline([("cvec", CountVectorizer()), 
                         ("log_reg", LogisticRegression())])

# Setting the CVEC hyperparameters

cvec_pipe_params = {"cvec__max_features": [None], 
                    "cvec__ngram_range" : [(1,2)], 
                    "cvec__stop_words"  : [stopwords]}

# Instantiating the grid search

cvec_lr_gs = GridSearchCV(cvec_lr_pipe, 
                          param_grid = cvec_pipe_params, 
                          cv         = 10)

# Fitting the model to the training data

cvec_lr_gs.fit(X_train, y_train);

# The futurewarning can be ignored

In [15]:
# Generating training predictions

cvec_lr_train_preds = cvec_lr_gs.predict(X_train)

# Generating test predictions

cvec_lr_preds       = cvec_lr_gs.predict(X_test)

# Generating test probabilities

cvec_lr_probas      = cvec_lr_gs.predict_proba(X_test)

In [16]:
# Training metrics

metrics.binary_classification_summary(y_train, cvec_lr_train_preds)

Unnamed: 0,Score
Accuracy,0.998797
Sensitivity,0.998469
Specificity,1.0
AUROC,0.999234
Matthews Corr. Coef.,0.996439


In [17]:
# Test metrics

metrics.binary_classification_summary(y_test, cvec_lr_preds)

Unnamed: 0,Score
Accuracy,0.827338
Sensitivity,0.890411
Specificity,0.59322
AUROC,0.741816
Matthews Corr. Coef.,0.483631


In [18]:
# Generating a confusion matrix on the test results

metrics.confusion_matrix_dataframe(y_test, 
                                   cvec_lr_preds,
                                   columns = ["Predicted Irrelated", "Predicted Related"],
                                   index   = ["Actual Irrelated", "Actual Related"])

Unnamed: 0,Predicted Irrelated,Predicted Related
Actual Irrelated,35,24
Actual Related,24,195


__TFIDF Vectorization__

In [19]:
# Setting up the pipeline

tvec_lr_pipe = Pipeline([("tvec", TfidfVectorizer()), 
                         ("log_reg", LogisticRegression())])

# Setting TFIDF hyperparameters

tvec_pipe_params = {"tvec__max_features": [None], 
                    "tvec__ngram_range" : [(1,2)], 
                    "tvec__stop_words"  : [stopwords]}
                    
# Instantiating the grid search

tvec_lr_gs = GridSearchCV(tvec_lr_pipe, 
                          param_grid = tvec_pipe_params, 
                          cv         = 10)

# Fitting the model to the training data

tvec_lr_gs.fit(X_train, y_train);

# The warning is a futurewarning and can be ignored

In [20]:
# Generating training predictions

tvec_lr_train_preds = tvec_lr_gs.predict(X_train)

# Generating test predictions

tvec_lr_preds       = tvec_lr_gs.predict(X_test) 

# Generating test probabilities

tvec_lr_probas     = tvec_lr_gs.predict(X_test)

In [21]:

# Training metrics

metrics.binary_classification_summary(y_train, tvec_lr_train_preds)

Unnamed: 0,Score
Accuracy,0.808664
Sensitivity,1.0
Specificity,0.106742
AUROC,0.553371
Matthews Corr. Coef.,0.292985


In [22]:
# Test metrics

metrics.binary_classification_summary(y_test, tvec_lr_preds)

Unnamed: 0,Score
Accuracy,0.798561
Sensitivity,1.0
Specificity,0.050847
AUROC,0.525424
Matthews Corr. Coef.,0.201229


In [23]:
# Generating a confusion matrix on the test results

metrics.confusion_matrix_dataframe(y_test, 
                                   tvec_lr_preds,
                                   columns = ["Predicted Irrelated", "Predicted Related"],
                                   index   = ["Actual Irrelated", "Actual Related"])

Unnamed: 0,Predicted Irrelated,Predicted Related
Actual Irrelated,3,56
Actual Related,0,219


## Support Vector Classifier

A support vector machine (in this case a classifier) is at its core a linear model. However, instead of running like a logistic regression, it seeks to linearly separate the data. To do that, it uses a kernel to raise the data into n-dimensional space. It then uses a line, plane (3-dimensional line), or hyperplane (greater than 3-dimensions) to delineate the data

__Count Vectorizer__


In [24]:
# Setting up the pipeline

cvec_svc_pipe = Pipeline([("cvec", CountVectorizer()), 
                         ("svc", SVC())])

# Setting CVEC and pipe hyperparameters

cvec_pipe_params = {"cvec__max_features": [None], 
                    "cvec__ngram_range" : [(1,2)], 
                    "cvec__stop_words"  : [stopwords],
                    "svc__C"            : [1.0],
                    "svc__kernel"       : ["linear"],
                    "svc__gamma"        : ["auto"]}
                    
# Instantiating the grid search

cvec_svc_gs = GridSearchCV(cvec_svc_pipe, 
                           param_grid = cvec_pipe_params, 
                           cv         = 10)

# Fitting the model to the training data

cvec_svc_gs.fit(X_train, y_train);

In [25]:

# Generating training predictions

cvec_svc_train_preds = cvec_svc_gs.predict(X_train)

# Generating test predictions

cvec_svc_preds       = cvec_svc_gs.predict(X_test)

In [26]:
# Training metrics

metrics.binary_classification_summary(y_train, cvec_svc_train_preds)

Unnamed: 0,Score
Accuracy,1.0
Sensitivity,1.0
Specificity,1.0
AUROC,1.0
Matthews Corr. Coef.,1.0


In [27]:
# Test metrics
print(roc_auc_score(y_test, cvec_svc_preds))

metrics.binary_classification_summary(y_test, cvec_svc_preds)

0.7975388901787788


Unnamed: 0,Score
Accuracy,0.827338
Sensitivity,0.849315
Specificity,0.745763
AUROC,0.797539
Matthews Corr. Coef.,0.543724


In [28]:
# Generating a confusion matrix on the test results

metrics.confusion_matrix_dataframe(y_test, 
                                   cvec_svc_preds,
                                   columns = ["Predicted Irrelated", "Predicted Related"],
                                   index   = ["Actual Irrelated", "Actual Related"])

Unnamed: 0,Predicted Irrelated,Predicted Related
Actual Irrelated,44,15
Actual Related,33,186


**TFIDF Vectorizer**


In [29]:
# Setting up the pipeline

tvec_svc_pipe = Pipeline([("tvec", TfidfVectorizer()), 
                         ("svc", SVC())])

# Setting TFIDF and pipe hyperparameters

tvec_pipe_params = {"tvec__max_features": [None], 
                    "tvec__ngram_range" : [(1,2)], 
                    "tvec__stop_words"  : [stopwords],
                    "svc__C"            : [1.0],
                    "svc__kernel"       : ["linear"],
                    "svc__gamma"        : ["auto"]}
                    
# Instantiating the grid search

tvec_svc_gs = GridSearchCV(tvec_svc_pipe, 
                           param_grid = tvec_pipe_params, 
                           cv         = 10)

# Fitting the model to the training data

tvec_svc_gs.fit(X_train, y_train);

In [30]:
# Generating training predictions

tvec_svc_train_preds = tvec_svc_gs.predict(X_train)

# Generating test predictions

tvec_svc_preds       = tvec_svc_gs.predict(X_test)

In [31]:
# Training metrics

metrics.binary_classification_summary(y_train, tvec_svc_train_preds)

Unnamed: 0,Score
Accuracy,0.99278
Sensitivity,1.0
Specificity,0.966292
AUROC,0.983146
Matthews Corr. Coef.,0.978516


In [32]:
# Test metrics
metrics.binary_classification_summary(y_test, tvec_svc_preds)

Unnamed: 0,Score
Accuracy,0.830935
Sensitivity,0.995434
Specificity,0.220339
AUROC,0.607886
Matthews Corr. Coef.,0.40344


In [33]:
# Generating a confusion matrix on the test results

metrics.confusion_matrix_dataframe(y_test, 
                                   tvec_svc_preds,
                                   columns = ["Predicted Irrelated", "Predicted Related"],
                                   index   = ["Actual Irrelated", "Actual Related"])

Unnamed: 0,Predicted Irrelated,Predicted Related
Actual Irrelated,13,46
Actual Related,1,218


## Random Forest Classifier

A random forest classifier is a decision tree-based classification method. However, it has advantages over other tree-based models. Firstly, it bootstraps the dataframe to have a random subset of the data, but it also takes a random subset of the features. Having two levels of randomness in the model reduces the likelihood of the model being overfit on training data but it also allows the model to be less prone to variance caused by many features.

__Count Vectorizer__

In [34]:
# Creating the pipeline

cvec_rf_pipe = Pipeline([("cvec", CountVectorizer()), 
                         ("rf", RandomForestClassifier(random_state = 42))])

# Setting CVEC and pipeline hyperparameters

cvec_pipe_params = {"cvec__max_features"   : [None], 
                    "cvec__ngram_range"    : [(1,2)], 
                    "cvec__stop_words"     : [stopwords],
                    "rf__n_estimators"     : [72],
                    "rf__min_samples_split": [6],
                    "rf__min_samples_leaf" : [2],
                    "rf__max_depth"        : [20]}

# Instantiating the grid search

cvec_rf_gs = GridSearchCV(cvec_rf_pipe, 
                          param_grid = cvec_pipe_params, 
                          cv         = 10,
                          n_jobs     = 6)

# Fitting the model to the training data

cvec_rf_gs.fit(X_train, y_train);

In [35]:
# Generating training predictions

cvec_rf_train_preds = cvec_rf_gs.predict(X_train)

# Generating test predictions

cvec_rf_preds       = cvec_rf_gs.predict(X_test) 

# Generating test probabilities

cvec_rf_probas      = cvec_rf_gs.predict_proba(X_test)

In [36]:
# Training metrics

metrics.binary_classification_summary(y_train, cvec_rf_train_preds)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Unnamed: 0,Score
Accuracy,0.7858
Sensitivity,1.0
Specificity,0.0
AUROC,0.5
Matthews Corr. Coef.,0.0


In [37]:
# Test metrics

metrics.binary_classification_summary(y_test, cvec_rf_preds)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Unnamed: 0,Score
Accuracy,0.78777
Sensitivity,1.0
Specificity,0.0
AUROC,0.5
Matthews Corr. Coef.,0.0


In [38]:
# Generating a confusion matrix on the test results

metrics.confusion_matrix_dataframe(y_test, 
                                   cvec_rf_preds,
                                   columns = ["Predicted Irrelated", "Predicted Related"],
                                   index   = ["Actual Irrelated", "Actual Related"])

Unnamed: 0,Predicted Irrelated,Predicted Related
Actual Irrelated,0,59
Actual Related,0,219


__TFIDF Vectorizer__


In [39]:
# Creating the pipeline

tvec_rf_pipe = Pipeline([("tvec", TfidfVectorizer()), 
                         ("rf", RandomForestClassifier(random_state = 42))])

# Setting the TVEC and pipeline hyperparameters

tvec_pipe_params = {"tvec__max_features"   : [None], 
                    "tvec__ngram_range"    : [(1,2)], 
                    "tvec__stop_words"     : [stopwords],
                    "rf__n_estimators"     : [30],
                    "rf__min_samples_split": [6],
                    "rf__min_samples_leaf" : [2],
                    "rf__max_depth"        : [12]}

# Instantiating the grid search

tvec_rf_gs = GridSearchCV(tvec_rf_pipe, 
                          param_grid = tvec_pipe_params, 
                          cv         = 5,
                          n_jobs     = 6)

# Fitting the model to the testing data

tvec_rf_gs.fit(X_train, y_train);

In [40]:
# Generating training predictions

tvec_rf_train_preds = tvec_rf_gs.predict(X_train)

# Generating test predictions

tvec_rf_preds       = tvec_rf_gs.predict(X_test) 

# Generating test probabilities

tvec_rf_probas      = tvec_rf_gs.predict_proba(X_test)

In [41]:
# Training metrics

metrics.binary_classification_summary(y_train, tvec_rf_train_preds)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Unnamed: 0,Score
Accuracy,0.7858
Sensitivity,1.0
Specificity,0.0
AUROC,0.5
Matthews Corr. Coef.,0.0


In [42]:
# Test metrics

metrics.binary_classification_summary(y_test, tvec_rf_preds)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Unnamed: 0,Score
Accuracy,0.78777
Sensitivity,1.0
Specificity,0.0
AUROC,0.5
Matthews Corr. Coef.,0.0


In [43]:
# Generating a confusion matrix on the test results

metrics.confusion_matrix_dataframe(y_test, 
                                   tvec_rf_preds,
                                   columns = ["Predicted Irrelated", "Predicted Related"],
                                   index   = ["Actual Irrelated", "Actual Related"])

Unnamed: 0,Predicted Irrelated,Predicted Related
Actual Irrelated,0,59
Actual Related,0,219


## XGBoost Classifier

XGBoost is a tree-based boosting model that iteratively fits tree models on the errors of the previous model and uses gradient descent to help minimize the loss function. Furthermore, the XGBoost is much more computationally efficient and can be parallelized unlike other boosting models.

__Count Vectorizer__

In [44]:
# Creating the pipeline
# The model's best parameters are shown

cvec_xgbc_pipe = Pipeline([("cvec", CountVectorizer()), 
                           ("xgbc", XGBClassifier(n_jobs                = 6,
                                                  seed                  = 42,
                                                  early_stopping_rounds = 10))])

# Setting CVEC and pipeline hyperparameters

cvec_pipe_params = {"cvec__max_features"   : [None], 
                    "cvec__ngram_range"    : [(1,2)], 
                    "cvec__stop_words"     : [stopwords],
                    "xgbc__max_depth"      : [3],
                    "xgbc__learning_rate"  : [0.04],
                    "xgbc__n_estimators"   : [175],
                    "xgbc__gamma"          : [3.0]}

# Instantiating the grid search

cvec_xgbc_gs = GridSearchCV(cvec_xgbc_pipe, 
                            param_grid = cvec_pipe_params, 
                            cv         = 5,
                            n_jobs     = 6)

# Fitting the model to the testing data

cvec_xgbc_gs.fit(X_train, y_train);

Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [45]:

# Generating training predictions

cvec_xgbc_train_preds = cvec_xgbc_gs.predict(X_train)

# Generating test predictions

cvec_xgbc_preds       = cvec_xgbc_gs.predict(X_test) 

# Generating test probabilities

cvec_xgbc_probas      = cvec_xgbc_gs.predict_proba(X_test)

In [46]:
# Training metrics

metrics.binary_classification_summary(y_train, cvec_xgbc_train_preds)

Unnamed: 0,Score
Accuracy,0.821901
Sensitivity,0.998469
Specificity,0.174157
AUROC,0.586313
Matthews Corr. Coef.,0.368065


In [47]:
# Test metrics

metrics.binary_classification_summary(y_test, cvec_xgbc_preds)

Unnamed: 0,Score
Accuracy,0.823741
Sensitivity,0.995434
Specificity,0.186441
AUROC,0.590937
Matthews Corr. Coef.,0.365922


In [48]:
# Generating a confusion matrix on the test results

metrics.confusion_matrix_dataframe(y_test, 
                                   cvec_xgbc_preds,
                                   columns = ["Predicted Irrelated", "Predicted Related"],
                                   index   = ["Actual Irrelated", "Actual Related"])

Unnamed: 0,Predicted Irrelated,Predicted Related
Actual Irrelated,11,48
Actual Related,1,218


__TFIDF Vectorizer__


In [49]:

# Creating the pipeline
# The model's best parameters are shown

tvec_xgbc_pipe = Pipeline([("tvec", TfidfVectorizer()), 
                           ("xgbc", XGBClassifier(n_jobs                = 6,
                                                  seed                  = 42,
                                                  early_stopping_rounds = 10))])

# Setting the TFIDF and pipeline hyperparameters

tvec_pipe_params = {"tvec__max_features"   : [525], 
                    "tvec__ngram_range"    : [(1,2)], 
                    "tvec__stop_words"     : [stopwords],
                    "xgbc__max_depth"      : [3],
                    "xgbc__learning_rate"  : [0.25],
                    "xgbc__n_estimators"   : [139],
                    "xgbc__gamma"          : [1.0]}

# Instantiating the grid search

tvec_xgbc_gs = GridSearchCV(tvec_xgbc_pipe, 
                            param_grid = tvec_pipe_params, 
                            cv         = 5,
                            n_jobs     = 6)

# Fitting the model to the testing data

tvec_xgbc_gs.fit(X_train, y_train);

Parameters: { early_stopping_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [50]:
# Generating training predictions

tvec_xgbc_train_preds = tvec_xgbc_gs.predict(X_train)

# Generating test predictions

tvec_xgbc_preds       = tvec_xgbc_gs.predict(X_test) 

# Generating test probabilities

tvec_xgbc_probas      = tvec_xgbc_gs.predict_proba(X_test)

In [51]:
# Training metrics

metrics.binary_classification_summary(y_train, tvec_xgbc_train_preds)

Unnamed: 0,Score
Accuracy,0.959085
Sensitivity,0.967841
Specificity,0.926966
AUROC,0.947404
Matthews Corr. Coef.,0.880764


In [52]:
# Test metrics

metrics.binary_classification_summary(y_test, tvec_xgbc_preds)

Unnamed: 0,Score
Accuracy,0.827338
Sensitivity,0.885845
Specificity,0.610169
AUROC,0.748007
Matthews Corr. Coef.,0.490058


In [53]:
# Generating a confusion matrix on the test results

metrics.confusion_matrix_dataframe(y_test, 
                                   tvec_xgbc_preds,
                                   columns = ["Predicted Irrelated", "Predicted Related"],
                                   index   = ["Actual Irrelated", "Actual Related"])

Unnamed: 0,Predicted Irrelated,Predicted Related
Actual Irrelated,36,23
Actual Related,25,194


## Evaluation

In [54]:
# This function allows for dataframes to be displayed side-by-side

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(html_str.replace('table', 'table style="display:inline"'), raw = True)

In [55]:
# Count vectorizer metrics

cvec_accuracy          = [accuracy_score(y_test, cvec_lr_preds), 
                          accuracy_score(y_test, cvec_svc_preds),
                          accuracy_score(y_test, cvec_rf_preds), 
                          accuracy_score(y_test, cvec_xgbc_preds)]

cvec_specificity       = [metrics.specificity(y_test, cvec_lr_preds), 
                          metrics.specificity(y_test, cvec_svc_preds),
                          metrics.specificity(y_test, cvec_rf_preds), 
                          metrics.specificity(y_test, cvec_xgbc_preds)]

cvec_sensitivity       = [recall_score(y_test, cvec_lr_preds), 
                          recall_score(y_test, cvec_svc_preds),
                          recall_score(y_test, cvec_rf_preds), 
                          recall_score(y_test, cvec_xgbc_preds)]

cvec_rocauc_score      = [roc_auc_score(y_test, cvec_lr_preds),
                          roc_auc_score(y_test, cvec_svc_preds),
                          roc_auc_score(y_test, cvec_rf_preds),
                          roc_auc_score(y_test, cvec_xgbc_preds)]

cvec_matthews_corrcoef = [matthews_corrcoef(y_test, cvec_lr_preds),
                         matthews_corrcoef(y_test, cvec_svc_preds),
                         matthews_corrcoef(y_test, cvec_rf_preds),
                         matthews_corrcoef(y_test, cvec_xgbc_preds)]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [56]:

# TFIDF vectorizer metrics

tvec_accuracy          = [accuracy_score(y_test, tvec_lr_preds), 
                          accuracy_score(y_test, tvec_svc_preds),
                          accuracy_score(y_test, tvec_rf_preds), 
                          accuracy_score(y_test, tvec_xgbc_preds)]

tvec_specificity       = [metrics.specificity(y_test, tvec_lr_preds), 
                          metrics.specificity(y_test, tvec_svc_preds),
                          metrics.specificity(y_test, tvec_rf_preds), 
                          metrics.specificity(y_test, tvec_xgbc_preds)]

tvec_sensitivity       = [recall_score(y_test, tvec_lr_preds), 
                          recall_score(y_test, tvec_svc_preds),
                          recall_score(y_test, tvec_rf_preds), 
                          recall_score(y_test, tvec_xgbc_preds)]

tvec_rocauc_score      = [roc_auc_score(y_test, tvec_lr_preds),
                          roc_auc_score(y_test, tvec_svc_preds),
                          roc_auc_score(y_test, tvec_rf_preds),
                          roc_auc_score(y_test, tvec_xgbc_preds)]

tvec_matthews_corrcoef = [matthews_corrcoef(y_test, tvec_lr_preds),
                         matthews_corrcoef(y_test, tvec_svc_preds),
                         matthews_corrcoef(y_test, tvec_rf_preds),
                         matthews_corrcoef(y_test, tvec_xgbc_preds)]

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [57]:
# Converting the lists into dataframes

# A dataframe for the CVEC scores

cvec_scores = pd.DataFrame(data    = [cvec_accuracy, cvec_specificity, 
                                      cvec_sensitivity, cvec_rocauc_score, 
                                      cvec_matthews_corrcoef],
                           columns = ["Log. Reg.", "SVC", "Random Forest", "XGBoost"],
                           index   = ["Accuracy", "Specificity", 
                                      "Sensitivity", "AUROC Score", 
                                      "Matthews Corr. Coef."])

# A dataframe for the TVEC scores

tvec_scores = pd.DataFrame(data    = [tvec_accuracy, tvec_specificity, 
                                      tvec_sensitivity, tvec_rocauc_score,
                                      tvec_matthews_corrcoef],
                           columns = ["Log. Reg.", "SVC", "Random Forest", "XGBoost"],
                           index   = ["Accuracy", "Specificity", 
                                      "Sensitivity", "ROC-AUC Score",
                                      "Matthews Corr. Coef."])

In [58]:
# Displaying the two dataframes side by side

display_side_by_side(cvec_scores,
                     tvec_scores)

# The first table is the CVEC scores
# The second table is the TVEC scores

Unnamed: 0,Log. Reg.,SVC,Random Forest,XGBoost
Accuracy,0.827338,0.827338,0.78777,0.823741
Specificity,0.59322,0.745763,0.0,0.186441
Sensitivity,0.890411,0.849315,1.0,0.995434
AUROC Score,0.741816,0.797539,0.5,0.590937
Matthews Corr. Coef.,0.483631,0.543724,0.0,0.365922

Unnamed: 0,Log. Reg.,SVC,Random Forest,XGBoost
Accuracy,0.798561,0.830935,0.78777,0.827338
Specificity,0.050847,0.220339,0.0,0.610169
Sensitivity,1.0,0.995434,1.0,0.885845
ROC-AUC Score,0.525424,0.607886,0.5,0.748007
Matthews Corr. Coef.,0.201229,0.40344,0.0,0.490058


In [None]:

# For cvec models

cvec_predictions = pd.DataFrame([np.array(X_test),np.array(y_test), cvec_lr_preds, cvec_svc_preds, 
                                 cvec_rf_preds, cvec_xgbc_preds],
                                index = ["body","Actual","LgR.", "SVC", "RFC", "XGBC"]).T

# For tvec models

tvec_predictions = pd.DataFrame([np.array(X_test),np.array(y_test), tvec_lr_preds, tvec_svc_preds, 
                                 tvec_rf_preds, tvec_xgbc_preds],
                              index = ["body","Actual","LgR.", "SVC", "RFC", "XGBC"]).T


In [None]:
# Saving the dataframes as csvs

cvec_predictions.to_csv("./Reddit Data/All/cvec_model_predictions_Test3.csv")
tvec_predictions.to_csv("./Reddit Data/All/tvec_model_predctions_Test3.csv")


## Use Prediction Model

In [None]:

test_data= pd.read_csv('./Reddit Data/All/Reddit Eczema_All_20201201_Pred.csv', encoding="iso-8859-1" )

test_data = cleaning_data(test_data)

test_data.head()


In [None]:

def predict(dataframe):

    pred_text= dataframe["lemmatized_text"]
   
    # For cvec models
    cvec_lr_preds = cvec_lr_gs.predict(pred_text) 
    cvec_svc_preds = cvec_svc_gs.predict(pred_text) 
    cvec_rf_preds = cvec_rf_gs.predict(pred_text) 
    cvec_xgbc_preds = cvec_xgbc_gs.predict(pred_text) 
    
    cvec_predictions = pd.DataFrame([np.array(pred_text),cvec_lr_preds, cvec_svc_preds, 
                                 cvec_rf_preds, cvec_xgbc_preds],
                                index = ["body","LgR.", "SVC", "RFC", "XGBC"]).T
    
    # For tvec models
    tvec_lr_preds = tvec_lr_gs.predict(pred_text) 
    tvec_svc_preds = tvec_svc_gs.predict(pred_text) 
    tvec_rf_preds = tvec_rf_gs.predict(pred_text) 
    tvec_xgbc_preds = tvec_xgbc_gs.predict(pred_text) 
    
    
    tvec_predictions = pd.DataFrame([np.array(pred_text),tvec_lr_preds, tvec_svc_preds, 
                                 tvec_rf_preds, tvec_xgbc_preds],
                                index = ["body","LgR.", "SVC", "RFC", "XGBC"]).T
    
    # Saving the dataframes as csvs

    cvec_predictions.to_csv("./Reddit Data/All/cvec_model_predictions_Pred5.csv")
    tvec_predictions.to_csv("./Reddit Data/All/tvec_model_predctions_Pred5.csv")

In [None]:

predict(test_data)
