In [29]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD

from sklearn.model_selection import cross_val_score

path_name = 'your_path_name_here/mental_health.csv'
df=pd.read_csv(path_name)
print(df)

df = df.drop_duplicates()

#Split the matrix into predictors and response 
msg=df["text"]
msg=msg.str.replace('[^a-zA-Z0-9]+'," h", regex = True) 
y=df['label'].values
y 


#Stemming and Tokenising
stemmer=PorterStemmer()
msg=msg.apply(lambda line:[stemmer.stem(token.lower()) for token in word_tokenize(line)]).apply(lambda token:" ".join(token))
msg=msg.apply(lambda line:[token for token in word_tokenize(line) if len(token)>2]).apply(lambda y:" ".join(y))

#Vectorising
tf=TfidfVectorizer() 
data_vec=tf.fit_transform(msg)

#Train, Validation, and Test Split ###RANDOMSTATE=1###
from sklearn.model_selection import train_test_split
#test_size=0.3 implies 70% will be used to training and 30% for testing. 
#random_state sets seed for random number generator, ensuring reducibility
x_train,x_test,y_train,y_test=train_test_split(data_vec,y,test_size=0.3,random_state=1)

                                                    text  label
0      dear american teens question dutch person hear...      0
1      nothing look forward lifei dont many reasons k...      1
2      music recommendations im looking expand playli...      0
3      im done trying feel betterthe reason im still ...      1
4      worried  year old girl subject domestic physic...      1
...                                                  ...    ...
27972  posting everyday people stop caring  religion ...      0
27973  okay definetly need hear guys opinion ive pret...      0
27974  cant get dog think ill kill myselfthe last thi...      1
27975  whats point princess bridei really think like ...      1
27976  got nudes person might might know snapchat do ...      0

[27977 rows x 2 columns]


In [30]:
#Latent Semantic Analysis (aka Truncated SVD)  #2 Components

#Find principal components and transform data
lsa=TruncatedSVD(n_components=2,random_state=1)
x_train_lsa= lsa.fit_transform(x_train)

modelLR = LogisticRegression(penalty='none')

modelLR.fit(x_train_lsa,y_train)

# Transform the test set as well 
x_test_lsa = lsa.transform(x_test)

# Make predictions on a new set of data
predictions = modelLR.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR.score(x_test_lsa, y_test) 
print("Accuracy on the test set:", accuracy_on_test_set)
print(classification_report(y_test,predictions))

Accuracy on the test set: 0.85569590085796
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      4271
           1       0.87      0.82      0.85      4121

    accuracy                           0.86      8392
   macro avg       0.86      0.86      0.86      8392
weighted avg       0.86      0.86      0.86      8392



In [31]:
#Logistic Lasso with 5 fold Cross Validation on LSA reduced data (2 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_lasso = LogisticRegression(penalty='l1', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_lasso, x_train_lsa, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Print or inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_lasso = LogisticRegression(penalty='l1', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_lasso.fit(x_train_lsa,y_train)


# Make predictions on a new set of data
predictions = modelLR_lasso.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_lasso.score(x_test_lsa, y_test) #accuaracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 2.5118864315096747
Best lambda value (regularisation value) selected by cross-validation: 0.39810717055348227
Accuracy on the test set: 0.8541468064823642
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      4271
           1       0.88      0.82      0.85      4121

    accuracy                           0.85      8392
   macro avg       0.86      0.85      0.85      8392
weighted avg       0.86      0.85      0.85      8392



In [32]:
#Logistic Ridge with 5 fold Cross Validation on LSA reduced data (2 components)


alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_Ridge = LogisticRegression(penalty='l2', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_Ridge, x_train_lsa, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Print or inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_Ridge = LogisticRegression(penalty='l2', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_Ridge.fit(x_train_lsa,y_train)


# Make predictions on a new set of data
predictions = modelLR_Ridge.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_Ridge.score(x_test_lsa, y_test) #accuaracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 1258.9254117941869
Best lambda value (regularisation value) selected by cross-validation: 0.0007943282347242692
Accuracy on the test set: 0.853789323164919
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      4271
           1       0.87      0.82      0.85      4121

    accuracy                           0.85      8392
   macro avg       0.86      0.85      0.85      8392
weighted avg       0.86      0.85      0.85      8392



In [33]:
#Latent Semantic Analysis (aka Truncated SVD)  #4 Components

#Find principal components and transform data
lsa=TruncatedSVD(n_components=4, random_state=1)
x_train_lsa= lsa.fit_transform(x_train)

modelLR = LogisticRegression(penalty='none')

modelLR.fit(x_train_lsa,y_train)

# Transform the test set as well 
x_test_lsa = lsa.transform(x_test)

# Make predictions on a new set of data
predictions = modelLR.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR.score(x_test_lsa, y_test) 
print("Accuracy on the test set:", accuracy_on_test_set)
print(classification_report(y_test,predictions))

Accuracy on the test set: 0.8690419447092469
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      4271
           1       0.89      0.84      0.86      4121

    accuracy                           0.87      8392
   macro avg       0.87      0.87      0.87      8392
weighted avg       0.87      0.87      0.87      8392



In [34]:
#Logistic Lasso with 5 fold Cross Validation on LSA reduced data (4 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_lasso = LogisticRegression(penalty='l1', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_lasso, x_train_lsa, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Print or inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_lasso = LogisticRegression(penalty='l1', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_lasso.fit(x_train_lsa,y_train)


# Make predictions on a new set of data
predictions = modelLR_lasso.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_lasso.score(x_test_lsa, y_test)
print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 125.89254117941971
Best lambda value (regularisation value) selected by cross-validation: 0.007943282347242626
Accuracy on the test set: 0.8688036224976168
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      4271
           1       0.89      0.84      0.86      4121

    accuracy                           0.87      8392
   macro avg       0.87      0.87      0.87      8392
weighted avg       0.87      0.87      0.87      8392



In [35]:
#Logistic Ridge with 5 fold Cross Validation on LSA reduced data (4 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_Ridge = LogisticRegression(penalty='l2', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_Ridge, x_train_lsa, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Print or inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_Ridge = LogisticRegression(penalty='l2', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_Ridge.fit(x_train_lsa,y_train)


# Make predictions on a new set of data
predictions = modelLR_Ridge.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_Ridge.score(x_test_lsa, y_test) 
print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 99999.99999999999
Best lambda value (regularisation value) selected by cross-validation: 1e-05
Accuracy on the test set: 0.8689227836034319
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      4271
           1       0.89      0.84      0.86      4121

    accuracy                           0.87      8392
   macro avg       0.87      0.87      0.87      8392
weighted avg       0.87      0.87      0.87      8392



In [36]:
#Latent Semantic Analysis (aka Truncated SVD)  #6 Components

#Find principal components and transform data
lsa=TruncatedSVD(n_components=6, random_state=1)
x_train_lsa= lsa.fit_transform(x_train)

modelLR = LogisticRegression(penalty='none')

modelLR.fit(x_train_lsa,y_train)

# Transform the test set as well 
x_test_lsa = lsa.transform(x_test)

# Make predictions on a new set of data
predictions = modelLR.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR.score(x_test_lsa, y_test) 
print("Accuracy on the test set:", accuracy_on_test_set)
print(classification_report(y_test,predictions))

Accuracy on the test set: 0.8741658722592945
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      4271
           1       0.89      0.85      0.87      4121

    accuracy                           0.87      8392
   macro avg       0.88      0.87      0.87      8392
weighted avg       0.87      0.87      0.87      8392



In [37]:
#Logistic Lasso with 5 fold Cross Validation on LSA reduced data (6 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_lasso = LogisticRegression(penalty='l1', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_lasso, x_train_lsa, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Print or inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_lasso = LogisticRegression(penalty='l1', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_lasso.fit(x_train_lsa,y_train)


# Make predictions on a new set of data
predictions = modelLR_lasso.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_lasso.score(x_test_lsa, y_test) 
print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 7.9432823472430805
Best lambda value (regularisation value) selected by cross-validation: 0.1258925411794125
Accuracy on the test set: 0.8727359389895138
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      4271
           1       0.89      0.85      0.87      4121

    accuracy                           0.87      8392
   macro avg       0.87      0.87      0.87      8392
weighted avg       0.87      0.87      0.87      8392



In [38]:
#Logistic Ridge with 5 fold Cross Validation on LSA reduced data (6 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_Ridge = LogisticRegression(penalty='l2', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_Ridge, x_train_lsa, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Print or inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_Ridge = LogisticRegression(penalty='l2', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_Ridge.fit(x_train_lsa,y_train)


# Make predictions on a new set of data
predictions = modelLR_Ridge.predict(x_test_lsa)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_Ridge.score(x_test_lsa, y_test) 

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 79432.82347242822
Best lambda value (regularisation value) selected by cross-validation: 1.2589254117941661e-05
Accuracy on the test set: 0.8740467111534795
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      4271
           1       0.89      0.85      0.87      4121

    accuracy                           0.87      8392
   macro avg       0.88      0.87      0.87      8392
weighted avg       0.87      0.87      0.87      8392

