In [37]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

path_name = 'your_path_name_here/mental_health.csv'
df=pd.read_csv(path_name)
print(df)

df = df.drop_duplicates()

#REDUCED SAMPLING
import random
random.seed(1)
size = list(df.index)
sample = random.sample(size,1000)
df = df.loc[sample]


#Split the matrix into predictors and response 
msg=df["text"]
msg=msg.str.replace('[^a-zA-Z0-9]+'," h", regex = True) 
y=df['label'].values
y 


#Stemming and Tokenising
stemmer=PorterStemmer()
msg=msg.apply(lambda line:[stemmer.stem(token.lower()) for token in word_tokenize(line)]).apply(lambda token:" ".join(token))
msg=msg.apply(lambda line:[token for token in word_tokenize(line) if len(token)>2]).apply(lambda y:" ".join(y))

#Vectorising
tf=TfidfVectorizer() 
data_vec=tf.fit_transform(msg)
print(data_vec.shape)

#Train, Validation, and Test Split ###RANDOMSTATE=1###
from sklearn.model_selection import train_test_split
#test_size=0.3 implies 70% will be used to training and 30% for testing. 
#random_state sets seed for random number generator, ensuring reducibility
x_train,x_test,y_train,y_test=train_test_split(data_vec,y,test_size=0.3,random_state=1)


                                                    text  label
0      dear american teens question dutch person hear...      0
1      nothing look forward lifei dont many reasons k...      1
2      music recommendations im looking expand playli...      0
3      im done trying feel betterthe reason im still ...      1
4      worried  year old girl subject domestic physic...      1
...                                                  ...    ...
27972  posting everyday people stop caring  religion ...      0
27973  okay definetly need hear guys opinion ive pret...      0
27974  cant get dog think ill kill myselfthe last thi...      1
27975  whats point princess bridei really think like ...      1
27976  got nudes person might might know snapchat do ...      0

[27977 rows x 2 columns]
513
(1000, 7779)


In [38]:
x_train_dense = x_train.todense()
x_test_dense = x_test.todense()

In [39]:
#PCA #2 Components


pca = PCA(n_components = 2)
x_train_pca = pca.fit_transform(x_train_dense)

modelLR = LogisticRegression(penalty='none')

modelLR.fit(x_train_pca,y_train)

# Transform the test set as well 
x_test_pca = pca.transform(x_test_dense)

# Make predictions on a new set of data
predictions = modelLR.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR.score(x_test_pca, y_test) #accuracy = # of correct predictions / total # of predictions
print("Accuracy on the test set:", accuracy_on_test_set)
print(classification_report(y_test,predictions))



Accuracy on the test set: 0.8233333333333334
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       152
           1       0.85      0.78      0.81       148

    accuracy                           0.82       300
   macro avg       0.83      0.82      0.82       300
weighted avg       0.83      0.82      0.82       300





In [40]:
#Logistic Lasso with 5 fold Cross Validation on PCA reduced data (2 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_lasso = LogisticRegression(penalty='l1', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_lasso, x_train_pca, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_lasso = LogisticRegression(penalty='l1', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_lasso.fit(x_train_pca,y_train)


# Make predictions on a new set of data
predictions = modelLR_lasso.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_lasso.score(x_test_pca, y_test) #accuaracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 0.39810717055351486
Best lambda value (regularisation value) selected by cross-validation: 2.511886431509469
Accuracy on the test set: 0.8266666666666667
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       152
           1       0.87      0.76      0.81       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300



In [41]:
#Logistic Ridge with 5 fold Cross Validation on PCA reduced data (2 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_Ridge = LogisticRegression(penalty='l2', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_Ridge, x_train_pca, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_Ridge = LogisticRegression(penalty='l2', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_Ridge.fit(x_train_pca,y_train)


# Make predictions on a new set of data
predictions = modelLR_Ridge.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_Ridge.score(x_test_pca, y_test) #accuaracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 2.5118864315096747
Best lambda value (regularisation value) selected by cross-validation: 0.39810717055348227
Accuracy on the test set: 0.8266666666666667
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       152
           1       0.87      0.76      0.81       148

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300



In [42]:
#PCA #4 Components

pca = PCA(n_components = 4)
x_train_pca = pca.fit_transform(x_train_dense)

modelLR = LogisticRegression(penalty='none')

modelLR.fit(x_train_pca,y_train)

# Transform the test set as well 
x_test_pca = pca.transform(x_test_dense)

# Make predictions on a new set of data
predictions = modelLR.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR.score(x_test_pca, y_test) #accuracy = # of correct predictions / total # of predictions
print("Accuracy on the test set:", accuracy_on_test_set)
print(classification_report(y_test,predictions))



Accuracy on the test set: 0.8466666666666667
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       152
           1       0.89      0.79      0.84       148

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300





In [43]:
#Logistic Lasso with 5 fold Cross Validation on PCA reduced data (4 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_lasso = LogisticRegression(penalty='l1', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_lasso, x_train_pca, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_lasso = LogisticRegression(penalty='l1', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_lasso.fit(x_train_pca,y_train)


# Make predictions on a new set of data
predictions = modelLR_lasso.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_lasso.score(x_test_pca, y_test) #accuaracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 1.2589254117942177
Best lambda value (regularisation value) selected by cross-validation: 0.7943282347242496
Accuracy on the test set: 0.8466666666666667
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       152
           1       0.89      0.79      0.84       148

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



In [44]:
#Logistic Ridge with 5 fold Cross Validation on PCA reduced data (4 components)


alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_Ridge = LogisticRegression(penalty='l2', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_Ridge, x_train_pca, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_Ridge = LogisticRegression(penalty='l2', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_Ridge.fit(x_train_pca,y_train)


# Make predictions on a new set of data
predictions = modelLR_Ridge.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_Ridge.score(x_test_pca, y_test) #accuaracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 2.5118864315096747
Best lambda value (regularisation value) selected by cross-validation: 0.39810717055348227
Accuracy on the test set: 0.8466666666666667
              precision    recall  f1-score   support

           0       0.81      0.91      0.86       152
           1       0.90      0.78      0.83       148

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



In [45]:
#PCA #6 Components

pca = PCA(n_components = 6)
x_train_pca = pca.fit_transform(x_train_dense)

modelLR = LogisticRegression(penalty='none')

modelLR.fit(x_train_pca,y_train)

# Transform the test set as well 
x_test_pca = pca.transform(x_test_dense)

# Make predictions on a new set of data
predictions = modelLR.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR.score(x_test_pca, y_test) #accuracy = # of correct predictions / total # of predictions
print("Accuracy on the test set:", accuracy_on_test_set)
print(classification_report(y_test,predictions))



Accuracy on the test set: 0.87
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       152
           1       0.91      0.82      0.86       148

    accuracy                           0.87       300
   macro avg       0.87      0.87      0.87       300
weighted avg       0.87      0.87      0.87       300





In [46]:
#Logistic Lasso with 5 fold Cross Validation on PCA reduced data (6 components)

alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_lasso = LogisticRegression(penalty='l1', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_lasso, x_train_pca, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Print or inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_lasso = LogisticRegression(penalty='l1', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_lasso.fit(x_train_pca,y_train)


# Make predictions on a new set of data
predictions = modelLR_lasso.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_lasso.score(x_test_pca, y_test) #accuaracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 158.48931924611497
Best lambda value (regularisation value) selected by cross-validation: 0.006309573444801788
Accuracy on the test set: 0.87
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       152
           1       0.91      0.82      0.86       148

    accuracy                           0.87       300
   macro avg       0.87      0.87      0.87       300
weighted avg       0.87      0.87      0.87       300



In [47]:
#Logistic Ridge with 5 fold Cross Validation on PCA reduced data (6 components)


alphas = 10.0 ** np.arange(-5, 5, 0.1)

cv_scores = [] 

# Perform cross-validation for each value of C
for alpha in alphas:
    modelLR_Ridge = LogisticRegression(penalty='l2', C = 1/alpha, random_state=1, solver='liblinear')
    scores = cross_val_score(modelLR_Ridge, x_train_pca, y_train, cv = 5)  # 5-fold cross-validation
    cv_scores.append(np.mean(scores))


# Find the index corresponding to the best C value
best_CV_index = np.argmax(cv_scores)
best_alpha = alphas[best_CV_index]

# Inspect the best C value
print("Best C value selected by cross-validation:", 1/best_alpha)
print("Best lambda value (regularisation value) selected by cross-validation:", best_alpha)

modelLR_Ridge = LogisticRegression(penalty='l2', C=1/best_alpha, random_state=1, solver = 'liblinear')

modelLR_Ridge.fit(x_train_pca,y_train)


# Make predictions on a new set of data
predictions = modelLR_Ridge.predict(x_test_pca)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_Ridge.score(x_test_pca, y_test) #accuaracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Best C value selected by cross-validation: 1000.0000000000165
Best lambda value (regularisation value) selected by cross-validation: 0.0009999999999999835
Accuracy on the test set: 0.87
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       152
           1       0.91      0.82      0.86       148

    accuracy                           0.87       300
   macro avg       0.87      0.87      0.87       300
weighted avg       0.87      0.87      0.87       300

