In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer 

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score

from scipy.stats import uniform 
from scipy.stats import loguniform 
from sklearn.model_selection import RandomizedSearchCV

path_name = 'your_path_name_here/mental_health.csv'
df=pd.read_csv(path_name)
print(df)

df = df.drop_duplicates()

#Split the matrix into predictors and response 
msg=df["text"]
msg=msg.str.replace('[^a-zA-Z0-9]+'," h", regex = True) 
y=df['label'].values
y 


#Stemming and Tokenising
stemmer=PorterStemmer()
msg=msg.apply(lambda line:[stemmer.stem(token.lower()) for token in word_tokenize(line)]).apply(lambda token:" ".join(token))
msg=msg.apply(lambda line:[token for token in word_tokenize(line) if len(token)>2]).apply(lambda y:" ".join(y))

#Vectorising
tf=TfidfVectorizer() 
data_vec=tf.fit_transform(msg)

#Train, Validation, and Test Split ###RANDOMSTATE=1###
from sklearn.model_selection import train_test_split
#test_size=0.3 implies 70% will be used to training and 30% for testing. 
#random_state sets seed for random number generator, ensuring reducibility
x_train,x_test,y_train,y_test=train_test_split(data_vec,y,test_size=0.3,random_state=1)

                                                    text  label
0      dear american teens question dutch person hear...      0
1      nothing look forward lifei dont many reasons k...      1
2      music recommendations im looking expand playli...      0
3      im done trying feel betterthe reason im still ...      1
4      worried  year old girl subject domestic physic...      1
...                                                  ...    ...
27972  posting everyday people stop caring  religion ...      0
27973  okay definetly need hear guys opinion ive pret...      0
27974  cant get dog think ill kill myselfthe last thi...      1
27975  whats point princess bridei really think like ...      1
27976  got nudes person might might know snapchat do ...      0

[27977 rows x 2 columns]


In [4]:
#Elastic Net with 5 fold Cross Validation for Hyperparameter Tuning for 200 parameter suggestions

param_space = [
  {'C': loguniform(1e-5,100), 
   'l1_ratio': uniform(0,1)},
 ]

modelLR_ElasticNet = LogisticRegression(penalty='elasticnet', random_state=1, solver='saga', max_iter=1000, warm_start=True)



cv_randomsearch = RandomizedSearchCV(modelLR_ElasticNet, param_space, n_iter=200, scoring='accuracy', cv = 5, random_state=1, verbose=3)
results = cv_randomsearch.fit(x_train,y_train)


print("Best Accuracy Score:", results.best_score_)
print("Best Hyperparameters:", results.best_params_)

best_parameters = results.best_params_
best_C = best_parameters['C']
best_ratio = best_parameters['l1_ratio']

modelLR_ElasticNet = LogisticRegression(penalty='elasticnet', C = best_C, l1_ratio = best_ratio, random_state=1, solver='saga')

modelLR_ElasticNet.fit(x_train,y_train)

# Make predictions on a new set of data
predictions = modelLR_ElasticNet.predict(x_test)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelLR_ElasticNet.score(x_test, y_test) #accuracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV 1/5] END C=0.008301451461243866, l1_ratio=0.7203244934421581;, score=0.750 total time=   1.6s
[CV 2/5] END C=0.008301451461243866, l1_ratio=0.7203244934421581;, score=0.739 total time=   1.1s
[CV 3/5] END C=0.008301451461243866, l1_ratio=0.7203244934421581;, score=0.743 total time=   0.8s
[CV 4/5] END C=0.008301451461243866, l1_ratio=0.7203244934421581;, score=0.734 total time=   0.9s
[CV 5/5] END C=0.008301451461243866, l1_ratio=0.7203244934421581;, score=0.739 total time=   0.7s
[CV 1/5] END C=1.0018452045446539e-05, l1_ratio=0.30233257263183977;, score=0.504 total time=   0.3s
[CV 2/5] END C=1.0018452045446539e-05, l1_ratio=0.30233257263183977;, score=0.504 total time=   0.3s
[CV 3/5] END C=1.0018452045446539e-05, l1_ratio=0.30233257263183977;, score=0.504 total time=   0.3s
[CV 4/5] END C=1.0018452045446539e-05, l1_ratio=0.30233257263183977;, score=0.504 total time=   0.3s
[CV 5/5] END C=1.0018452045446539e-05, l1_