In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import uniform 
from scipy.stats import loguniform 


path_name = 'your_path_name_here/mental_health.csv'
df=pd.read_csv(path_name)
print(df)

df = df.drop_duplicates()

#REDUCED SAMPLING
import random
size = list(df.index)
sample = random.sample(size,1000)
df = df.loc[sample]

#Split the matrix into predictors and response 
msg=df["text"]
msg=msg.str.replace('[^a-zA-Z0-9]+'," h", regex = True) 
y=df['label'].values
y 


#Stemming and Tokenising
stemmer=PorterStemmer()
msg=msg.apply(lambda line:[stemmer.stem(token.lower()) for token in word_tokenize(line)]).apply(lambda token:" ".join(token))
msg=msg.apply(lambda line:[token for token in word_tokenize(line) if len(token)>2]).apply(lambda y:" ".join(y))

#Vectorising
tf=TfidfVectorizer() 
data_vec=tf.fit_transform(msg)

#Train, Validation, and Test Split ###RANDOMSTATE=1###
from sklearn.model_selection import train_test_split
#test_size=0.3 implies 70% will be used to training and 30% for testing. 
#random_state sets seed for random number generator, ensuring reducibility
x_train,x_test,y_train,y_test=train_test_split(data_vec,y,test_size=0.3,random_state=1)

                                                    text  label
0      dear american teens question dutch person hear...      0
1      nothing look forward lifei dont many reasons k...      1
2      music recommendations im looking expand playli...      0
3      im done trying feel betterthe reason im still ...      1
4      worried  year old girl subject domestic physic...      1
...                                                  ...    ...
27972  posting everyday people stop caring  religion ...      0
27973  okay definetly need hear guys opinion ive pret...      0
27974  cant get dog think ill kill myselfthe last thi...      1
27975  whats point princess bridei really think like ...      1
27976  got nudes person might might know snapchat do ...      0

[27977 rows x 2 columns]


In [4]:
#RBF SVM with 5 fold CV

param_space = [
  {'C': loguniform(1e-5, 100),
   'gamma': [0.1, 1, 'scale', 'auto']},
 ]

modelSVM_rbf = SVC(kernel = 'rbf', random_state=1, max_iter=-1)



cv_randomsearch = RandomizedSearchCV(modelSVM_rbf, param_space, n_iter=200, scoring='accuracy', cv = 5, random_state=1, verbose=3)
results = cv_randomsearch.fit(x_train,y_train)


print("Best Accuracy Score:", results.best_score_)
print("Best Hyperparameters:", results.best_params_)

best_parameters = results.best_params_
print(best_parameters)
best_C = best_parameters['C']
best_gamma = best_parameters['gamma']


modelSVM_rbf = SVC(kernel = 'rbf', C = best_C, gamma= best_gamma, random_state=1, max_iter=-1)

modelSVM_rbf.fit(x_train,y_train)

# Make predictions on a new set of data
predictions = modelSVM_rbf.predict(x_test)

# Compare the predicted labels to the true labels
accuracy_on_test_set = modelSVM_rbf.score(x_test, y_test) #accuracy = # of correct predictions / total # of predictions

print("Accuracy on the test set:", accuracy_on_test_set)

print(classification_report(y_test,predictions))

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV 1/5] END .C=0.008301451461243866, gamma=0.1;, score=0.543 total time=   0.3s
[CV 2/5] END .C=0.008301451461243866, gamma=0.1;, score=0.543 total time=   0.2s
[CV 3/5] END .C=0.008301451461243866, gamma=0.1;, score=0.543 total time=   0.2s
[CV 4/5] END .C=0.008301451461243866, gamma=0.1;, score=0.543 total time=   0.2s
[CV 5/5] END .C=0.008301451461243866, gamma=0.1;, score=0.536 total time=   0.2s
[CV 1/5] END ......C=33.72108309441093, gamma=1;, score=0.879 total time=   0.2s
[CV 2/5] END ......C=33.72108309441093, gamma=1;, score=0.836 total time=   0.2s
[CV 3/5] END ......C=33.72108309441093, gamma=1;, score=0.879 total time=   0.2s
[CV 4/5] END ......C=33.72108309441093, gamma=1;, score=0.843 total time=   0.2s
[CV 5/5] END ......C=33.72108309441093, gamma=1;, score=0.907 total time=   0.2s
[CV 1/5] END C=0.0013071577689307433, gamma=auto;, score=0.543 total time=   0.2s
[CV 2/5] END C=0.0013071577689307433, gamma=