In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle

# Load the dataset
df = pd.read_csv('data/mental_health.csv')

# Shuffle the dataset to ensure randomness
df = shuffle(df, random_state=42)

# Extract the text and label columns
X = df['text']
y = df['label']

# Split the data into training (80%) and testing (20%) sets, ensuring class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42, stratify=y)

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency

# Fit the TF-IDF vectorizer on the training data only
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the test data using the fitted vectorizer
X_test_tfidf = tfidf.transform(X_test)

# Initialize the logistic regression model
logreg = LogisticRegression(max_iter=1000)

# Train the model on the training data
logreg.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = logreg.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9210150107219442

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.92      2828
           1       0.94      0.90      0.92      2768

    accuracy                           0.92      5596
   macro avg       0.92      0.92      0.92      5596
weighted avg       0.92      0.92      0.92      5596



In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle

# Load the dataset
df = pd.read_csv('data/mental_health.csv')

# Shuffle the dataset to ensure randomness
df = shuffle(df, random_state=42)

# Extract the text and label columns
X = df['text']
y = df['label']

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency

# Transform the text data using TF-IDF
X_tfidf = tfidf.fit_transform(X)

# Split the data into training (1%) and testing (99%) sets, ensuring class balance
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, train_size=0.8, random_state=42, stratify=y)

# Initialize the SVM classifier
svm = SVC(kernel='linear', random_state=42)

# Train the model on the training data
svm.fit(X_train, y_train)

# Make predictions on the test data
y_pred_svm = svm.predict(X_test)

# Evaluate the model
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nSVM Classification Report:\n", classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.92012151536812

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.92      2828
           1       0.93      0.90      0.92      2768

    accuracy                           0.92      5596
   macro avg       0.92      0.92      0.92      5596
weighted avg       0.92      0.92      0.92      5596



In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle
from gensim.models import Word2Vec
import numpy as np

# Load the dataset
df = pd.read_csv('data/mental_health.csv')

# Shuffle the dataset to ensure randomness
df = shuffle(df, random_state=42)

# Extract the text and label columns
X = df['text']
y = df['label']

# Preprocess the text: split each sentence into tokens (words)
X_tokens = X.apply(lambda x: x.split())

# Split the data into training (1%) and testing (99%) sets, ensuring class balance
X_train_tokens, X_test_tokens, y_train, y_test = train_test_split(X_tokens, y, test_size=0.99, train_size=0.01, random_state=42, stratify=y)

# Train a Word2Vec model using skip-gram (sg=1) only on the training data
w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, sg=1, min_count=1, workers=4, seed=42)

# Create a function to get the average Word2Vec vectors for a given text
def text_to_w2v(text_tokens, model):
    word_vectors = [model.wv[word] for word in text_tokens if word in model.wv]
    if len(word_vectors) == 0:  # In case no word in the sentence is in the model vocabulary
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Transform the training data into Word2Vec vectors
X_train_w2v = np.array([text_to_w2v(text, w2v_model) for text in X_train_tokens])

# Transform the test data into Word2Vec vectors (using the model trained on the training data)
X_test_w2v = np.array([text_to_w2v(text, w2v_model) for text in X_test_tokens])

# Initialize the MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=5000, random_state=42)

# Train the model on the training data
mlp.fit(X_train_w2v, y_train)

# Make predictions on the test data
y_pred_mlp = mlp.predict(X_test_w2v)

# Evaluate the model
print("MLP Accuracy with Word2Vec:", accuracy_score(y_test, y_pred_mlp))
print("\nMLP Classification Report with Word2Vec:\n", classification_report(y_test, y_pred_mlp))

MLP Accuracy with Word2Vec: 0.6861506245938335

MLP Classification Report with Word2Vec:
               precision    recall  f1-score   support

           0       0.86      0.46      0.60     13998
           1       0.62      0.92      0.74     13700

    accuracy                           0.69     27698
   macro avg       0.74      0.69      0.67     27698
weighted avg       0.74      0.69      0.67     27698



In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle

# Load the dataset
df = pd.read_csv('data/mental_health.csv')

# Shuffle the dataset to ensure randomness
df = shuffle(df, random_state=42)

# Extract the text and label columns
X = df['text']
y = df['label']

# Split the data into training (80%) and testing (20%) sets, ensuring class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, train_size=0.01, random_state=42, stratify=y)

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency

# Fit the TF-IDF vectorizer on the training data only
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the test data using the fitted vectorizer
X_test_tfidf = tfidf.transform(X_test)

# Initialize the logistic regression model
logreg = LogisticRegression(max_iter=1000)

# Train the model on the training data
logreg.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = logreg.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8594844393096974

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.86     13998
           1       0.87      0.84      0.86     13700

    accuracy                           0.86     27698
   macro avg       0.86      0.86      0.86     27698
weighted avg       0.86      0.86      0.86     27698

