In [1]:
import pandas as pd

# Read the preprocessed tweets and expose the post body under a 'text' column,
# which is the column name the later feature-extraction cells read.
df = pd.read_csv('Mental-Health-Twitter-Preprocessed.csv').assign(
    text=lambda posts: posts['post_text']
)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the post text, with the vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fit on every row of df, i.e. before any
# train/test split, so held-out text contributes to the vocabulary and IDF
# weights. Consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Optional: dense DataFrame view, one column per vocabulary term.
# The frame reuses df's index so that a later index-aligned pd.concat(axis=1)
# with columns of df pairs each TF-IDF row with the post it came from, even
# when df does not carry a default 0..n-1 index.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df.index,
    columns=vectorizer.get_feature_names_out(),
)

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Score every post once and collect the resulting dicts into a single frame.
# Building one DataFrame from a list of dicts avoids constructing a pd.Series
# per row (the slow part of the apply-based version). Naming the score keys
# explicitly fixes the column order instead of relying on the order in which
# polarity_scores happens to emit them, and keeps the columns defined when df
# is empty.
vader_scores = pd.DataFrame(
    df['text'].map(analyzer.polarity_scores).tolist(),
    columns=['neg', 'neu', 'pos', 'compound'],
    index=df.index,
)

# Attach the four scores to df under vader_-prefixed names
# (neg -> vader_neg, neu -> vader_neu, pos -> vader_pos, compound -> vader_compound).
df[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']] = vader_scores[
    ['neg', 'neu', 'pos', 'compound']
]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: raw term counts, vocabulary capped at 5000 terms.
bow_vectorizer = CountVectorizer(max_features=5000)
bow_matrix = bow_vectorizer.fit_transform(df['text'])

# Optional: dense DataFrame view of the count matrix, one column per term.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)
bow_df

Unnamed: 0,ab,ability,able,about,above,abroad,absolute,absolutely,abuse,abused,...,zenhabits,zero,zimmermann,zink,znati,zone,zquad,zsa,zurab,zyl
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19761,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19762,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19764,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Build the model's feature table: the TF-IDF term columns followed by the
# four VADER sentiment columns, joined side by side on the row index.
combined_df = pd.concat(
    objs=[
        tfidf_df,
        df.loc[:, ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']],
    ],
    axis='columns',
)
combined_df

Unnamed: 0,ab,ability,able,about,above,abroad,absolute,absolutely,abuse,abused,...,znati,zone,zquad,zsa,zurab,zyl,vader_neg,vader_neu,vader_pos,vader_compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,1.000,0.000,0.0000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,1.000,0.000,0.0000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.243,0.757,0.000,-0.5927
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.672,0.328,0.8316
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.063,0.811,0.126,0.3400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.487,0.513,0.000,-0.5773
19762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.302,0.698,0.000,-0.6369
19763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,1.000,0.000,0.0000
19764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.860,0.140,0.2382


In [6]:
from sklearn.model_selection import train_test_split

# Features are the combined TF-IDF + VADER table; the target is df's 'label' column.
X, y = combined_df, df['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the TF-IDF vectorizer was fit on all rows before this split, so
# the held-out rows influenced the features. Consider fitting it on the
# training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression classifier on the training split (solver capped at
# 1000 iterations), then predict labels for the held-out split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Print per-class precision / recall / F1 for the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74      1989
           1       0.73      0.75      0.74      1965

    accuracy                           0.74      3954
   macro avg       0.74      0.74      0.74      3954
weighted avg       0.74      0.74      0.74      3954

