In [8]:
import pandas as pd
import numpy as np

# Read the preprocessed tweets and expose the tweet body under the name `text`,
# which is the column name the rest of the notebook reads.
df = pd.read_csv('Mental-Health-Twitter-Preprocessed.csv').assign(
    text=lambda frame: frame['post_text']
)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a vocabulary capped at 5000 terms and score every text in df['text'].
# NOTE(review): the vectorizer is fitted on all rows, including the ones that
# are later held out by train_test_split; fit it on the training rows only if
# the evaluation must be free of test-set information.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Optional: dense DataFrame view of the scores.
# toarray() materialises every cell of the matrix, so this copy can be large.
# The frame reuses df's index so that label-based operations such as
# pd.concat(axis=1) pair each TF-IDF row with the df row it was computed from,
# even if df does not carry a plain 0..n-1 index.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df.index,
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df.head()

Unnamed: 0,ab,ability,able,about,above,abroad,absolute,absolutely,abuse,abused,...,zenhabits,zero,zimmermann,zink,znati,zone,zquad,zsa,zurab,zyl
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Features are the TF-IDF scores; the target is the `label` column of df.
X, y = tfidf_matrix, df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression on the training rows, then predict the held-out rows.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.74      0.74      1989
           1       0.74      0.74      0.74      1965

    accuracy                           0.74      3954
   macro avg       0.74      0.74      0.74      3954
weighted avg       0.74      0.74      0.74      3954



In [10]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Score names looked up in each polarity_scores() result, listed in the order
# the vader_* columns are added to df.
VADER_KEYS = ['neg', 'neu', 'pos', 'compound']

# Apply sentiment scoring.
# All score dicts are collected first and turned into one DataFrame, instead of
# building a separate pd.Series for every row. Scores are selected by key name,
# so the column mapping does not depend on the dict's key order.
vader_scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in df['text']],
    index=df.index,      # keep the scores attached to the rows they came from
    columns=VADER_KEYS,  # also keeps the columns defined when df has no rows
)
for key in VADER_KEYS:
    df[f'vader_{key}'] = vader_scores[key]

In [20]:
# Combine TF-IDF and VADER features
VADER_COLUMNS = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']

# pd.concat(axis=1) pairs rows by index label, not by position. Both inputs are
# renumbered 0..n-1 first so that row i of the TF-IDF frame always ends up next
# to row i of df; mismatched labels would otherwise produce NaN-padded rows.
assert len(tfidf_df) == len(df), "tfidf_df and df must have the same number of rows"
combined_df = pd.concat(
    [
        tfidf_df.reset_index(drop=True),
        df[VADER_COLUMNS].reset_index(drop=True),
    ],
    axis=1,
)
combined_df.head()

Unnamed: 0,ab,ability,able,about,above,abroad,absolute,absolutely,abuse,abused,...,znati,zone,zquad,zsa,zurab,zyl,vader_neg,vader_neu,vader_pos,vader_compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.243,0.757,0.0,-0.5927
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.672,0.328,0.8316
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.063,0.811,0.126,0.34


In [13]:
from sklearn.model_selection import train_test_split

# Features are the combined TF-IDF + VADER frame; the target is df's `label` column.
X, y = combined_df, df['label']

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit logistic regression on the current training split and predict the test split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 for the test-split predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74      1989
           1       0.73      0.75      0.74      1965

    accuracy                           0.74      3954
   macro avg       0.74      0.74      0.74      3954
weighted avg       0.74      0.74      0.74      3954



In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a vocabulary capped at 5000 terms and count term occurrences per text.
bow_vectorizer = CountVectorizer(max_features=5000)
bow_matrix = bow_vectorizer.fit_transform(df['text'])

# Optional: dense DataFrame view of the counts.
# toarray() materialises every cell of the matrix, so this copy can be large.
# The frame reuses df's index so that any label-based join or concat with df
# pairs each count row with the df row it was computed from.
bow_df = pd.DataFrame(
    bow_matrix.toarray(),
    index=df.index,
    columns=bow_vectorizer.get_feature_names_out(),
)
bow_df.head()

Unnamed: 0,ab,ability,able,about,above,abroad,absolute,absolutely,abuse,abused,...,zenhabits,zero,zimmermann,zink,znati,zone,zquad,zsa,zurab,zyl
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Features are the bag-of-words counts; the target is the `label` column of df.
X, y = bow_matrix, df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression on the training rows, then predict the held-out rows.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.76      0.75      1989
           1       0.75      0.73      0.74      1965

    accuracy                           0.74      3954
   macro avg       0.74      0.74      0.74      3954
weighted avg       0.74      0.74      0.74      3954



In [15]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): this rebinds `model`, the name earlier cells use for the
# LogisticRegression classifier. The name is kept so existing references keep
# working; consider a distinct name once nothing depends on it.
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference mode

# Number of texts sent through BERT per forward pass; lower it if memory is tight.
BERT_BATCH_SIZE = 32


def get_bert_embedding(text):
    """Return the [CLS] vector of BERT's last hidden state for ``text``.

    ``text`` is handed to the tokenizer unchanged. The first-token slice of
    ``last_hidden_state`` is squeezed and returned as a NumPy array, so a single
    string yields a 1-D vector.
    """
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return cls_embedding.squeeze().numpy()


def _embed_in_batches(texts, batch_size=BERT_BATCH_SIZE):
    """Embed a list of texts in batches and return one [CLS] row per text.

    Running one forward pass per text is very slow on a full dataset, so the
    texts are tokenized and encoded ``batch_size`` at a time. Each batch is
    padded to its longest member; results should match the per-text ones up to
    floating-point noise (assumes the tokenizer's attention mask hides the
    padding - TODO confirm on a small sample).
    """
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True)
            outputs = model(**inputs)
            # No squeeze here: a batch of one must stay 2-D so vstack works.
            chunks.append(outputs.last_hidden_state[:, 0, :].numpy())
    if not chunks:
        # No texts at all: return a matrix with zero rows instead of letting
        # np.vstack fail on an empty list.
        return np.empty((0, model.config.hidden_size))
    return np.vstack(chunks)


# Convert to matrix: one row per text, in df's row order.
bert_matrix = _embed_in_batches(df['text'].tolist())

# Keep the per-row vectors on df as before (one 1-D array per cell).
df['bert_vector'] = pd.Series(list(bert_matrix), index=df.index)

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


KeyboardInterrupt: 