In [21]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Single place to repoint the notebook at a different copy of the dataset.
DATA_DIR = '../resource/Mental-Health-Twitter-Preprocessed'

# Load the training and validation splits.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_val = pd.read_csv(f'{DATA_DIR}/val.csv')

# Fail fast if the columns used by every experiment below are missing,
# instead of hitting a KeyError deep inside feature extraction.
assert {'post_text', 'label'} <= set(df_train.columns), "train.csv must contain 'post_text' and 'label'"
assert {'post_text', 'label'} <= set(df_val.columns), "val.csv must contain 'post_text' and 'label'"

# TF-IDF

In [14]:
def extract_tfidf_features(df_train, df_val, text_column='post_text', max_features=5000):
    """
    Transforms train and validation DataFrames using TF-IDF vectorization.

    The vectorizer is fitted on the training text only and then applied to
    the validation text, so no validation vocabulary leaks into the features.
    Missing text values are treated as empty documents.

    Parameters:
        df_train (pd.DataFrame): Training set containing the `text_column` column.
        df_val (pd.DataFrame): Validation set with the same structure.
        text_column (str): Name of the text column to vectorize.
        max_features (int): Maximum number of features for TF-IDF.

    Returns:
        tfidf_X_train (pd.DataFrame): Dense TF-IDF features for the training set,
            indexed like `df_train`.
        tfidf_X_val (pd.DataFrame): Dense TF-IDF features for the validation set,
            indexed like `df_val`.
        vectorizer (TfidfVectorizer): The fitted vectorizer (for later use on test data).
    """
    # A missing text value (e.g. an empty CSV field, which pandas reads as NaN)
    # makes TfidfVectorizer raise; treat it as an empty document instead.
    train_text = df_train[text_column].fillna('')
    val_text = df_val[text_column].fillna('')

    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_train = vectorizer.fit_transform(train_text)
    tfidf_val = vectorizer.transform(val_text)

    feature_names = vectorizer.get_feature_names_out()

    # NOTE: .toarray() densifies the sparse matrices; memory grows as
    # n_rows * n_features. The DataFrame return type is kept for callers.
    tfidf_X_train = pd.DataFrame(tfidf_train.toarray(), columns=feature_names, index=df_train.index)
    tfidf_X_val = pd.DataFrame(tfidf_val.toarray(), columns=feature_names, index=df_val.index)

    return tfidf_X_train, tfidf_X_val, vectorizer


In [15]:
# Build the TF-IDF feature matrices for both splits
X_train, X_val, tfidf_vectorizer = extract_tfidf_features(df_train, df_val)

# Target labels for each split
y_train, y_val = df_train['label'], df_val['label']

# Fit the logistic-regression baseline (fit returns the estimator itself)
# and predict on the validation split
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_val)

# Per-class precision / recall / F1 on the validation split
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.74      0.74      1481
           1       0.74      0.74      0.74      1484

    accuracy                           0.74      2965
   macro avg       0.74      0.74      0.74      2965
weighted avg       0.74      0.74      0.74      2965



# TF-IDF & Vader

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

def extract_tfidf_features(df_train, df_val, text_column='post_text', max_features=5000):
    """
    Transforms train and validation DataFrames using TF-IDF and VADER sentiment scores.

    The TF-IDF vectorizer is fitted on the training text only. VADER scores are
    computed per document and appended as extra columns. Missing text values
    are treated as empty documents.

    Parameters:
        df_train (pd.DataFrame): Training set with a text column.
        df_val (pd.DataFrame): Validation set with the same structure.
        text_column (str): Name of the column containing text.
        max_features (int): Maximum number of TF-IDF features.

    Returns:
        combined_train (pd.DataFrame): Combined TF-IDF + sentiment features for train,
            indexed like `df_train`.
        combined_val (pd.DataFrame): Combined TF-IDF + sentiment features for val,
            indexed like `df_val`.
        vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer.
    """
    # A missing text value (e.g. an empty CSV field, which pandas reads as NaN)
    # breaks both TfidfVectorizer and VADER; treat it as an empty document.
    train_text = df_train[text_column].fillna('')
    val_text = df_val[text_column].fillna('')

    # TF-IDF
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_train = vectorizer.fit_transform(train_text)
    tfidf_val = vectorizer.transform(val_text)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_df_train = pd.DataFrame(tfidf_train.toarray(), columns=feature_names, index=df_train.index)
    tfidf_df_val = pd.DataFrame(tfidf_val.toarray(), columns=feature_names, index=df_val.index)

    # VADER Sentiment
    analyzer = SentimentIntensityAnalyzer()

    def get_sentiment_scores(texts):
        # Build the frame once from a list of score dicts instead of creating
        # a pd.Series per row, which is much slower on large frames.
        return pd.DataFrame(
            [analyzer.polarity_scores(text) for text in texts],
            index=texts.index,
        )

    vader_train = get_sentiment_scores(train_text)
    vader_val = get_sentiment_scores(val_text)

    # Combine TF-IDF and Sentiment
    # NOTE(review): the sentiment columns keep VADER's own key names; if the
    # TF-IDF vocabulary contains a token with the same name, the combined frame
    # will carry duplicate column labels.
    combined_train = pd.concat([tfidf_df_train, vader_train], axis=1)
    combined_val = pd.concat([tfidf_df_val, vader_val], axis=1)

    return combined_train, combined_val, vectorizer


In [17]:
# Feature matrices: TF-IDF columns plus VADER sentiment scores
X_train, X_val, tfidf_vectorizer = extract_tfidf_features(df_train, df_val)

# Target labels
y_train, y_val = df_train['label'], df_val['label']

# Train the classifier, then score the validation split
model = LogisticRegression(max_iter=1000)
y_pred = model.fit(X_train, y_train).predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1481
           1       0.73      0.74      0.73      1484

    accuracy                           0.73      2965
   macro avg       0.73      0.73      0.73      2965
weighted avg       0.73      0.73      0.73      2965



# TF-IDF & Vader (including numerical metadata)

In [18]:
def extract_tfidf_features(df, vectorizer=None, text_column='post_text', max_features=5000,
                           numeric_cols=None):
    """
    Build TF-IDF, VADER sentiment and numeric metadata features for one DataFrame.

    Call once without `vectorizer` on the training split to fit a new
    TfidfVectorizer, then pass the returned vectorizer when transforming the
    validation/test split so both share the same vocabulary.

    Note: this definition replaces the two-frame
    `extract_tfidf_features(df_train, df_val, ...)` versions defined earlier in
    the notebook; re-run those cells before calling with the older signature.

    Parameters:
        df (pd.DataFrame): Frame containing the text column and the numeric columns.
        vectorizer (TfidfVectorizer | None): Already-fitted vectorizer to reuse.
            When None, a new one is created and fitted on `df`.
        text_column (str): Name of the column containing text.
        max_features (int): Maximum number of TF-IDF features; only used when a
            new vectorizer is fitted.
        numeric_cols (list[str] | None): Metadata columns to append unchanged.
            Defaults to followers, friends, favourites, statuses, retweets.

    Returns:
        combined_df (pd.DataFrame): TF-IDF columns, then `vader_*` columns, then
            the numeric columns, on a fresh 0..n-1 index.
        vectorizer (TfidfVectorizer): The fitted (or reused) vectorizer.
    """
    if numeric_cols is None:
        numeric_cols = ['followers', 'friends', 'favourites', 'statuses', 'retweets']

    # A missing text value (e.g. an empty CSV field, which pandas reads as NaN)
    # breaks both TfidfVectorizer and VADER; treat it as an empty document.
    texts = df[text_column].fillna('')

    # 1. TF-IDF
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=max_features)
        tfidf_matrix = vectorizer.fit_transform(texts)
    else:
        tfidf_matrix = vectorizer.transform(texts)
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    # 2. VADER sentiment
    # Build the frame once from a list of score dicts (no per-row pd.Series),
    # and pick columns by key so the vader_* names never depend on dict order.
    analyzer = SentimentIntensityAnalyzer()
    sentiment_scores = pd.DataFrame(
        [analyzer.polarity_scores(text) for text in texts],
        columns=['neg', 'neu', 'pos', 'compound'],
    ).add_prefix('vader_')

    # 3. Numeric metadata features (index dropped to line up with the frames above)
    metadata_df = df[list(numeric_cols)].reset_index(drop=True)

    # 4. Combine all features
    combined_df = pd.concat([tfidf_df, sentiment_scores, metadata_df], axis=1)

    return combined_df, vectorizer


In [20]:
# Separate labels
y_train = df_train['label']
y_val = df_val['label']

# Extract features
X_train, vectorizer = extract_tfidf_features(df_train)
X_val, _ = extract_tfidf_features(df_val, vectorizer=vectorizer)

# Train and evaluate the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.82      0.86      1481
           1       0.83      0.90      0.87      1484

    accuracy                           0.86      2965
   macro avg       0.86      0.86      0.86      2965
weighted avg       0.86      0.86      0.86      2965



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
