In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


df_train = pd.read_csv('../resource/Mental-Health-Twitter-Preprocessed/train.csv')
df_val = pd.read_csv('../resource/Mental-Health-Twitter-Preprocessed/val.csv')

# Bow

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

def extract_bow_features(df_train, df_val, text_column='post_text', max_features=5000):
    """
    Transforms train and validation DataFrames using Bag-of-Words (BoW) vectorization.
    
    Parameters:
        df_train (pd.DataFrame): Training set containing a text column.
        df_val (pd.DataFrame): Validation set with the same structure.
        text_column (str): Name of the text column to vectorize.
        max_features (int): Maximum number of features for BoW.
        
    Returns:
        bow_X_train (pd.DataFrame): BoW features for training set.
        bow_X_val (pd.DataFrame): BoW features for validation set.
        vectorizer (CountVectorizer): The fitted vectorizer (for later use on test data).
    """
    vectorizer = CountVectorizer(max_features=max_features)
    bow_train = vectorizer.fit_transform(df_train[text_column])
    bow_val = vectorizer.transform(df_val[text_column])
    
    feature_names = vectorizer.get_feature_names_out()
    
    bow_X_train = pd.DataFrame(bow_train.toarray(), columns=feature_names, index=df_train.index)
    bow_X_val = pd.DataFrame(bow_val.toarray(), columns=feature_names, index=df_val.index)
    
    return bow_X_train, bow_X_val, vectorizer

In [4]:
X_train, X_val, tfidf_vectorizer = extract_bow_features(df_train, df_val)

# Get target labels
y_train = df_train['label']
y_val = df_val['label']

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.76      0.75      1481
           1       0.75      0.73      0.74      1484

    accuracy                           0.74      2965
   macro avg       0.74      0.74      0.74      2965
weighted avg       0.74      0.74      0.74      2965



# Bow & Vader

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

def extract_bow_features(df_train, df_val, text_column='post_text', max_features=5000):
    """
    Transforms train and validation DataFrames using Bag-of-Words (BoW) and VADER sentiment scores.
    
    Parameters:
        df_train (pd.DataFrame): Training set with a text column.
        df_val (pd.DataFrame): Validation set with the same structure.
        text_column (str): Name of the column containing text.
        max_features (int): Maximum number of BoW features.
        
    Returns:
        combined_train (pd.DataFrame): Combined BoW + sentiment features for train.
        combined_val (pd.DataFrame): Combined BoW + sentiment features for val.
        vectorizer (CountVectorizer): Fitted BoW vectorizer.
    """
    # Bag-of-Words (BoW)
    vectorizer = CountVectorizer(max_features=max_features)
    bow_train = vectorizer.fit_transform(df_train[text_column])
    bow_val = vectorizer.transform(df_val[text_column])
    feature_names = vectorizer.get_feature_names_out()
    bow_df_train = pd.DataFrame(bow_train.toarray(), columns=feature_names, index=df_train.index)
    bow_df_val = pd.DataFrame(bow_val.toarray(), columns=feature_names, index=df_val.index)

    # VADER Sentiment (unchanged)
    analyzer = SentimentIntensityAnalyzer()
    
    def get_sentiment_scores(text):
        return pd.Series(analyzer.polarity_scores(text))
    
    vader_train = df_train[text_column].apply(get_sentiment_scores)
    vader_val = df_val[text_column].apply(get_sentiment_scores)

    # Combine BoW and Sentiment
    combined_train = pd.concat([bow_df_train, vader_train], axis=1)
    combined_val = pd.concat([bow_df_val, vader_val], axis=1)

    return combined_train, combined_val, vectorizer

In [6]:
X_train, X_val, bow_vectorizer = extract_bow_features(df_train, df_val)

# Target labels
y_train = df_train['label']
y_val = df_val['label']

# Train and evaluate the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.75      0.74      1481
           1       0.74      0.73      0.74      1484

    accuracy                           0.74      2965
   macro avg       0.74      0.74      0.74      2965
weighted avg       0.74      0.74      0.74      2965



# Bow & Vader (including nemerical)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

def extract_bow_features(df, vectorizer=None):
    """
    Extracts Bag-of-Words (BoW), VADER sentiment, and numeric metadata features from text data.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame containing text and metadata.
        vectorizer (CountVectorizer, optional): Pre-fitted BoW vectorizer. If None, fits a new one.
        
    Returns:
        combined_df (pd.DataFrame): Combined BoW + sentiment + metadata features.
        vectorizer (CountVectorizer): Fitted BoW vectorizer (for reuse on new data).
    """
    # 1. Bag-of-Words (BoW)
    if vectorizer is None:
        vectorizer = CountVectorizer(max_features=5000)
        bow_matrix = vectorizer.fit_transform(df['post_text'])
    else:
        bow_matrix = vectorizer.transform(df['post_text'])
    bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    # 2. VADER sentiment (unchanged)
    analyzer = SentimentIntensityAnalyzer()
    sentiment_scores = df['post_text'].apply(lambda x: pd.Series(analyzer.polarity_scores(x)))
    sentiment_scores.columns = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']

    # 3. Numeric metadata features (unchanged)
    numeric_cols = ['followers', 'friends', 'favourites', 'statuses', 'retweets']
    metadata_df = df[numeric_cols].reset_index(drop=True)

    # 4. Combine all features
    combined_df = pd.concat([bow_df, sentiment_scores.reset_index(drop=True), metadata_df], axis=1)

    return combined_df, vectorizer

In [12]:
# Separate labels
y_train = df_train['label']
y_val = df_val['label']

# Extract features
X_train, vectorizer = extract_bow_features(df_train)
X_val, _ = extract_bow_features(df_val, vectorizer=vectorizer)

# Train and evaluate the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.84      0.86      1481
           1       0.85      0.90      0.87      1484

    accuracy                           0.87      2965
   macro avg       0.87      0.87      0.87      2965
weighted avg       0.87      0.87      0.87      2965



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
