In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, classification_report
import torch
import gc
import warnings

warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


### Load data

In [2]:
# Load CSV
df = pd.read_csv('../data/cleaned_for_tfidf.csv')
print(f"Loaded {len(df):,} rows\n")
df.head(3)

Loaded 22,624,379 rows



Unnamed: 0,review_id,user_id,rating,text,gmap_id,food_quality_positive,food_quality_negative,food_quality_neutral,service_positive,service_negative,...,price_value_positive,price_value_negative,price_value_neutral,cleanliness_positive,cleanliness_negative,cleanliness_neutral,atmosphere_positive,atmosphere_negative,atmosphere_neutral,text_baseline
0,456e420929727f933dbaed63eff45cde53c7b92438cf0d...,1.067134e+20,5.0,"Easy process, extremely friendly, helpful staf...",0x80960c29f2e3bf29:0x4b291f0d275a5699,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,easy process extremely friendly helpful staff ...
1,ea2ad448a8b443c1c42c5d4ca9dd84d02fe9f2f110b993...,1.024963e+20,5.0,My girlfriends and I took a weekend ski trip t...,0x80960c29f2e3bf29:0x4b291f0d275a5699,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,my girlfriends and i took a weekend ski trip t...
2,77efbe6a6f4d27512b59bb2f878b0ac8b533aa03a11fb7...,1.102407e+20,5.0,The team at Black Tie never disappoints our se...,0x80960c29f2e3bf29:0x4b291f0d275a5699,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,the team at black tie never disappoints our se...


In [3]:
# 18 binary labels (6 aspects x 3 sentiments)
aspect_cols = [
    'food_quality_positive', 'food_quality_negative', 'food_quality_neutral',
    'service_positive', 'service_negative', 'service_neutral',
    'wait_time_positive', 'wait_time_negative', 'wait_time_neutral',
    'price_value_positive', 'price_value_negative', 'price_value_neutral',
    'cleanliness_positive', 'cleanliness_negative', 'cleanliness_neutral',
    'atmosphere_positive', 'atmosphere_negative', 'atmosphere_neutral'
]

## TF-IDF Vectorization

In [4]:
print("Fitting TF-IDF vectorizer...")
tfidf = TfidfVectorizer(
    max_features=10000,      # Default: None | Limits vocabulary to top 10k most frequent words to reduce dimensionality and speed up training
    stop_words='english',    # Default: None | 'english' - removes common words like "the", "is", "and" that don't carry sentiment
    min_df=5,                # Default: 1 | 5 - ignores words appearing in fewer than 5 documents (removes typos/rare words)
    max_df=0.7,              # Default: 1 | 0.7 - ignores words appearing in more than 70% of documents (removes overly common words)
    ngram_range=(1, 2)       # Default: (1, 1) | (1, 2) - unigrams AND bigrams (captures phrases like "not good", "really bad")
    )

Fitting TF-IDF vectorizer...


### Train/test split

In [5]:
print("Splitting data...")
X = tfidf.fit_transform(df['text_baseline'].fillna('')) # fillna to handle nan values
y = df[aspect_cols].values  

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.1, # 90/10 split (we have a lot of data, 10% is 2.2M for testing)
                                                    random_state=2)

print(f"Train size: {X_train.shape[0]:,}")
print(f"Test size:  {X_test.shape[0]:,}\n")

Train size: 20,361,941
Test size:  2,262,438



## Logistic Regression (Baseline)


### Train 

In [6]:
print("Training Logistic Regression...")
print()

# MultiOutputClassifier trains separate binary classifiers for each of the 18
# aspect-sentiment labels, enabling LogisticRegressionCV to handle multi-label classification
clf_1 = MultiOutputClassifier(
    LogisticRegressionCV( 
        cv=3,                  # Testing at 3 to start for speed
        scoring='f1_weighted',
        random_state=2,
        n_jobs=1,
        max_iter=500
    ),
    n_jobs=-1
)

clf_1.fit(X_train, y_train)

Training Logistic Regression...



0,1,2
,estimator  estimator: estimator object An estimator object implementing :term:`fit` and :term:`predict`. A :term:`predict_proba` method will be exposed only if `estimator` implements it.,LogisticRegre...'f1_weighted')
,"n_jobs  n_jobs: int or None, optional (default=None) The number of jobs to run in parallel. :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported by the passed estimator) will be parallelized for each target. When individual estimators are fast to train or predict, using ``n_jobs > 1`` can result in slower performance due to the parallelism overhead. ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all available processes / threads. See :term:`Glossary ` for more details. .. versionchanged:: 0.20  `n_jobs` default changed from `1` to `None`.",-1

0,1,2
,"Cs  Cs: int or list of floats, default=10 Each of the values in Cs describes the inverse of regularization strength. If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. Like in support vector machines, smaller values specify stronger regularization.",10
,"l1_ratios  l1_ratios: array-like of shape (n_l1_ratios), default=None Floats between 0 and 1 passed as Elastic-Net mixing parameter (scaling between L1 and L2 penalties). For `l1_ratio = 0` the penalty is an L2 penalty. For `l1_ratio = 1` it is an L1 penalty. For `0 < l1_ratio < 1`, the penalty is a combination of L1 and L2. All the values of the given array-like are tested by cross-validation and the one giving the best prediction score is used. .. warning::  Certain values of `l1_ratios`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. deprecated:: 1.8  `l1_ratios=None` is deprecated in 1.8 and will raise an error  in version 1.10. Default value will change from `None` to `(0.0,)`  in version 1.10.",'warn'
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"cv  cv: int or cross-validation generator, default=None The default cross-validation generator used is Stratified K-Folds. If an integer is provided, it specifies the number of folds, `n_folds`, used. See the module :mod:`sklearn.model_selection` module for the list of possible cross-validation objects. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",3
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.",False
,"penalty  penalty: {'l1', 'l2', 'elasticnet'}, default='l2' Specify the norm of the penalty: - `'l2'`: add a L2 penalty term (used by default); - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"scoring  scoring: str or callable, default=None The scoring method to use for cross-validation. Options: - str: see :ref:`scoring_string_names` for options. - callable: a scorer callable object (e.g., function) with signature  ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. - `None`: :ref:`accuracy ` is used.",'f1_weighted'
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' might be slower in :class:`LogisticRegressionCV`  because it does not handle warm-starting. - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty (`l1_ratio=0` for  L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) chosen and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"max_iter  max_iter: int, default=100 Maximum number of iterations of the optimization algorithm.",500


### Evaluate

In [7]:
y_pred_1 = clf_1.predict(X_test)

print("Logistic Regression (Baseline):")
print()
print(f"Training Accuracy: {clf_1.score(X_train, y_train):.4f}")
print(f"Test Accuracy:     {clf_1.score(X_test, y_test):.4f}")
print()
print(f"F1 Score (macro):    {f1_score(y_test, y_pred_1, average='macro'):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_1, average='weighted'):.4f}")
print()

print("Classification Report:")
print(classification_report(y_test, y_pred_1))
print()
print("\nLabel Index Key:")
for i, label in enumerate(aspect_cols):
    print(f"  {i}: {label}")

Logistic Regression (Baseline):

Training Accuracy: 0.7812
Test Accuracy:     0.7814

F1 Score (macro):    0.4786
F1 Score (weighted): 0.7698

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91    652525
           1       0.76      0.39      0.52     82540
           2       0.46      0.06      0.10     70495
           3       0.93      0.91      0.92    558326
           4       0.81      0.59      0.68     94898
           5       0.38      0.06      0.11     52074
           6       0.87      0.75      0.81    281182
           7       0.74      0.40      0.52     63970
           8       0.35      0.03      0.06     42068
           9       0.89      0.85      0.87    254752
          10       0.65      0.32      0.43     43259
          11       0.37      0.09      0.14     36230
          12       0.90      0.84      0.87    114873
          13       0.75      0.34      0.47     20552
          14       0.29

### Baseline analysis
- The baseline TF-IDF model fails to predict neutral sentiment labels (0% precision/recall) due to significant class imbalances.
- Neutral reviews comprise only 7% of the dataset while positive reviews dominate at 64%. The model optimizes for overall accuracy and ignores rare labels.
- This is a strong indication we need to address class imbalance. 

In [13]:
# Clear memory to train another model
del clf_1, y_pred_1
gc.collect()

7847

## Logistic Regression (Weighted Classes)


### Train & evaluate 

In [14]:
print("Training Logistic Regression...")
print()

# MultiOutputClassifier trains separate binary classifiers for each of the 18
# aspect-sentiment labels, enabling LogisticRegressionCV to handle multi-label classification
clf_2 = MultiOutputClassifier(
    LogisticRegressionCV( 
        cv=3,                  # Testing at 3 for speed
        scoring='f1_weighted',
        random_state=2,
        n_jobs=1,
        max_iter=500,
        class_weight='balanced' # Classes will all be equal in weight
    ),
    n_jobs=-1
)

clf_2.fit(X_train, y_train)

Training Logistic Regression...



0,1,2
,estimator  estimator: estimator object An estimator object implementing :term:`fit` and :term:`predict`. A :term:`predict_proba` method will be exposed only if `estimator` implements it.,LogisticRegre...'f1_weighted')
,"n_jobs  n_jobs: int or None, optional (default=None) The number of jobs to run in parallel. :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported by the passed estimator) will be parallelized for each target. When individual estimators are fast to train or predict, using ``n_jobs > 1`` can result in slower performance due to the parallelism overhead. ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all available processes / threads. See :term:`Glossary ` for more details. .. versionchanged:: 0.20  `n_jobs` default changed from `1` to `None`.",-1

0,1,2
,"Cs  Cs: int or list of floats, default=10 Each of the values in Cs describes the inverse of regularization strength. If Cs is as an int, then a grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. Like in support vector machines, smaller values specify stronger regularization.",10
,"l1_ratios  l1_ratios: array-like of shape (n_l1_ratios), default=None Floats between 0 and 1 passed as Elastic-Net mixing parameter (scaling between L1 and L2 penalties). For `l1_ratio = 0` the penalty is an L2 penalty. For `l1_ratio = 1` it is an L1 penalty. For `0 < l1_ratio < 1`, the penalty is a combination of L1 and L2. All the values of the given array-like are tested by cross-validation and the one giving the best prediction score is used. .. warning::  Certain values of `l1_ratios`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. deprecated:: 1.8  `l1_ratios=None` is deprecated in 1.8 and will raise an error  in version 1.10. Default value will change from `None` to `(0.0,)`  in version 1.10.",'warn'
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"cv  cv: int or cross-validation generator, default=None The default cross-validation generator used is Stratified K-Folds. If an integer is provided, it specifies the number of folds, `n_folds`, used. See the module :mod:`sklearn.model_selection` module for the list of possible cross-validation objects. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",3
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.",False
,"penalty  penalty: {'l1', 'l2', 'elasticnet'}, default='l2' Specify the norm of the penalty: - `'l2'`: add a L2 penalty term (used by default); - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"scoring  scoring: str or callable, default=None The scoring method to use for cross-validation. Options: - str: see :ref:`scoring_string_names` for options. - callable: a scorer callable object (e.g., function) with signature  ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. - `None`: :ref:`accuracy ` is used.",'f1_weighted'
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' might be slower in :class:`LogisticRegressionCV`  because it does not handle warm-starting. - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty (`l1_ratio=0` for  L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) chosen and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"max_iter  max_iter: int, default=100 Maximum number of iterations of the optimization algorithm.",500


In [15]:
y_pred_2 = clf_2.predict(X_test)

print("Logistic Regression (Weighted Clases):")
print()
print(f"Training Accuracy: {clf_2.score(X_train, y_train):.4f}")
print(f"Test Accuracy:     {clf_2.score(X_test, y_test):.4f}")
print()
print(f"F1 Score (macro):    {f1_score(y_test, y_pred_2, average='macro'):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_2, average='weighted'):.4f}")    
print()

print("Classification Report:")
print(classification_report(y_test, y_pred_2))
print()
print("\nLabel Index Key:")
for i, label in enumerate(label_cols):
    print(f"  {i}: {label}")


Logistic Regression (Weighted Clases):
Training Accuracy: 0.5529
Test Accuracy:     0.5529
F1 Score (macro):    0.5080
F1 Score (weighted): 0.7863


Label Index Key:
  0: food_quality_positive
  1: food_quality_negative
  2: food_quality_neutral
  3: service_positive
  4: service_negative
  5: service_neutral
  6: wait_time_positive
  7: wait_time_negative
  8: wait_time_neutral
  9: price_value_positive
  10: price_value_negative
  11: price_value_neutral
  12: cleanliness_positive
  13: cleanliness_negative
  14: cleanliness_neutral
  15: atmosphere_positive
  16: atmosphere_negative
  17: atmosphere_neutral

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92    652953
           1       0.30      0.94      0.46     83229
           2       0.16      0.88      0.27     70289
           3       0.89      0.96      0.93    558125
           4       0.42      0.95      0.59     95145
           5       0.13      0.89

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Weighted classes analysis
- Balanced or equal class weighting forces the model to predict all labels including the less frequent neutral sentiments (recall jumps from ~5% to ~90% on neutral classes), but overcorrects with very low precision (16-42% on neutral labels), this results in a lot of false positives. 
- Weighted F1 improves slightly (0.77 to 0.79) and macro F1 increases (0.48 to 0.51). 
- The model now predicts neutral too often, labeling many positive or negative reviews as neutral.
- This could indicate that class weights alone don't solve the imbalance problem.

In [18]:
# Clear memory to train another model
del clf_2, y_pred_2
gc.collect()

2693