<a href="https://colab.research.google.com/github/yaldaradan/data_mining/blob/main/embed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Group P18**

**Names: Sarah Kamoun (221913835) and Yalda Radan (218515080)**

**TO DO:**
1. build two classifiers based on the training data using two different types of text classification approaches: Bag of Words (BOW) + text embeddings
2. make a comparison between the two
3. classify the reviews in the test data set and submit your prediction files (***one for each type of approach***)
please name the file **prediction1.csv** for the method with BOW and **prediction2.csv** for the method with text embeddings. **the rows should be in the same order of examples in the test dataset.**

**Notes:**
* reviews in the training set are labeled as positive, negative, or neutral

*  test set is Yelp reviews without sentiment/class labels

* 60,000 reviews were randomly selected to form a training set and another 60,000 reviews were selected as testing data. Both are stored in csv files.

* Each row in the training data set contain a review in text, its class label (positive, negative or neutral) and the id of the review (which you should ignore when learning a classifier).

* In the test data set, each row contains the id and text of a review.


# Loading the Data

In [None]:
# read from google drive "train_yelp_60k.csv" and "test_yelp_60k.csv"
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

# data loading
df_train = pd.read_csv('/content/drive/My Drive/train_yelp_60k.csv')
df_test = pd.read_csv('/content/drive/MyDrive/test_yelp_60k.csv')

Mounted at /content/drive


In [None]:
print(df_train.head(2))

                                                Text     Class       ID
0  Chef Kevin Sousa's  2018 award winning restaur...  positive   727658
1  This place has got potential. I did quite enjo...  positive  5407165


In [None]:
# check class distribution
print(df_train['Class'].value_counts())

Class
positive    39328
negative    14028
neutral      6644
Name: count, dtype: int64


In [None]:
print(df_test.head(2))

        ID                                               Text
0   226336  All McD's these days are new and updated.  Ele...
1  5905814  Want to try this place.  I'm so dissapointed t...


In [None]:
# importing our libraries
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix


# Text Embeddings Method

In [None]:
from sentence_transformers import SentenceTransformer

https://huggingface.co/sentence-transformers

In [None]:
print(f"\n Trying model: all-MiniLM-L6-v2")

embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Encode train set
X = embedder.encode(df_train['Text'].tolist())

# Train classifier
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X, df_train['Class'], cv=5, scoring='f1_weighted')

print("Mean F1 score:", scores.mean())


 Trying model: all-MiniLM-L6-v2
Mean F1 score: 0.8012182294124275


In [None]:
print(f"\n Trying model: paraphrase-MiniLM-L3-v2")

embedder = SentenceTransformer('paraphrase-MiniLM-L3-v2')

# Encode train set
X = embedder.encode(df_train['Text'].tolist())

# Train classifier
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X, df_train['Class'], cv=5, scoring='f1_weighted')

print("Mean F1 score:", scores.mean())


 Trying model: paraphrase-MiniLM-L3-v2
Mean F1 score: 0.7836685645741979


In [None]:
print(f"\n Trying model: all-mpnet-base-v2")

embedder = SentenceTransformer('all-mpnet-base-v2')

# Encode train set
X = embedder.encode(df_train['Text'].tolist())

# Train classifier
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X, df_train['Class'], cv=5, scoring='f1_weighted')

print("Mean F1 score:", scores.mean())


 Trying model: all-mpnet-base-v2
Mean F1 score: 0.846147171261622


In [None]:
print(f"\n Trying model: sentence-t5-base")

embedder = SentenceTransformer('sentence-t5-base')

# Encode train set
X = embedder.encode(df_train['Text'].tolist())

# Train classifier
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X, df_train['Class'], cv=5, scoring='f1_weighted')

print("Mean F1 score:", scores.mean())


 Trying model: sentence-t5-base
Mean F1 score: 0.8556266863220637


In [None]:
print(f"\n Trying model: multi-qa-MiniLM-L6-cos-v1")

embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Encode train set
X = embedder.encode(df_train['Text'].tolist())

# Train classifier
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X, df_train['Class'], cv=5, scoring='f1_weighted')

print("Mean F1 score:", scores.mean())


 Trying model: multi-qa-MiniLM-L6-cos-v1
Mean F1 score: 0.7792259897451362


In [None]:
print(f"\n Trying model: all-distilroberta-v1")

embedder = SentenceTransformer('all-distilroberta-v1')

# Encode train set
X = embedder.encode(df_train['Text'].tolist())

# Train classifier
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X, df_train['Class'], cv=5, scoring='f1_weighted')

print("Mean F1 score:", scores.mean())


 Trying model: all-distilroberta-v1
Mean F1 score: 0.8445566944409059


* Given that SVM was better than Random Forest previously, we'll start by testing an SVM-based approach.

In [None]:
embedder = SentenceTransformer('sentence-t5-base')
X = embedder.encode(df_train['Text'].tolist())

In [None]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Linear SVM": LinearSVC(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=50,class_weight='balanced',random_state=42)}

for model_name, model in models.items():
    scores = cross_val_score(model, X, df_train['Class'], cv=5, scoring='f1_macro')
    print(f"{model_name} F1 score:: {np.mean(scores):.3f} (±{np.std(scores):.3f})")


Logistic Regression F1 score:: 0.744 (±0.002)
Linear SVM F1 score:: 0.699 (±0.006)
RandomForest F1 score:: 0.618 (±0.003)


In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_train_balanced, y_train_balanced = smote.fit_resample(X, df_train['Class'])

In [None]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=50,random_state=42)}

for model_name, model in models.items():
    scores = cross_val_score(model, X_train_balanced, y_train_balanced, cv=5, scoring='f1_macro')
    print(f"{model_name} F1 score:: {np.mean(scores):.3f} (±{np.std(scores):.3f})")


Logistic Regression F1 score:: 0.828 (±0.017)
Linear SVM F1 score:: 0.829 (±0.017)
RandomForest F1 score:: 0.931 (±0.007)


In [None]:
#Final classification and prediction
X_test = embedder.encode(df_test['Text'].tolist())
clf = RandomForestClassifier(n_estimators=50,random_state=42)
clf.fit(X_train_balanced, y_train_balanced)
y_pred = clf.predict(X_test)
print(y_pred)

['positive' 'negative' 'positive' ... 'negative' 'positive' 'positive']


##SVM (SVC from sklearn requires numerical labels)

###**svm, balanced classes, with 'all-MiniLM-L6-v2'**

In [None]:
#sentence_transformers library
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# SVM gives us errors if we don't replace class labels with numerical codes
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

#load pretrained model 'all-MiniLM-L6-v2'
print("pretrained model used: 'all-MiniLM-L6-v2'")
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

# Precompute embeddings for all text data
train_embeddings = model.encode(df_train['Text'].tolist(), convert_to_tensor=False)
test_embeddings = model.encode(df_test['Text'].tolist(), convert_to_tensor=False)

# Train classifier (SVM)
clf = SVC(kernel='linear', class_weight='balanced')
clf.fit(train_embeddings, y_train)  # y_train is your pre-encoded labels

# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='f1_macro')
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# cross-validation accuracy
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# Inspect coefficients
# Get top weighted embedding dimensions
top_dimensions = np.argsort(np.abs(clf.coef_).mean(axis=0))[-10:]
print("Top discriminative embedding dimensions:", top_dimensions)  # Not interpretable like words

# predict test set labels
print("Predicting test set labels...")
y_pred_codes = clf.predict(test_embeddings)

y_pred_cat={1:'positive', 0:'neutral', -1:'negative'}
y_pred = [y_pred_cat[code] for code in y_pred_codes]

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction1_TextEmbd_svm_all-MiniLM-L6-v2.csv', index=False)
print("Prediction saved to 'prediction1_TextEmbd_svm_all-MiniLM-L6-v2.csv'")

pretrained model used: 'all-MiniLM-L6-v2'


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Cross-validation F1_macro: 0.672 (±0.005)
Cross-validation accuracy: 0.747 (±0.005)
Top discriminative embedding dimensions: [  1  14  45 288 333 322 299 136 317 301]
Predicting test set labels...
Prediction saved to 'prediction1_TextEmbd_svm_all-MiniLM-L6-v2.csv'


Cross-validation F1_macro: 0.672 (±0.005)
Cross-validation accuracy: 0.747 (±0.005)

**Notes**
* performs less than our previous BOW-method model by around 10%!

### c=0.2 and paraphrase-mpnet-base-v2

In [None]:
#sentence_transformers library
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# SVM gives us errors if we don't replace class labels with numerical codes
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

#load pretrained model 'all-distilroberta-v1'
model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2',device='cuda')

print("pretrained model used: 'paraphrase-mpnet-base-v2'")

# Precompute embeddings for all text data
train_embeddings = model.encode(df_train['Text'].tolist(), convert_to_tensor=False)
test_embeddings = model.encode(df_test['Text'].tolist(), convert_to_tensor=False)

# Train classifier (SVM)
clf = SVC(C=0.2, random_state=42, kernel='linear', class_weight='balanced')
clf.fit(train_embeddings, y_train)  # y_train is your pre-encoded labels

# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='f1_macro')
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# cross-validation accuracy
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# Inspect coefficients
# Get top weighted embedding dimensions (analogous to TF-IDF features)
top_dimensions = np.argsort(np.abs(clf.coef_).mean(axis=0))[-10:]
print("Top discriminative embedding dimensions:", top_dimensions)  # Not interpretable like words

# predict test set labels
print("Predicting test set labels...")
y_pred_codes = clf.predict(test_embeddings)

y_pred_cat={1:'positive', 0:'neutral', -1:'negative'}
y_pred = [y_pred_cat[code] for code in y_pred_codes]

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction1_TextEmbd_svm02_paraphrase-mpnet-base-v2.csv', index=False)
print("Prediction saved to 'prediction1_TextEmbd_svm02_paraphrase-mpnet-base-v2.csv'")


pretrained model used: 'paraphrase-mpnet-base-v2'
Cross-validation F1_macro: 0.754 (±0.004)
Cross-validation accuracy: 0.828 (±0.003)
Top discriminative embedding dimensions: [326 497 670 154 165 181 432 494   6 176]
Predicting test set labels...
Prediction saved to 'prediction1_TextEmbd_svm02_paraphrase-mpnet-base-v2.csv'


Cross-validation F1_macro: 0.754 (±0.004)

Cross-validation accuracy: 0.828 (±0.003)

###**a larger pretrained model, 'all-mpnet-base-v2', with svm, C=0.2**

In [None]:
#sentence_transformers library
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# SVM gives us errors if we don't replace class labels with numerical codes
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

#load pretrained model 'all-mpnet-base-v2'
model = SentenceTransformer('all-mpnet-base-v2',device='cuda')

print("pretrained model used: 'all-mpnet-base-v2'")

# Precompute embeddings for all text data
train_embeddings = model.encode(df_train['Text'].tolist(), convert_to_tensor=False)
test_embeddings = model.encode(df_test['Text'].tolist(), convert_to_tensor=False)

# Train classifier (SVM)
clf = SVC(C=0.2, random_state=42, kernel='linear', class_weight='balanced')
clf.fit(train_embeddings, y_train)  # y_train is your pre-encoded labels

# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='f1_macro')
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# cross-validation accuracy
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# Inspect coefficients
# Get top weighted embedding dimensions (analogous to TF-IDF features)
top_dimensions = np.argsort(np.abs(clf.coef_).mean(axis=0))[-10:]
print("Top discriminative embedding dimensions:", top_dimensions)  # Not interpretable like words

# predict test set labels
print("Predicting test set labels...")
y_pred_codes = clf.predict(test_embeddings)

y_pred_cat={1:'positive', 0:'neutral', -1:'negative'}
y_pred = [y_pred_cat[code] for code in y_pred_codes]

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction1_TextEmbd_svm02_all-mpnet-base-v2.csv', index=False)
print("Prediction saved to 'prediction1_TextEmbd_svm02_all-mpnet-base-v2.csv'")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

pretrained model used: 'all-mpnet-base-v2'
Cross-validation F1_macro: 0.722 (±0.004)
Cross-validation accuracy: 0.799 (±0.004)
Top discriminative embedding dimensions: [165 528  36 401 254 299 338 517 150 766]
Predicting test set labels...
Prediction saved to 'prediction1_TextEmbd_svm02_all-mpnet-base-v2.csv'


pretrained model used: 'all-mpnet-base-v2'

Cross-validation F1_macro: 0.722 (±0.004)

Cross-validation accuracy: 0.799 (±0.004)

In [None]:
#sentence_transformers library
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# SVM gives us errors if we don't replace class labels with numerical codes
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

#load pretrained model 'paraphrase-MiniLM-L3-v2'
model = SentenceTransformer('paraphrase-MiniLM-L3-v2',device='cuda')

print("pretrained model used: 'paraphrase-MiniLM-L3-v2'")

# Precompute embeddings for all text data
train_embeddings = model.encode(df_train['Text'].tolist(), convert_to_tensor=False)
test_embeddings = model.encode(df_test['Text'].tolist(), convert_to_tensor=False)

# Train classifier (SVM)
clf = SVC(C=0.2, random_state=42, kernel='linear', class_weight='balanced')
clf.fit(train_embeddings, y_train)  # y_train is your pre-encoded labels

# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='f1_macro')
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# cross-validation accuracy
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# Inspect coefficients
# Get top weighted embedding dimensions (analogous to TF-IDF features)
top_dimensions = np.argsort(np.abs(clf.coef_).mean(axis=0))[-10:]
print("Top discriminative embedding dimensions:", top_dimensions)  # Not interpretable like words

# predict test set labels
print("Predicting test set labels...")
y_pred_codes = clf.predict(test_embeddings)

y_pred_cat={1:'positive', 0:'neutral', -1:'negative'}
y_pred = [y_pred_cat[code] for code in y_pred_codes]

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction1_TextEmbd_svm02_paraphrase-MiniLM-L3-v2.csv', index=False)
print("Prediction saved to 'prediction1_TextEmbd_svm02_paraphrase-MiniLM-L3-v2.csv'")


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

pretrained model used: 'paraphrase-MiniLM-L3-v2'
Cross-validation F1_macro: 0.644 (±0.003)
Cross-validation accuracy: 0.715 (±0.002)
Top discriminative embedding dimensions: [ 29  84 292  14  12 145 164  57 204 220]
Predicting test set labels...
Prediction saved to 'prediction1_TextEmbd_svm02_paraphrase-MiniLM-L3-v2.csv'


pretrained model used: 'paraphrase-MiniLM-L3-v2'

Cross-validation F1_macro: 0.644 (±0.003)

Cross-validation accuracy: 0.715 (±0.002)

##**Random forest (100 trees, all-mpnet-base-v2)**

In [None]:
#sentence_transformers library
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# SVM gives us errors if we don't replace class labels with numerical codes
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

#load pretrained model 'all-mpnet-base-v2'
model = SentenceTransformer('all-mpnet-base-v2',device='cuda')

print("pretrained model used: 'all-mpnet-base-v2'")

# Precompute embeddings for all text data
train_embeddings = model.encode(df_train['Text'].tolist(), convert_to_tensor=False)
test_embeddings = model.encode(df_test['Text'].tolist(), convert_to_tensor=False)

# Replace SVM with RF (keep the rest identical)
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Handles imbalanced classes
    random_state=42
)
clf.fit(train_embeddings, y_train)  # y_train is your pre-encoded labels

# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='f1_macro')
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# cross-validation accuracy
scores = cross_val_score(clf, train_embeddings, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# For Random Forest, top features used
selected_features = tfidf.get_feature_names_out()[feature_selector.get_support()]
top_features = selected_features[np.argsort(clf.feature_importances_)[-10:]]
print("Top features:", top_features)

# predict test set labels
print("Predicting test set labels...")
y_pred_codes = clf.predict(test_embeddings)

y_pred_cat={1:'positive', 0:'neutral', -1:'negative'}
y_pred = [y_pred_cat[code] for code in y_pred_codes]

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction1_TextEmbd_Randomforest_all-mpnet-base-v2.csv', index=False)
print("Prediction saved to 'prediction1_TextEmbd_Randomforest_all-mpnet-base-v2.csv'")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

pretrained model used: 'all-mpnet-base-v2'
Cross-validation F1_macro: 0.551 (±0.002)
Cross-validation accuracy: 0.805 (±0.002)


AttributeError: 'RandomForestClassifier' object has no attribute 'coef_'

# Re-running the best model

**Model configuration**
* sentence-t5-base, SMOTE, Random Forest with 50 trees

In [None]:
#sentence_transformers library
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE

import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# SVM gives us errors if we don't replace class labels with numerical codes
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

#load pretrained model 'sentence-t5-base'
model = SentenceTransformer('sentence-t5-base',device='cuda')

print("pretrained model used: 'sentence-t5-base'")

# Precompute embeddings for all text data
train_embeddings = model.encode(df_train['Text'].tolist(), convert_to_tensor=False)
test_embeddings = model.encode(df_test['Text'].tolist(), convert_to_tensor=False)

# Replace SVM with RF (keep the rest identical)
clf = RandomForestClassifier(
    n_estimators=50,
    random_state=42
)

smote = SMOTE()
X_train_balanced, y_train_balanced = smote.fit_resample(train_embeddings, y_train)

clf.fit(X_train_balanced, y_train_balanced)

# Cross-validation using F1-macro and accuracy
scoring_metrics = ['f1_macro', 'accuracy']
for metric in scoring_metrics:
    scores = cross_val_score(clf, X_train_balanced, y_train_balanced, cv=5, scoring=metric)
    print(f"Cross-validation {metric}: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# predict test set labels
print("Predicting test set labels...")
y_pred_codes = clf.predict(test_embeddings)

y_pred_cat={1:'positive', 0:'neutral', -1:'negative'}
y_pred = [y_pred_cat[code] for code in y_pred_codes]

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction2.csv', index=False)
print("Prediction saved to 'prediction2.csv'")


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

rust_model.ot:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

pretrained model used: 'sentence-t5-base'
Cross-validation f1_macro: 0.930 (±0.006)
Cross-validation accuracy: 0.930 (±0.006)
Predicting test set labels...
Prediction saved to 'prediction2.csv'


pretrained model used: 'sentence-t5-base'

Cross-validation f1_macro: 0.930 (±0.006)

Cross-validation accuracy: 0.930 (±0.006)

In [None]:
import numpy as np

# Save embeddings to a .npy file
np.save('test_embeddings.npy', test_embeddings)

# Load embeddings back
loaded_embeddings = np.load('test_embeddings.npy')