<a href="https://colab.research.google.com/github/yaldaradan/data_mining/blob/main/bow_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Group P18**

**Names: Sarah Kamoun (221913835) and Yalda Radan (218515080)**

**TO DO:**
1. build two classifiers based on the training data using two different types of text classification approaches: Bag of Words (BOW) + text embeddings
2. make a comparison between the two
3. classify the reviews in the test data set and submit your prediction files (***one for each type of approach***)
please name the file **prediction1.csv** for the method with BOW and **prediction2.csv** for the method with text embeddings. **the rows should be in the same order of examples in the test dataset.**

**Notes:**
* reviews in the training set are labeled as positive, negative, or neutral

*  test set is Yelp reviews without sentiment/class labels

* 60,000 reviews were randomly selected to form a training set and another 60,000 reviews were selected as testing data. Both are stored in csv files.

* Each row in the training data set contains a review in text, its class label (positive, negative or neutral) and the id of the review (which you should ignore when learning a classifier).

* In the test data set, each row contains the id and text of a review.


# Loading the Data

In [None]:
# Read "train_yelp_60k.csv" and "test_yelp_60k.csv" from Google Drive (Colab only).
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

# One shared data folder so both files resolve through the same Drive path
# (the two reads previously used 'My Drive' and 'MyDrive' respectively).
DATA_DIR = '/content/drive/MyDrive'

# data loading
df_train = pd.read_csv(f'{DATA_DIR}/train_yelp_60k.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test_yelp_60k.csv')

Mounted at /content/drive


In [None]:
# Peek at the first two training rows to confirm the file loaded as expected.
print(df_train.head(n=2))

                                                Text     Class       ID
0  Chef Kevin Sousa's  2018 award winning restaur...  positive   727658
1  This place has got potential. I did quite enjo...  positive  5407165


In [None]:
# Count the training reviews per sentiment class to see how imbalanced the labels are.
class_counts = df_train['Class'].value_counts()
print(class_counts)

Class
positive    39328
negative    14028
neutral      6644
Name: count, dtype: int64


In [None]:
# Peek at the first two test rows.
print(df_test.head(n=2))

        ID                                               Text
0   226336  All McD's these days are new and updated.  Ele...
1  5905814  Want to try this place.  I'm so dissapointed t...


In [None]:
# importing our libraries
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix


# BOW Method

* Convert the set of documents in the training data into the BOW numerical vectors.  You can use one of the two classes provided in scikit-learn to do it: CountVectorizer or TfidfVectorizer.

* Select features via any feature selection methods provided in scikit-learn to do so.

* Train a classifier with any of the classification methods provided in scikit-learn


In [None]:
# Feature extraction: TF-IDF weighted bag of words over unigrams and bigrams,
# with English stop words removed and max_features set to 10000.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)
X_train = tfidf.fit_transform(df_train['Text'])


In [None]:
# Baseline comparison of three linear-time classifiers on the full TF-IDF matrix
# (no feature selection yet), each evaluated with 5-fold cross-validation.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Linear SVM": LinearSVC(),
}

for model_name, model in models.items():
    scores = cross_val_score(model, X_train, df_train['Class'], cv=5, scoring='f1_weighted')
    # The scorer is weighted F1, so report it under that name rather than "Accuracy".
    print(f"{model_name} F1_weighted: {np.mean(scores):.3f} (±{np.std(scores):.3f})")


Naive Bayes Accuracy: 0.775 (±0.003)
Logistic Regression Accuracy: 0.818 (±0.003)
Linear SVM Accuracy: 0.826 (±0.003)


In [None]:
# Feature selection: keep the k = 2000 TF-IDF terms with the highest
# chi-squared score against the class label.
feature_selector = SelectKBest(score_func=chi2, k=2000)
X_train_s = feature_selector.fit_transform(X_train, df_train['Class'])

# Map the surviving column indices back to their n-gram strings.
vocab = tfidf.get_feature_names_out()
selected_features = feature_selector.get_support(indices=True)
selected_vocab = vocab[selected_features]
print("Selected vocab:", selected_vocab)

Selected vocab: ['00' '000' '10' ... 'yummy' 'zero' 'zero stars']


In [None]:
# Model 1: Random Forest trained on the chi2-selected features.
print("Training Random Forest Classifier...")
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train_s, df_train['Class'])

# 5-fold cross-validation scored with macro F1, which weights the three
# classes equally regardless of how many reviews each one has.
scores = cross_val_score(
    clf,
    X_train_s,
    df_train['Class'],
    scoring='f1_macro',
    cv=5,
)
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

Training Random Forest Classifier...
Cross-validation F1_macro: 0.578 (±0.004)


In [None]:
# Compare three classifiers on the current chi2-selected features (X_train_s),
# using class weighting where the estimator supports it.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Linear SVM": LinearSVC(random_state=42, class_weight='balanced'),
}

for model_name, model in models.items():
    scores = cross_val_score(model, X_train_s, df_train['Class'], cv=5, scoring='f1_macro')
    # The scorer is macro F1, so report it under that name rather than "Accuracy".
    print(f"{model_name} F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")


Naive Bayes Accuracy: 0.570 (±0.002)
Logistic Regression Accuracy: 0.706 (±0.002)
Linear SVM Accuracy: 0.720 (±0.003)


In [None]:
# Feature selection again, this time keeping the k = 5000 best chi-squared terms.
feature_selector = SelectKBest(score_func=chi2, k=5000)
X_train_s = feature_selector.fit_transform(X_train, df_train['Class'])

# Map the surviving column indices back to their n-gram strings.
vocab = tfidf.get_feature_names_out()
selected_features = feature_selector.get_support(indices=True)
selected_vocab = vocab[selected_features]

In [None]:
# Random Forest on the current selected features (X_train_s).
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train_s, df_train['Class'])

# 5-fold cross-validation scored with macro F1, which weights the three
# classes equally regardless of how many reviews each one has.
scores = cross_val_score(
    clf,
    X_train_s,
    df_train['Class'],
    scoring='f1_macro',
    cv=5,
)
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

Cross-validation F1_macro: 0.569 (±0.002)


In [None]:
# Compare three classifiers on the current chi2-selected features (X_train_s),
# using class weighting where the estimator supports it.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Linear SVM": LinearSVC(random_state=42, class_weight='balanced'),
}

for model_name, model in models.items():
    scores = cross_val_score(model, X_train_s, df_train['Class'], cv=5, scoring='f1_macro')
    # The scorer is macro F1, so report it under that name rather than "Accuracy".
    print(f"{model_name} F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")


Naive Bayes Accuracy: 0.593 (±0.004)
Logistic Regression Accuracy: 0.714 (±0.003)
Linear SVM Accuracy: 0.722 (±0.004)


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the TF-IDF training matrix with SMOTE to balance the classes.
# SMOTE generates synthetic samples at random, so it is seeded here; without a
# seed every re-run produces a different training set and different scores.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, df_train['Class'])

In [None]:
# Feature selection on the SMOTE-balanced data: keep the k = 2000 best chi-squared terms.
feature_selector = SelectKBest(score_func=chi2, k=2000)
X_train_s = feature_selector.fit_transform(X_train_balanced, y_train_balanced)

# Map the surviving column indices back to their n-gram strings.
vocab = tfidf.get_feature_names_out()
selected_features = feature_selector.get_support(indices=True)
selected_vocab = vocab[selected_features]

In [None]:
# imblearn's Pipeline applies samplers such as SMOTE only while fitting, i.e.
# only to the training part of each CV fold.
from imblearn.pipeline import Pipeline as ImbPipeline

clf = RandomForestClassifier(n_estimators=100,random_state=42)

# Cross-validating on data that was oversampled *before* splitting leaks
# information: synthetic points interpolated from a validation review end up in
# the training folds (and vice versa), which inflates the score. Putting SMOTE
# and feature selection inside the pipeline means each fold is validated on
# real, untouched reviews only.
rf_cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('select', SelectKBest(chi2, k=2000)),
    ('clf', clf),
])

# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cross_val_score(rf_cv_pipeline, X_train, df_train['Class'], cv=5, scoring='f1_macro')
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

Cross-validation F1_macro: 0.898 (±0.052)


In [None]:
# imblearn's Pipeline applies samplers such as SMOTE only while fitting, i.e.
# only to the training part of each CV fold.
from imblearn.pipeline import Pipeline as ImbPipeline

# Compare three classifiers with SMOTE balancing (so no class_weight is set).
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC(random_state=42),
}

for model_name, model in models.items():
    # Oversampling and feature selection run inside each training fold, so the
    # validation folds contain only real reviews and the score is not inflated
    # by synthetic near-duplicates of validation samples.
    cv_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('select', SelectKBest(chi2, k=2000)),
        ('clf', model),
    ])
    scores = cross_val_score(cv_pipeline, X_train, df_train['Class'], cv=5, scoring='f1_macro')
    # The scorer is macro F1, so report it under that name rather than "Accuracy".
    print(f"{model_name} F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")


Naive Bayes Accuracy: 0.758 (±0.012)
Logistic Regression Accuracy: 0.820 (±0.022)
Linear SVM Accuracy: 0.824 (±0.022)


In [None]:
# Fit the Random Forest on the current selected training features and label the test reviews.
print("Predicting test set labels...")

# Train on the full (resampled, feature-selected) training matrix.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train_s, y_train_balanced)

# Push the test text through the already-fitted vectorizer and selector
# (transform only, never fit, on test data).
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)

y_pred_codes = clf.predict(X_test_s)



Predicting test set labels...


In [None]:
# Quick look at the predictions. NOTE(review): despite the "_codes" name, the
# recorded output shows string class labels here, not numeric codes.
print(y_pred_codes)

['negative' 'negative' 'positive' ... 'positive' 'positive' 'positive']


In [None]:
# Write the BOW submission file: the test IDs in their original row order,
# each paired with its predicted class.
df_pred = df_test[['ID']].assign(Class=y_pred_codes)
df_pred.to_csv('prediction1.csv', index=False)
print("Prediction saved to 'prediction1.csv'")

Prediction saved to 'prediction1.csv'


**Random Forest with chi squared for feature selection**

In [None]:
# End-to-end experiment: TF-IDF -> chi-squared selection -> Random Forest,
# with cross-validated scores, top features, and a test-set prediction file.
from sklearn.model_selection import cross_validate

# Feature extraction (TfidfVectorizer)
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2),stop_words='english')
X_train = tfidf.fit_transform(df_train['Text'])

# Feature selection - method 1 (chi-squared)
feature_selector =  SelectKBest(chi2, k=5000)
X_train_s = feature_selector.fit_transform(X_train, df_train['Class'])

# Model training - model 1 (Random Forest)
# This fit on all training data is what feature_importances_ and predict() use below.
print("Training Random Forest Classifier...")
clf = RandomForestClassifier(n_estimators=100,random_state=42)
clf.fit(X_train_s, df_train['Class'])

# Score both metrics in a single 5-fold pass. Two separate cross_val_score
# calls would refit the forest on the same folds twice for the same numbers.
cv_results = cross_validate(clf, X_train_s, df_train['Class'], cv=5,
                            scoring=['f1_macro', 'accuracy'])
# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cv_results['test_f1_macro']
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")
# cross-validation accuracy
scores = cv_results['test_accuracy']
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# For Random Forest, top features used.
# argsort is ascending, so the last 10 are the most important (listed least -> most).
selected_features = tfidf.get_feature_names_out()[feature_selector.get_support()]
top_features = selected_features[np.argsort(clf.feature_importances_)[-10:]]
print("Top features:", top_features)

# predict test set labels (transform only - the vectorizer/selector were fit on train)
print("Predicting test set labels...")
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)
y_pred = clf.predict(X_test_s)

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction1_chi2_randomforest.csv', index=False)
print("Prediction saved to 'prediction1_chi2_randomforest.csv'")

Training Random Forest Classifier...
Cross-validation F1_macro: 0.569 (±0.002)
Cross-validation accuracy: 0.813 (±0.002)
Top features: ['ok' 'asked' 'delicious' 'amazing' 'best' 'rude' 'told' 'horrible'
 'worst' 'great']
Predicting test set labels...
Prediction saved to 'prediction1_chi2_randomforest.csv'


**Random Forest with weights = balanced**

In [None]:
# End-to-end experiment: TF-IDF -> chi-squared selection -> class-weighted
# Random Forest, with cross-validated scores, top features, and a prediction file.
from sklearn.model_selection import cross_validate

# Feature extraction (TfidfVectorizer)
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2),stop_words='english')
X_train = tfidf.fit_transform(df_train['Text'])

# Feature selection - method 1 (chi-squared)
feature_selector =  SelectKBest(chi2, k=5000)
X_train_s = feature_selector.fit_transform(X_train, df_train['Class'])

# Model training - model 1 (Random Forest) with class_weight='balanced'
# This fit on all training data is what feature_importances_ and predict() use below.
print("Training Random Forest Classifier with class balance...")
clf = RandomForestClassifier(n_estimators=100,class_weight='balanced',random_state=42)
clf.fit(X_train_s, df_train['Class'])

# Score both metrics in a single 5-fold pass. Two separate cross_val_score
# calls would refit the forest on the same folds twice for the same numbers.
cv_results = cross_validate(clf, X_train_s, df_train['Class'], cv=5,
                            scoring=['f1_macro', 'accuracy'])
# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cv_results['test_f1_macro']
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")
# cross-validation accuracy
scores = cv_results['test_accuracy']
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# For Random Forest, top features used.
# argsort is ascending, so the last 10 are the most important (listed least -> most).
selected_features = tfidf.get_feature_names_out()[feature_selector.get_support()]
top_features = selected_features[np.argsort(clf.feature_importances_)[-10:]]
print("Top features:", top_features)

# predict test set labels (transform only - the vectorizer/selector were fit on train)
print("Predicting test set labels...")
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)
y_pred = clf.predict(X_test_s)

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction1_chi2_randomforest_balanced.csv', index=False)
print("Prediction saved to 'prediction1_chi2_randomforest_balanced.csv'")

Training Random Forest Classifier with class balance...
Cross-validation F1_macro: 0.570 (±0.004)
Cross-validation accuracy: 0.805 (±0.002)
Top features: ['ok' 'friendly' 'worst' 'told' 'love' 'best' 'delicious' 'amazing' 'good'
 'great']
Predicting test set labels...
Prediction saved to 'prediction1_chi2_randomforest_balanced.csv'


**Random Forest with mutual information for feature selection (weights=balanced)**

In [None]:
# End-to-end experiment: TF-IDF -> mutual-information selection -> class-weighted
# Random Forest, with cross-validated scores, top features, and a prediction file.
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import cross_validate
import warnings
# Suppresses every UserWarning from here on (this filter stays active for the session).
warnings.filterwarnings("ignore", category=UserWarning)

# Feature extraction (TfidfVectorizer)
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2),stop_words='english')
X_train = tfidf.fit_transform(df_train['Text'])

# Feature selection - method 2 (mutual information)
feature_selector =  SelectKBest(mutual_info_classif, k=5000)
X_train_s = feature_selector.fit_transform(X_train, df_train['Class'])

# Model training - model 1 (Random Forest) with class_weight='balanced'
# This fit on all training data is what feature_importances_ and predict() use below.
print("Training Random Forest Classifier with class balance...")
clf = RandomForestClassifier(n_estimators=100,class_weight='balanced',random_state=42)
clf.fit(X_train_s, df_train['Class'])

# Score both metrics in a single 5-fold pass. Two separate cross_val_score
# calls would refit the forest on the same folds twice for the same numbers.
cv_results = cross_validate(clf, X_train_s, df_train['Class'], cv=5,
                            scoring=['f1_macro', 'accuracy'])
# cross-validation; we found that for sentiment analysis, F1-macro is preferable
scores = cv_results['test_f1_macro']
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")
# cross-validation accuracy
scores = cv_results['test_accuracy']
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# For Random Forest, top features used.
# argsort is ascending, so the last 10 are the most important (listed least -> most).
selected_features = tfidf.get_feature_names_out()[feature_selector.get_support()]
top_features = selected_features[np.argsort(clf.feature_importances_)[-10:]]
print("Top features:", top_features)

# predict test set labels (transform only - the vectorizer/selector were fit on train)
print("Predicting test set labels...")
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)
y_pred = clf.predict(X_test_s)

df_pred = pd.DataFrame({'ID': df_test['ID'],'Class': y_pred})
df_pred.to_csv('prediction1_MI_randomforest_balanced.csv', index=False)
print("Prediction saved to 'prediction1_MI_randomforest_balanced.csv'")

Training Random Forest Classifier with class balance...
Cross-validation F1_macro: 0.575 (±0.005)
Cross-validation accuracy: 0.806 (±0.002)
Top features: ['definitely' 'friendly' 'delicious' 'love' 'told' 'best' 'worst'
 'amazing' 'good' 'great']
Predicting test set labels...
Prediction saved to 'prediction1_MI_randomforest_balanced.csv'


**Conclusion (Random Forest):**
* Adding balanced weights does not seem to have improved the performance (accuracy decreased a little, f1_macro has more variability). Balanced weights should be better, logically...
* Amongst the 2 feature selection methods, mutual information is - very slightly - better (80.6% vs. 80.5% accuracy for chi-squared)
* Increasing the number of trees didn't do anything.
* The performance is not perfect anyways, given our task (sentiment classification). Let's try a different model to see if it works better.


* As using "mutual information" gave slightly better results earlier, we'll start by testing it as our feature selection method

**SVM with class weighting and mutual_info to see if it's better than random forest (MI, SVM, balance)**

In [None]:

# Experiment: TF-IDF (1-2 grams) -> mutual-information selection -> class-weighted LinearSVC.
# Imports are repeated here so the cell can be run on its own.
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

# Suppresses every UserWarning from here on.
warnings.filterwarnings("ignore", category=UserWarning)

# --- Features: TF-IDF over unigrams and bigrams ---
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)
X_train = tfidf.fit_transform(df_train['Text'])

# --- Labels: encode the three sentiment classes as 1 / 0 / -1 ---
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

# --- Selection: keep the 5000 terms with the highest mutual information ---
feature_selector = SelectKBest(score_func=mutual_info_classif, k=5000)
X_train_s = feature_selector.fit_transform(X_train, y_train)

# --- Model: linear SVM, one-vs-rest by default, with balanced class weights ---
print("Training LinearSVC Classifier with class balance...")
clf = LinearSVC(class_weight='balanced', random_state=42, C=1)
clf.fit(X_train_s, y_train)

# --- Evaluation: 5-fold CV, macro F1 first (better for imbalanced classes) ---
scores = cross_val_score(clf, X_train_s, y_train, scoring='f1_macro', cv=5)
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# ... then plain accuracy
scores = cross_val_score(clf, X_train_s, y_train, scoring='accuracy', cv=5)
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# --- Inspection: terms with the largest mean absolute weight across the classes ---
if hasattr(clf.coef_, "toarray"):
    coef_dense = clf.coef_.toarray()
else:
    coef_dense = clf.coef_
top_feature_indices = np.mean(np.abs(coef_dense), axis=0).argsort()[-10:]
selected_features = tfidf.get_feature_names_out()[feature_selector.get_support()]
print("Top discriminative features:", selected_features[top_feature_indices])

# --- Prediction on the test set (transform only; nothing is re-fitted) ---
print("Predicting test set labels...")
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)
y_pred_codes = clf.predict(X_test_s)

# Decode the numeric predictions back into class names.
y_pred_cat = {1: 'positive', 0: 'neutral', -1: 'negative'}
y_pred = list(map(y_pred_cat.__getitem__, y_pred_codes))

# --- Output file ---
df_pred = pd.DataFrame({'ID': df_test['ID'], 'Class': y_pred})
df_pred.to_csv('prediction1_MI_svm.csv', index=False)
print("Prediction saved to 'prediction1_MI_svm.csv'")

Training LinearSVC Classifier with class balance...
Cross-validation F1_macro: 0.706 (±0.006)
Cross-validation accuracy: 0.827 (±0.005)
Top discriminative features: ['excellent' 'perfect' 'grateful' 'best' 'awesome' 'disgusting' 'great'
 'worst' 'amazing' 'delicious']
Predicting test set labels...
Prediction saved to 'prediction1_MI_svm.csv'


**Notes**
* The f1_macro improved from being around 57% to 70% when using SVM instead of random forest, and accuracy improved from 80% to 82.7% so we're headed in a good direction for our task.
* Our next idea is to expand the n_gram range to include longer expressions ('not the best' for example)

**SVM + expanding the ngram_range to include 3-word phrases**

In [None]:
# Experiment: TF-IDF (1-3 grams) -> mutual-information selection -> class-weighted LinearSVC.
# Imports are repeated here so the cell can be run on its own.
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

# Suppresses every UserWarning from here on.
warnings.filterwarnings("ignore", category=UserWarning)

# --- Features: TF-IDF, now including 3-word phrases ---
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    max_features=10000,
)
X_train = tfidf.fit_transform(df_train['Text'])

# --- Labels: encode the three sentiment classes as 1 / 0 / -1 ---
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

# --- Selection: keep the 5000 terms with the highest mutual information ---
feature_selector = SelectKBest(score_func=mutual_info_classif, k=5000)
X_train_s = feature_selector.fit_transform(X_train, y_train)

# --- Model: linear SVM, one-vs-rest by default, with balanced class weights ---
print("Training LinearSVC Classifier with class balance...")
clf = LinearSVC(class_weight='balanced', random_state=42, C=1)
clf.fit(X_train_s, y_train)

# --- Evaluation: 5-fold CV, macro F1 first (better for imbalanced classes) ---
scores = cross_val_score(clf, X_train_s, y_train, scoring='f1_macro', cv=5)
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# ... then plain accuracy
scores = cross_val_score(clf, X_train_s, y_train, scoring='accuracy', cv=5)
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# --- Inspection: terms with the largest mean absolute weight across the classes ---
if hasattr(clf.coef_, "toarray"):
    coef_dense = clf.coef_.toarray()
else:
    coef_dense = clf.coef_
top_feature_indices = np.mean(np.abs(coef_dense), axis=0).argsort()[-10:]
selected_features = tfidf.get_feature_names_out()[feature_selector.get_support()]
print("Top discriminative features:", selected_features[top_feature_indices])

# --- Prediction on the test set (transform only; nothing is re-fitted) ---
print("Predicting test set labels...")
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)
y_pred_codes = clf.predict(X_test_s)

# Decode the numeric predictions back into class names.
y_pred_cat = {1: 'positive', 0: 'neutral', -1: 'negative'}
y_pred = list(map(y_pred_cat.__getitem__, y_pred_codes))

# --- Output file ---
df_pred = pd.DataFrame({'ID': df_test['ID'], 'Class': y_pred})
df_pred.to_csv('prediction1_MI3_svm.csv', index=False)
print("Prediction saved to 'prediction1_MI3_svm.csv'")

Training LinearSVC Classifier with class balance...
Cross-validation F1_macro: 0.707 (±0.007)
Cross-validation accuracy: 0.827 (±0.005)
Top discriminative features: ['excellent' 'perfect' 'grateful' 'best' 'awesome' 'disgusting' 'great'
 'worst' 'amazing' 'delicious']
Predicting test set labels...
Prediction saved to 'prediction1_MI3_svm.csv'


**Notes**
* No change in performance, but let's try expanding the range to 5 to see if this could help.

**Trying to increase ngram to 5**

In [None]:
# Experiment: TF-IDF (1-5 grams) -> mutual-information selection -> class-weighted LinearSVC.
# Imports are repeated here so the cell can be run on its own.
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC

# Suppresses every UserWarning from here on.
warnings.filterwarnings("ignore", category=UserWarning)

# --- Features: TF-IDF, now including phrases of up to 5 words ---
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 5),
    max_features=10000,
)
X_train = tfidf.fit_transform(df_train['Text'])

# --- Labels: encode the three sentiment classes as 1 / 0 / -1 ---
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

# --- Selection: keep the 5000 terms with the highest mutual information ---
feature_selector = SelectKBest(score_func=mutual_info_classif, k=5000)
X_train_s = feature_selector.fit_transform(X_train, y_train)

# --- Model: linear SVM, one-vs-rest by default, with balanced class weights ---
print("Training LinearSVC Classifier with class balance...")
clf = LinearSVC(class_weight='balanced', random_state=42, C=1)
clf.fit(X_train_s, y_train)

# --- Evaluation: 5-fold CV, macro F1 first (better for imbalanced classes) ---
scores = cross_val_score(clf, X_train_s, y_train, scoring='f1_macro', cv=5)
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# ... then plain accuracy
scores = cross_val_score(clf, X_train_s, y_train, scoring='accuracy', cv=5)
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# --- Inspection: terms with the largest mean absolute weight across the classes ---
if hasattr(clf.coef_, "toarray"):
    coef_dense = clf.coef_.toarray()
else:
    coef_dense = clf.coef_
top_feature_indices = np.mean(np.abs(coef_dense), axis=0).argsort()[-10:]
selected_features = tfidf.get_feature_names_out()[feature_selector.get_support()]
print("Top discriminative features:", selected_features[top_feature_indices])

# --- Prediction on the test set (transform only; nothing is re-fitted) ---
print("Predicting test set labels...")
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)
y_pred_codes = clf.predict(X_test_s)

# Decode the numeric predictions back into class names.
y_pred_cat = {1: 'positive', 0: 'neutral', -1: 'negative'}
y_pred = list(map(y_pred_cat.__getitem__, y_pred_codes))

# --- Output file ---
df_pred = pd.DataFrame({'ID': df_test['ID'], 'Class': y_pred})
df_pred.to_csv('prediction1_MI5_svm.csv', index=False)
print("Prediction saved to 'prediction1_MI5_svm.csv'")

Training LinearSVC Classifier with class balance...
Cross-validation F1_macro: 0.707 (±0.007)
Cross-validation accuracy: 0.827 (±0.005)
Top discriminative features: ['excellent' 'perfect' 'grateful' 'best' 'awesome' 'disgusting' 'great'
 'worst' 'amazing' 'delicious']
Predicting test set labels...
Prediction saved to 'prediction1_MI5_svm.csv'


**Notes**
* It's not getting better with more ngram range. Let's revert to (1,2) and see if changing the hyperparameter "C" of the SVM changes anything.

**reducing C to 0.05 and n_gram back to (1,2)**

In [None]:
# Experiment: TF-IDF (1-2 grams) -> mutual-information selection -> class-weighted
# LinearSVC with stronger regularisation (C=0.05).
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import warnings

# Suppresses every UserWarning from here on.
warnings.filterwarnings("ignore", category=UserWarning)

# Feature extraction (TfidfVectorizer) with ngram 2 this time
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
X_train = tfidf.fit_transform(df_train['Text'])

# Map class labels to numerical codes
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

# Feature selection using mutual information
feature_selector = SelectKBest(mutual_info_classif, k=5000)
X_train_s = feature_selector.fit_transform(X_train, y_train)

# Train LinearSVC for multi-class classification
print("Training LinearSVC Classifier with class balance...")
clf = LinearSVC(C=0.05, random_state=42, class_weight='balanced')  # One-vs-rest by default
clf.fit(X_train_s, y_train)

# Cross-validation using F1-macro (better for imbalanced classes)
scores = cross_val_score(clf, X_train_s, y_train, cv=5, scoring='f1_macro')
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# Cross-validation accuracy
scores = cross_val_score(clf, X_train_s, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# Inspect top discriminative features: terms with the largest mean absolute
# weight across the classes (argsort is ascending, so the last 10 are largest).
coef_dense = clf.coef_.toarray() if hasattr(clf.coef_, "toarray") else clf.coef_
top_feature_indices = np.argsort(np.abs(coef_dense).mean(axis=0))[-10:]
selected_features = tfidf.get_feature_names_out()[feature_selector.get_support()]
print("Top discriminative features:", selected_features[top_feature_indices])

# Predict test set labels (transform only - nothing is re-fitted on test data)
print("Predicting test set labels...")
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)
y_pred_codes = clf.predict(X_test_s)

# Map numerical predictions back to class labels
y_pred_cat = {1: 'positive', 0: 'neutral', -1: 'negative'}
y_pred = [y_pred_cat[code] for code in y_pred_codes]

# Save predictions to CSV. The file name now carries the '.csv' extension so
# the file on disk matches the name reported in the message below.
df_pred = pd.DataFrame({'ID': df_test['ID'], 'Class': y_pred})
df_pred.to_csv('prediction1_MI2_C005_svm.csv', index=False)
print("Prediction saved to 'prediction1_MI2_C005_svm.csv'")

Training LinearSVC Classifier with class balance...
Cross-validation F1_macro: 0.715 (±0.004)
Cross-validation accuracy: 0.839 (±0.003)
Top discriminative features: ['horrible' 'excellent' 'love' 'awesome' 'ok' 'best' 'worst' 'amazing'
 'delicious' 'great']
Predicting test set labels...
Prediction saved to 'prediction1_MI2_C005_svm.csv'


* Final experiment

In [None]:
# Final experiment: TF-IDF (1-2 grams) -> SMOTE oversampling -> chi-squared
# selection (k=2000) -> class-weighted LinearSVC (C=0.05).
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
# imblearn's Pipeline applies samplers such as SMOTE only while fitting, i.e.
# only to the training part of each CV fold.
from imblearn.pipeline import Pipeline as ImbPipeline
# chi2 is imported here as well so the cell does not depend on an earlier import cell.
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import warnings

# Suppresses every UserWarning from here on.
warnings.filterwarnings("ignore", category=UserWarning)

# Feature extraction (TfidfVectorizer) with ngram 2 this time
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
X_train = tfidf.fit_transform(df_train['Text'])

# Map class labels to numerical codes
label_map = {'positive': 1, 'neutral': 0, 'negative': -1}
y_train = df_train['Class'].map(label_map).astype(int)

# Balance the classes for the final model. SMOTE is seeded so the synthetic
# samples (and therefore the predictions) are the same on every run.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Feature selection with k = 2000
feature_selector =  SelectKBest(chi2, k=2000)
X_train_s = feature_selector.fit_transform(X_train_balanced, y_train_balanced)

# Train LinearSVC for multi-class classification
print("Training LinearSVC Classifier with class balance...")
clf = LinearSVC(C=0.05, random_state=42, class_weight='balanced')  # One-vs-rest by default
clf.fit(X_train_s, y_train_balanced)

# Cross-validate the whole recipe on the ORIGINAL data. Running CV on the
# already-oversampled matrix puts synthetic neighbours of validation reviews
# into the training folds, which inflates the scores. Inside the pipeline,
# SMOTE and feature selection are re-fitted on each training fold only.
cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('select', SelectKBest(chi2, k=2000)),
    ('clf', LinearSVC(C=0.05, random_state=42, class_weight='balanced')),
])

# Cross-validation using F1-macro (better for imbalanced classes)
scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='f1_macro')
print(f"Cross-validation F1_macro: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# Cross-validation accuracy
scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {np.mean(scores):.3f} (±{np.std(scores):.3f})")

# Predict test set labels (transform only - nothing is re-fitted on test data)
print("Predicting test set labels...")
X_test = tfidf.transform(df_test['Text'])
X_test_s = feature_selector.transform(X_test)
y_pred_codes = clf.predict(X_test_s)

# Map numerical predictions back to class labels
y_pred_cat = {1: 'positive', 0: 'neutral', -1: 'negative'}
y_pred = [y_pred_cat[code] for code in y_pred_codes]

# Save predictions to CSV
df_pred = pd.DataFrame({'ID': df_test['ID'], 'Class': y_pred})
df_pred.to_csv('prediction1_lasttest.csv', index=False)
print("Prediction saved to 'prediction1_lasttest.csv'")

Training LinearSVC Classifier with class balance...
Cross-validation F1_macro: 0.803 (±0.019)
Cross-validation accuracy: 0.803 (±0.019)
Predicting test set labels...
Prediction saved to 'prediction1_lasttest.csv'
