## Pre process

In [25]:
import pandas as pd
import numpy as np
import nltk
import pickle
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
file_path = 'Labeled-Amazon-Reviews-Dataset.csv'
df = pd.read_csv(file_path)

# Ensure necessary columns are present
required_columns = ['Review', 'Labels']
df = df[required_columns].dropna()

# Preprocessing function
def preprocess_text(text):
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stopwords and non-alphabetic characters
    """ Stopwords are commonly used words in a language that are often filtered out in text processing tasks. These words are generally considered to have little to no semantic value in the context of tasks like text classification, information retrieval, and natural language processing (NLP). Examples of stopwords in English include "the," "is," "in," "and," "or," "an," and "a."

        Why Remove Stopwords?
        Noise Reduction: Stopwords can be seen as noise in the data, which can overshadow meaningful content.
        Dimensionality Reduction: Removing stopwords reduces the number of features (words) in the text data, which can make the models more efficient and faster to train.
        Improved Focus: It helps the algorithms to focus on the words that contribute more to the meaning and context of the text."""
    
    words = [word for word in words if word.isalpha()]
    stopwords = nltk.corpus.stopwords.words('english')
    words = [word for word in words if word.lower() not in stopwords]
    # Lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word.lower()) for word in words]
    return ' '.join(words)

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Apply preprocessing
df['cleaned_review'] = df['Review'].apply(preprocess_text)

# Map labels to numerical values
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
df['label'] = df['Labels'].map(label_mapping)

# Check the result
print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                              Review    Labels  \
0  frustrating because they sometimes get screwed...  negative   
1  I love these! They are super easy to use, ther...  positive   
2                 not the best quality, but it works  positive   
3  If you’re like me, I still like to write on a ...  positive   
4  As an office manager, I go through my fair sha...  positive   

                                      cleaned_review  label  
0        frustrating sometimes get screwed worthless      0  
1  love super easy use like drying time last long...      2  
2                                  best quality work      2  
3  like still like write calendar list old school...      2  
4  office manager go fair share correction tape p...      2  


In [26]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['Labels'], test_size=0.2, random_state=42)

# Vectorize the text data
"""Term Frequency: TF of a term or word is the number of times the term appears in a document compared to the total number of words in the document.
   Inverse Document Frequency: IDF of a term reflects the proportion of documents in the corpus that contain the term.
   TF-IDF Score: Combines TF and IDF to provide a balanced score that highlights words that are important in a specific document while reducing the impact of commonly used words."""

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# Save the vectorizer to a file
vectorizer_path = "tfidf_vectorizer.pkl"
with open(vectorizer_path, "wb") as file:
    pickle.dump(vectorizer, file)

In [3]:
df['label'].value_counts()

2    204
1     53
0     18
Name: label, dtype: int64

Due to Class Imbalance in the data, SMOTE and RandomUnderSampler are used to balance the data.

In [4]:
# Combine oversampling and undersampling in a pipeline
over = SMOTE(sampling_strategy={'negative': 200, 'neutral': 200, 'positive': 200})
under = RandomUnderSampler(sampling_strategy={'negative': 18, 'neutral': 53, 'positive': 53})
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Apply the pipeline to the training data
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train_vec, y_train)

Applying different models on the dataset

### Logistic Regression

In [5]:
# Train a logistic regression model
model_logistic_regression = LogisticRegression(class_weight='balanced')
model_logistic_regression.fit(X_train_resampled, y_train_resampled)

# Predict on the test data
y_pred = model_logistic_regression.predict(X_test_vec)
y_pred

array(['positive', 'positive', 'positive', 'positive', 'positive',
       'neutral', 'positive', 'positive', 'positive', 'neutral',
       'positive', 'negative', 'negative', 'neutral', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'neutral', 'positive', 'neutral', 'positive', 'neutral',
       'negative', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'neutral', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'neutral', 'positive', 'neutral'],
      dtype=object)

In [6]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral','positive']))

Accuracy: 0.8909090909090909
Classification Report:
              precision    recall  f1-score   support

    negative       0.67      0.50      0.57         4
     neutral       0.78      0.70      0.74        10
    positive       0.93      0.98      0.95        41

    accuracy                           0.89        55
   macro avg       0.79      0.73      0.75        55
weighted avg       0.88      0.89      0.89        55



In [7]:
pickle_out=open("model_logistic_regression.pkl","wb")
pickle.dump(model_logistic_regression,pickle_out)
pickle_out.close()

### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
# Train a Random Forest model
model = RandomForestClassifier(class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)

# Predict on the test data
y_pred = model.predict(X_test_vec)
y_pred



array(['positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'neutral', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'neutral', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'neutral'],
      dtype=object)

In [9]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         4
     neutral       1.00      0.30      0.46        10
    positive       0.79      1.00      0.88        41

    accuracy                           0.80        55
   macro avg       0.60      0.43      0.45        55
weighted avg       0.77      0.80      0.74        55



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Gradient Boosting

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train the model
model_gradient_boosting = GradientBoostingClassifier(random_state=42)
model_gradient_boosting.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = model_gradient_boosting.predict(X_test_vec)
y_pred


array(['positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'negative', 'neutral', 'positive',
       'positive', 'positive', 'positive', 'negative', 'positive',
       'neutral', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'negative', 'positive', 'positive', 'positive',
       'positive', 'positive', 'neutral', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'neutral', 'positive', 'neutral'],
      dtype=object)

In [11]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

    negative       0.33      0.25      0.29         4
     neutral       0.80      0.40      0.53        10
    positive       0.83      0.95      0.89        41

    accuracy                           0.80        55
   macro avg       0.65      0.53      0.57        55
weighted avg       0.79      0.80      0.78        55



In [12]:
pickle_out=open("model_gradient_boosting.pkl","wb")
pickle.dump(model_gradient_boosting,pickle_out)
pickle_out.close()

### SVM

In [13]:
# Train a SVM model
from sklearn.svm import SVC

model = SVC(class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_vec)
y_pred

array(['positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'neutral', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'neutral', 'positive', 'positive', 'positive', 'neutral',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'neutral'],
      dtype=object)

In [14]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral','positive']))

Accuracy: 0.8181818181818182
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         4
     neutral       1.00      0.40      0.57        10
    positive       0.80      1.00      0.89        41

    accuracy                           0.82        55
   macro avg       0.60      0.47      0.49        55
weighted avg       0.78      0.82      0.77        55



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Train a CatBoost model
'''from catboost import CatBoostClassifier

model = CatBoostClassifier(class_weights=[1, 1, len(y_train) / sum(y_train == 'negative')], verbose=0)
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_vec)
y_pred'''


"from catboost import CatBoostClassifier\n\nmodel = CatBoostClassifier(class_weights=[1, 1, len(y_train) / sum(y_train == 'negative')], verbose=0)\nmodel.fit(X_train_resampled, y_train_resampled)\ny_pred = model.predict(X_test_vec)\ny_pred"

In [16]:
'''print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))'''


'print("Accuracy:", accuracy_score(y_test, y_pred))\nprint("Classification Report:")\nprint(classification_report(y_test, y_pred, target_names=[\'negative\', \'neutral\', \'positive\']))'

### LightGBM

In [17]:
# Train a LightGBM model
import lightgbm as lgb
from lightgbm import LGBMClassifier

model_LGBM = LGBMClassifier(class_weight='balanced',random_state=42)
model_LGBM.fit(X_train_resampled, y_train_resampled)
y_pred = model_LGBM.predict(X_test_vec)
y_pred


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 168
[LightGBM] [Info] Number of data points in the train set: 124, number of used features: 18
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


array(['positive', 'negative', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'neutral', 'positive',
       'positive', 'positive', 'neutral', 'positive', 'positive',
       'neutral', 'positive', 'negative', 'positive', 'positive',
       'negative', 'negative', 'positive', 'positive', 'positive',
       'positive', 'neutral', 'positive', 'positive', 'positive',
       'positive', 'positive', 'neutral', 'positive', 'positive',
       'positive', 'positive', 'neutral', 'positive', 'neutral',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'neutral', 'neutral', 'positive', 'neutral', 'positive'],
      dtype=object)

In [18]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


Accuracy: 0.6727272727272727
Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.50      0.50         4
     neutral       0.30      0.30      0.30        10
    positive       0.78      0.78      0.78        41

    accuracy                           0.67        55
   macro avg       0.53      0.53      0.53        55
weighted avg       0.67      0.67      0.67        55



In [19]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Fine-tune the hyperparameters
model_LGBM = LGBMClassifier(
    boosting_type='gbdt',  # Gradient Boosting Decision Tree
    class_weight='balanced',  # Balancing classes
    learning_rate=0.1,  # Increase learning rate
    n_estimators=500,  # Increase number of trees
    num_leaves=50,  # Adjust number of leaves
    max_depth=10,  # Limit depth to control overfitting
    min_child_samples=10,  # Adjust minimum number of samples in leaf nodes
    subsample=0.9,  # Sample a fraction of data for training each tree
    colsample_bytree=0.8,  # Sample a fraction of features for training each tree
    reg_alpha=0.1,  # L1 regularization term
    reg_lambda=0.1,  # L2 regularization term
    random_state=42  # Set random seed for reproducibility
)

model_LGBM.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model_LGBM.predict(X_test_vec)
y_pred

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 750
[LightGBM] [Info] Number of data points in the train set: 124, number of used features: 119
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


array(['neutral', 'positive', 'positive', 'positive', 'positive',
       'neutral', 'positive', 'neutral', 'positive', 'neutral', 'neutral',
       'positive', 'negative', 'neutral', 'positive', 'positive',
       'positive', 'neutral', 'negative', 'positive', 'neutral',
       'positive', 'neutral', 'positive', 'neutral', 'negative',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'negative', 'neutral', 'positive', 'positive', 'positive',
       'positive', 'neutral', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'positive', 'positive', 'positive', 'positive', 'positive',
       'neutral', 'neutral', 'positive', 'neutral'], dtype=object)

In [20]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.50      0.50         4
     neutral       0.53      0.80      0.64        10
    positive       0.94      0.83      0.88        41

    accuracy                           0.80        55
   macro avg       0.66      0.71      0.67        55
weighted avg       0.84      0.80      0.81        55



In [21]:
pickle_out=open("model_LGBM.pkl","wb")
pickle.dump(model_LGBM,pickle_out)
pickle_out.close()

### Naive Bayes

In [22]:
#Train a Naive Bayes model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Accuracy: 0.8545454545454545
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         4
     neutral       0.88      0.70      0.78        10
    positive       0.85      0.98      0.91        41

    accuracy                           0.85        55
   macro avg       0.58      0.56      0.56        55
weighted avg       0.79      0.85      0.82        55



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### AdaBoost

In [23]:
# Train an AdaBoost model
from sklearn.ensemble import AdaBoostClassifier

model_Ada = AdaBoostClassifier()
model_Ada.fit(X_train_resampled, y_train_resampled)
y_pred = model_Ada.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))




Accuracy: 0.7636363636363637
Classification Report:
              precision    recall  f1-score   support

    negative       0.43      0.75      0.55         4
     neutral       0.50      0.20      0.29        10
    positive       0.84      0.90      0.87        41

    accuracy                           0.76        55
   macro avg       0.59      0.62      0.57        55
weighted avg       0.75      0.76      0.74        55



In [24]:
pickle_out=open("model_Ada.pkl","wb")
pickle.dump(model_Ada,pickle_out)
pickle_out.close()