In [10]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from joblib import Memory
import joblib
import os
import warnings
import time
import mlflow
from mlflow.tracking import MlflowClient
import mlflow.pyfunc


In [2]:
# Suppress warnings
warnings.filterwarnings('ignore')


In [3]:
# Set MLflow tracking URI
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Start MLflow experiment
mlflow.set_experiment("exp-6")


2024/03/28 13:19:06 INFO mlflow.tracking.fluent: Experiment with name 'exp-6' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/583061336309648736', creation_time=1711612146289, experiment_id='583061336309648736', last_update_time=1711612146289, lifecycle_stage='active', name='exp-6', tags={}>

In [4]:
# Load the dataset
df = pd.read_csv('data.csv')

# Display dataset information
print(df.info())

# Display the first few rows of the dataset
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB
None
            Reviewer Name               Review Title  \
0            Kamal Suresh               Nice product   
1       Flipkart Customer     Don't waste your money   
2  A. S. Raja Srinivasan   Did not meet expectations   
3     Suresh Narayanasamy                       Fair   
4               ASHIK P A                Over priced   

               Place of Revi

In [5]:
# Select relevant columns
df = df[["Review Title", "Review text", "Ratings"]]

# Preprocess the text and assign sentiment labels
df['Sentiment'] = df['Ratings'].apply(lambda rating: 'negative' if rating <= 2 else 'positive')

df.head()


Unnamed: 0,Review Title,Review text,Ratings,Sentiment
0,Nice product,"Nice product, good quality, but price is now r...",4,positive
1,Don't waste your money,They didn't supplied Yonex Mavis 350. Outside ...,1,negative
2,Did not meet expectations,Worst product. Damaged shuttlecocks packed in ...,1,negative
3,Fair,"Quite O. K. , but nowadays the quality of the...",3,positive
4,Over priced,Over pricedJust â?¹620 ..from retailer.I didn'...,1,negative


In [6]:
# Split the data into features and target
X = df[['Review text']]
y = df['Sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess text function
# nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text)
    text = text.replace('READ MORE', '')
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r':\)|:\(|:\D|:\S', '', text)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_text = [word for word in words if word not in stop_words]
    filtered_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    return " ".join(filtered_text)

# Apply preprocessing to training and test data
X_train['clean_text'] = X_train['Review text'].apply(preprocess_text)
X_test['clean_text'] = X_test['Review text'].apply(preprocess_text)


In [7]:
# Define models and pipelines
pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB())
    ]),
    'decision_tree': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ]),
    'logistic_regression': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', RandomForestClassifier())
    ]),
    'svm': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', SVC())
    ])
}

# Define parameter grids
param_grids = {
    'naive_bayes': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__alpha' : [1, 10]
        }
    ],
    'decision_tree': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'logistic_regression': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['elasticnet'],
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced']
        }
    ],
    'random_forest': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'svm': [
        {
            'vectorization': [CountVectorizer(), TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__kernel': ['linear', 'rbf', 'poly'],
            'classifier__C': [0.1, 1, 10],
        }
    ]
}


In [9]:
# Perform GridSearchCV for each algorithm
for algo in pipelines.keys():
    # Start main run for the model
    with mlflow.start_run(run_name=algo):
        print("*"*10, algo, "*"*10)
        grid_search = GridSearchCV(estimator=pipelines[algo],
                                   param_grid=param_grids[algo],
                                   cv=5,
                                   scoring='f1',
                                   return_train_score=True,
                                   verbose=1
                                  )

        grid_search.fit(X_train['clean_text'], y_train)

        # Log parameters for the main run
        best_params = grid_search.best_params_
        mlflow.log_params(best_params)

        # Log metrics for the main run
        best_model = grid_search.best_estimator_
        train_f1 = f1_score(y_train, best_model.predict(X_train['clean_text']), pos_label='positive')
        test_f1 = f1_score(y_test, best_model.predict(X_test['clean_text']), pos_label='positive')
        train_accuracy = accuracy_score(y_train, best_model.predict(X_train['clean_text']))
        test_accuracy = accuracy_score(y_test, best_model.predict(X_test['clean_text']))

        mlflow.log_metric("train_f1", train_f1)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("test_accuracy", test_accuracy)

        print('Best Train F1 Score:', train_f1)
        print('Best Test F1 Score:', test_f1)
        print('Best Train Accuracy:', train_accuracy)
        print('Best Test Accuracy:', test_accuracy)

        # Log the model for the main run
        mlflow.sklearn.log_model(best_model, "model")

        # Log each individual run's details
        for i, params in enumerate(grid_search.cv_results_['params']):
            with mlflow.start_run(nested=True, run_name=f"Run-{i+1}"):
                # Fit the pipeline with the current parameters
                pipeline = Pipeline([
                    ('vectorization', params['vectorization']),
                    ('classifier', pipelines[algo]['classifier'])  # Use the classifier of the pipeline
                ])
                pipeline.set_params(**params)
                pipeline.fit(X_train['clean_text'], y_train)

                # Calculate metrics for the individual run
                train_f1 = f1_score(y_train, pipeline.predict(X_train['clean_text']), pos_label='positive')
                test_f1 = f1_score(y_test, pipeline.predict(X_test['clean_text']), pos_label='positive')
                train_accuracy = accuracy_score(y_train, pipeline.predict(X_train['clean_text']))
                test_accuracy = accuracy_score(y_test, pipeline.predict(X_test['clean_text']))

                # Log parameters for the individual run
                mlflow.log_params(params)

                # Log metrics for the individual run
                mlflow.log_metric("train_f1", train_f1)
                mlflow.log_metric("test_f1", test_f1)
                mlflow.log_metric("train_accuracy", train_accuracy)
                mlflow.log_metric("test_accuracy", test_accuracy)

                # Log the model for the individual run
                mlflow.sklearn.log_model(pipeline, "model")


********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Train F1 Score: 0.9579439252336448
Best Test F1 Score: 0.9572649572649573
Best Train Accuracy: 0.9260346345758732
Best Test Accuracy: 0.9237089201877934
********** decision_tree **********
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Train F1 Score: 0.9841508174841508
Best Test F1 Score: 0.9459728206827974
Best Train Accuracy: 0.9721162312885236
Best Test Accuracy: 0.9043427230046949
********** logistic_regression **********
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Train F1 Score: 0.9323818640675311
Best Test F1 Score: 0.934201507882111
Best Train Accuracy: 0.8859700616378046
Best Test Accuracy: 0.8873239436619719
********** random_forest **********
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Train F1 Score: 0.9840899625156185
Best Test F1 Score: 0.955078125
Best Train Accuracy: 0.9719694746110948
Best Test Accuracy: 0

In [17]:
# Load and evaluate the models from MLflow
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)

    # Load model from MLflow
    client = MlflowClient()
    runs = client.search_runs(experiment_ids=[583061336309648736], filter_string=f"tags.mlflow.runName = '{algo}'")
    run_id = runs[0].info.run_id
    model = mlflow.sklearn.load_model(f"runs:/{run_id}/model")

    # Measure prediction time
    start_time = time.time()

    y_test_pred = model.predict(X_test['clean_text'])

    # Calculate prediction time
    prediction_time = time.time() - start_time
    
    # Log prediction time to MLflow
    mlflow.log_metric("prediction_time", prediction_time)

    # Calculate and print evaluation metrics
    test_f1 = f1_score(y_test, y_test_pred, pos_label='positive')
    classification_rep = classification_report(y_test, y_test_pred)
    print("Test F1 Score:", test_f1)
    print("Classification Report:")
    print(classification_rep)
    print("Prediction Time:", prediction_time, "seconds")


********** naive_bayes **********


Downloading artifacts: 100%|█████████████████████| 9/9 [00:00<00:00, 512.20it/s]


Test F1 Score: 0.9572649572649573
Classification Report:
              precision    recall  f1-score   support

    negative       0.71      0.59      0.64       199
    positive       0.95      0.97      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.83      0.78      0.80      1704
weighted avg       0.92      0.92      0.92      1704

Prediction Time: 0.0038089752197265625 seconds
********** decision_tree **********


Downloading artifacts: 100%|█████████████████████| 9/9 [00:00<00:00, 506.53it/s]


Test F1 Score: 0.9459728206827974
Classification Report:
              precision    recall  f1-score   support

    negative       0.59      0.57      0.58       199
    positive       0.94      0.95      0.95      1505

    accuracy                           0.90      1704
   macro avg       0.77      0.76      0.76      1704
weighted avg       0.90      0.90      0.90      1704

Prediction Time: 0.0038459300994873047 seconds
********** logistic_regression **********


Downloading artifacts: 100%|█████████████████████| 9/9 [00:00<00:00, 502.83it/s]


Test F1 Score: 0.934201507882111
Classification Report:
              precision    recall  f1-score   support

    negative       0.51      0.75      0.61       199
    positive       0.96      0.91      0.93      1505

    accuracy                           0.89      1704
   macro avg       0.74      0.83      0.77      1704
weighted avg       0.91      0.89      0.90      1704

Prediction Time: 0.003634214401245117 seconds
********** random_forest **********


Downloading artifacts: 100%|█████████████████████| 9/9 [00:00<00:00, 188.88it/s]


Test F1 Score: 0.955078125
Classification Report:
              precision    recall  f1-score   support

    negative       0.72      0.50      0.59       199
    positive       0.94      0.97      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.83      0.74      0.77      1704
weighted avg       0.91      0.92      0.91      1704

Prediction Time: 0.025712251663208008 seconds
********** svm **********


Downloading artifacts: 100%|█████████████████████| 9/9 [00:00<00:00, 455.17it/s]

Test F1 Score: 0.9559870550161812
Classification Report:
              precision    recall  f1-score   support

    negative       0.76      0.46      0.57       199
    positive       0.93      0.98      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.85      0.72      0.76      1704
weighted avg       0.91      0.92      0.91      1704

Prediction Time: 0.053421974182128906 seconds





In [18]:
# # Sample prediction using a model
# model = mlflow.sklearn.load_model("runs:/1b4fddbdaf3b43f58b38914759d2071b/model")

new_data = [
    "The Product is ridiculously awesome"
]

new_data_clean = [preprocess_text(doc) for doc in new_data]

prediction = model.predict(new_data_clean)

print("Prediction:", prediction)


Prediction: ['positive']
