In [7]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Fetch the NLTK corpora used by the preprocessing step below:
# the stop-word lists and the WordNet data needed by the lemmatizer.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Read the raw reviews into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs unchanged on the machine that has this file at this location.
DATA_PATH = r"C:\Users\keert\Downloads\reviews_badminton\data.csv"
df = pd.read_csv(DATA_PATH)

# Holds the NLTK objects shared by every preprocess_text call. They are built
# on first use instead of once per row, which is what made the original slow.
_PREPROCESS_CACHE = {}


def _get_nltk_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if 'resources' not in _PREPROCESS_CACHE:
        # Build both objects before storing so a failure never leaves a
        # half-initialised cache behind.
        _PREPROCESS_CACHE['resources'] = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _PREPROCESS_CACHE['resources']


def preprocess_text(text):
    """Clean a single review and return it as a space-separated token string.

    Steps, in order: convert to ``str`` and lowercase; delete every character
    that is not an ASCII letter or whitespace; split on whitespace; drop
    English stop words; lemmatize each remaining token; join with spaces.

    Parameters
    ----------
    text : object
        The raw review. Any value for which ``pd.notnull`` is false
        (``None``, ``NaN``, ...) is treated as a missing review.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is missing.
    """
    # Missing reviews short-circuit before any NLTK resource is touched.
    if pd.isnull(text):
        return ''

    stop_words, lemmatizer = _get_nltk_resources()

    # str() first: the column may contain non-string values (e.g. numbers).
    lowered = str(text).lower()

    # Characters are removed, not replaced by a space, so "didn't" -> "didnt".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )

# Clean every review; missing reviews come back from preprocess_text as ''.
df['Cleaned Text'] = df['Review text'].apply(preprocess_text)

# Ratings at or above this value are labelled positive (1), all others negative (0).
threshold = 3

# Vectorized comparison instead of a per-row lambda. A NaN rating compares
# False and therefore maps to 0, exactly as the lambda did.
df['Sentiment'] = (df['Ratings'] >= threshold).astype('int64')

# Quick visual check that the new columns were added.
print(df.head())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned Text'], df['Sentiment'], test_size=0.2, random_state=42
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


            Reviewer Name               Review Title  \
0            Kamal Suresh               Nice product   
1       Flipkart Customer     Don't waste your money   
2  A. S. Raja Srinivasan   Did not meet expectations   
3     Suresh Narayanasamy                       Fair   
4               ASHIK P A                Over priced   

               Place of Review  Up Votes  Down Votes     Month  \
0   Certified Buyer, Chirakkal     889.0        64.0  Feb 2021   
1   Certified Buyer, Hyderabad     109.0         6.0  Feb 2021   
2  Certified Buyer, Dharmapuri      42.0         3.0  Apr 2021   
3     Certified Buyer, Chennai      25.0         1.0       NaN   
4                          NaN     147.0        24.0  Apr 2016   

                                         Review text  Ratings  \
0  Nice product, good quality, but price is now r...        4   
1  They didn't supplied Yonex Mavis 350. Outside ...        1   
2  Worst product. Damaged shuttlecocks packed in ...        1   
3  Qui

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Hold out 20% of the cleaned reviews for evaluation (fixed seed => repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned Text'], df['Sentiment'], test_size=0.2, random_state=42
)

# Candidate classifiers. The random forest is seeded so that its score, and
# therefore the "best model" decision below, is the same on every run.
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC())
]

# Give each classifier its own TF-IDF -> model pipeline, so the vectorizer is
# fitted on the training text only when the pipeline is fitted.
pipelines = []
for model_name, model in models:
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('model', model)
    ])
    pipelines.append((model_name, pipeline))

# Fit every pipeline and record its weighted F1 on the held-out reviews.
results = {}
for model_name, pipeline in pipelines:
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[model_name] = f1

# Pick the model with the highest weighted F1.
# NOTE(review): the winner is chosen on the same held-out set that is used to
# report its score, so that score is an optimistic estimate.
best_model = max(results, key=results.get)
print("Best Model:", best_model)


Best Model: Logistic Regression


In [9]:
# Score the pipelines that the previous cell already fitted. They are NOT
# refitted here: refitting repeats the most expensive step of the notebook and
# can yield scores that disagree with the "Best Model" printed above.
results = {}
for model_name, pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[model_name] = f1

# Print F1-scores of all models
print("F1-scores of all models:")
for model_name, f1 in results.items():
    print(f"{model_name}: {f1}")

# Pick the model with the highest weighted F1.
best_model = max(results, key=results.get)
print("Best Model:", best_model)


F1-scores of all models:
Random Forest: 0.8976280869047202
Logistic Regression: 0.9065500042218775
Support Vector Machine: 0.901545417616285
Best Model: Logistic Regression


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Learn the TF-IDF vocabulary from the training reviews and vectorize them in one step.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# fit() returns the estimator itself, so construction and training are chained.
logistic_regression_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Vectorize the held-out reviews with the already-fitted vocabulary, then classify them.
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = logistic_regression_model.predict(X_test_tfidf)

# Persist both fitted objects; the relative file names resolve against the
# current working directory.
for fitted_object, filename in (
    (logistic_regression_model, "logistic_regression_model.pkl"),
    (tfidf_vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(fitted_object, filename)

# Overall accuracy on the held-out reviews.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9196009389671361
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.39      0.53       199
           1       0.92      0.99      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.88      0.69      0.74      1704
weighted avg       0.91      0.92      0.91      1704

