In [2]:
import zipfile

zip_path = "reviews_data_dump.zip"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("flipkart_reviews")


In [7]:
import pandas as pd
import os

extracted_base_path = "flipkart_reviews"

# List all items in the extracted base path
if os.path.exists(extracted_base_path):
    print(f"Contents of '{extracted_base_path}':")
    items_in_base = os.listdir(extracted_base_path)
    for item in items_in_base:
        print(f"- {item}")

    dfs = {}
    for category_dir in items_in_base:
        category_path = os.path.join(extracted_base_path, category_dir)
        # Check if it's a directory and attempt to find a CSV inside it
        if os.path.isdir(category_path):
            csv_files_found = [f for f in os.listdir(category_path) if f.endswith('.csv')]

            if csv_files_found:
                # Assuming there's only one relevant CSV per directory, or taking the first one
                csv_file_name = csv_files_found[0]
                csv_file_path = os.path.join(category_path, csv_file_name)

                print(f"\nAttempting to read CSV from: {csv_file_path}")
                try:
                    df_category = pd.read_csv(csv_file_path)
                    dfs[category_dir] = df_category
                    print(f"Successfully loaded '{category_dir}' reviews. Head of DataFrame:")
                    display(df_category.head())
                except Exception as e:
                    print(f"Error reading {csv_file_path}: {e}")
            else:
                print(f"No CSV files found within '{category_path}'.")
        else:
            print(f"Skipping non-directory item: {category_dir}")

    if dfs:
        print("\nAll successfully loaded DataFrames:")
        for name in dfs.keys():
            print(f"- {name} (shape: {dfs[name].shape})")
        # Optionally, concatenate all DataFrames into a single one
        # all_reviews_df = pd.concat(dfs.values(), ignore_index=True)
        # print("\nCombined DataFrame head:")
        # display(all_reviews_df.head())
        # print(f"Combined DataFrame shape: {all_reviews_df.shape}")
    else:
        print("No CSV files were successfully loaded.")

else:
    print(f"Error: Base extraction directory '{extracted_base_path}' not found.")


Contents of 'flipkart_reviews':
- reviews_badminton
- reviews_tea
- reviews_tawa

Attempting to read CSV from: flipkart_reviews/reviews_badminton/data.csv
Successfully loaded 'reviews_badminton' reviews. Head of DataFrame:


Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust √¢?¬π620 ..from retailer.I didn'...,1



Attempting to read CSV from: flipkart_reviews/reviews_tea/data.csv
Successfully loaded 'reviews_tea' reviews. Head of DataFrame:


Unnamed: 0,reviewer_name,reviewer_rating,review_title,review_text,place_of_review,Date_of_review,up_votes,Down_votes
0,Subhro Banerjee,5,Worth every penny,Great product ü§ó with great deals üòçüòç Tata Tea G...,"Certified Buyer, Budge Budge",Subhro Banerjee,236,59
1,Shiv chandra Jha,5,Great product,Very nice and super qwality tea taste are grea...,"Certified Buyer, Saharsa",Shiv chandra Jha,225,79
2,Flipkart Customer,5,Highly recommended,Great test great quality great price point tim...,"Certified Buyer, Sri Ganganagar",Flipkart Customer,89,27
3,DTH Y,4,Very Good,Nice üòäREAD MORE,"Certified Buyer, Phaltan",DTH Y,30,6
4,Bhavesh Godhani,5,Classy product,Very Good Tata tea product.READ MORE,"Certified Buyer, Ahmedabad",Bhavesh Godhani,69,22



Attempting to read CSV from: flipkart_reviews/reviews_tawa/data.csv
Successfully loaded 'reviews_tawa' reviews. Head of DataFrame:


Unnamed: 0,Reviewer_Name,Reviewer_Rating,Review_Title,Review_Text,Place_of_Review,Date_of_Review,Up_Votes,Down_Votes
0,Sumit Kumar,5.0,Wonderful,I think In this price category it's best dosa ...,"Certified Buyer, Lakhisarai",Sumit Kumar,211,39
1,BHARAT GALAGALI,5.0,Mind-blowing purchase,perfect tawa for Dosa..READ MORE,"Certified Buyer, Hunsur",BHARAT GALAGALI,107,17
2,Paramjeet Singh,5.0,Awesome,Excellent tawa. Made Paneer Tikka on first day...,"Certified Buyer, Rampura Phul",Paramjeet Singh,59,8
3,Virendra Kumar,5.0,Great product,Nice üôÇ productREAD MORE,"Certified Buyer, Chengalpattu District",Virendra Kumar,77,12
4,jyoti solanki,5.0,Best in the market!,Delivery man is also good....READ MORE,"Certified Buyer, Mumbai",jyoti solanki,53,7



All successfully loaded DataFrames:
- reviews_badminton (shape: (8518, 8))
- reviews_tea (shape: (9170, 8))
- reviews_tawa (shape: (2531, 8))


In [9]:
#STEP 1: DATA LOADING (Badminton Only)
import pandas as pd

# The DataFrame for 'reviews_badminton' is already loaded and stored in the 'dfs' dictionary
# from the previous cell. We can directly access it.
if 'reviews_badminton' in dfs:
    df = dfs['reviews_badminton']
    df = df[['Review text', 'Ratings']]
    df.columns = ['review_text', 'rating']

    print("Successfully accessed 'reviews_badminton' DataFrame. Head of DataFrame:")
    display(df.head())
else:
    print("Error: 'reviews_badminton' DataFrame not found in 'dfs'. Please ensure the previous cells were executed successfully.")


Successfully accessed 'reviews_badminton' DataFrame. Head of DataFrame:


Unnamed: 0,review_text,rating
0,"Nice product, good quality, but price is now r...",4
1,They didn't supplied Yonex Mavis 350. Outside ...,1
2,Worst product. Damaged shuttlecocks packed in ...,1
3,"Quite O. K. , but nowadays the quality of the...",3
4,Over pricedJust √¢?¬π620 ..from retailer.I didn'...,1


In [12]:
#STEP 2: DATA CLEANING & LABEL CREATION
# Remove neutral reviews and create safe copy
df = df[df['rating'] != 3].copy()

# Sentiment labeling
df['sentiment'] = df['rating'].apply(lambda x: 1 if x >= 4 else 0)

# Text preprocessing
df['clean_review'] = df['review_text'].apply(preprocess)


In [13]:
#STEP 3: TEXT PREPROCESSING
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(words)

df['clean_review'] = df['review_text'].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
#STEP 4: FEATURE EXTRACTION
#Bag of Words (BoW)
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(max_features=5000)
X_bow = bow.fit_transform(df['clean_review'])


#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['clean_review'])


In [15]:
#STEP 5: TRAIN-TEST SPLIT
from sklearn.model_selection import train_test_split

y = df['sentiment']

Xb_train, Xb_test, y_train, y_test = train_test_split(
    X_bow, y, test_size=0.2, random_state=42
)

Xt_train, Xt_test, _, _ = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)


### STEP 6: TRAIN MULTIPLE ALGORITHMS

In [16]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

lr = LogisticRegression(max_iter=1000)
lr.fit(Xt_train, y_train)
lr_pred = lr.predict(Xt_test)

lr_f1 = f1_score(y_test, lr_pred)


In [17]:
#Naive Bayes
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(Xb_train, y_train)
nb_pred = nb.predict(Xb_test)

nb_f1 = f1_score(y_test, nb_pred)


In [18]:
#Support Vector Machine (SVM)
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(Xt_train, y_train)
svm_pred = svm.predict(Xt_test)

svm_f1 = f1_score(y_test, svm_pred)


In [19]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200)
rf.fit(Xt_train, y_train)
rf_pred = rf.predict(Xt_test)

rf_f1 = f1_score(y_test, rf_pred)


In [20]:
#STEP 7: MODEL COMPARISON
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Naive Bayes', 'SVM', 'Random Forest'],
    'F1 Score': [lr_f1, nb_f1, svm_f1, rf_f1]
})

results.sort_values(by='F1 Score', ascending=False)


Unnamed: 0,Model,F1 Score
2,SVM,0.96259
3,Random Forest,0.959198
0,Logistic Regression,0.958126
1,Naive Bayes,0.957929


In [21]:
#HYPERPARAMETER TUNING (HPT) ‚Äî ALL MODELS
X = df['clean_review']
y = df['sentiment']


In [22]:
#1. SVM ‚Äî Hyperparameter Tuning
#Pipeline + Grid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', LinearSVC())
])

svm_params = {
    'tfidf__max_features': [3000, 5000, 8000],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'svm__C': [0.01, 0.1, 1, 10]
}

svm_grid = GridSearchCV(
    svm_pipeline,
    svm_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2
)

svm_grid.fit(X, y)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [23]:
svm_grid.best_score_
svm_grid.best_params_


{'svm__C': 1, 'tfidf__max_features': 8000, 'tfidf__ngram_range': (1, 2)}

In [24]:
#2. LOGISTIC REGRESSION ‚Äî HPT
from sklearn.linear_model import LogisticRegression

lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=1000))
])

lr_params = {
    'tfidf__max_features': [3000, 5000, 8000],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'lr__C': [0.01, 0.1, 1, 10],
    'lr__penalty': ['l2']
}

lr_grid = GridSearchCV(
    lr_pipeline,
    lr_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2
)

lr_grid.fit(X, y)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [25]:
lr_grid.best_score_
lr_grid.best_params_

{'lr__C': 10,
 'lr__penalty': 'l2',
 'tfidf__max_features': 5000,
 'tfidf__ngram_range': (1, 2)}

In [26]:
#3. NAIVE BAYES ‚Äî HPT
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

nb_pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('nb', MultinomialNB())
])

nb_params = {
    'bow__max_features': [3000, 5000, 8000],
    'bow__ngram_range': [(1,1), (1,2)],
    'nb__alpha': [0.1, 0.5, 1.0]
}

nb_grid = GridSearchCV(
    nb_pipeline,
    nb_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2
)

nb_grid.fit(X, y)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [27]:
nb_grid.best_score_
nb_grid.best_params_


{'bow__max_features': 8000, 'bow__ngram_range': (1, 2), 'nb__alpha': 0.5}

In [28]:
#4. RANDOM FOREST ‚Äî HPT
from sklearn.ensemble import RandomForestClassifier

rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('rf', RandomForestClassifier())
])

rf_params = {
    'rf__n_estimators': [100, 300],
    'rf__max_depth': [None, 20, 50],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2]
}

rf_grid = GridSearchCV(
    rf_pipeline,
    rf_params,
    scoring='f1',
    cv=3,      # reduce CV due to cost
    n_jobs=-1,
    verbose=2
)

rf_grid.fit(X, y)


Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [29]:
rf_grid.best_score_
rf_grid.best_params_

{'rf__max_depth': None,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 5,
 'rf__n_estimators': 300}

In [30]:
import pandas as pd

final_results = pd.DataFrame({
    'Model': [
        'Naive Bayes (Tuned)',
        'Logistic Regression (Tuned)',
        'Random Forest (Tuned)',
        'SVM (Tuned)'
    ],
    'F1 Score': [
        nb_grid.best_score_,
        lr_grid.best_score_,
        rf_grid.best_score_,
        svm_grid.best_score_
    ]
})

final_results.sort_values(by='F1 Score', ascending=False)


Unnamed: 0,Model,F1 Score
3,SVM (Tuned),0.961316
1,Logistic Regression (Tuned),0.961111
0,Naive Bayes (Tuned),0.959886
2,Random Forest (Tuned),0.956584


In [31]:
#STEP 6: SAVE FINAL MODEL
import pickle
pickle.dump(svm_grid.best_estimator_, open("badminton_sentiment_model.pkl", "wb"))



In [34]:
sample = ["The badminton racket quality is excellent"]

# Use the best estimator directly from svm_grid, as it's still in scope
prediction = svm_grid.best_estimator_.predict(sample)
prediction

array([1])

In [37]:
import pickle

model = pickle.load(open("badminton_sentiment_model.pkl", "rb"))
print(type(model))


<class 'sklearn.pipeline.Pipeline'>


In [38]:
def predict_sentiment(review):
    pred = model.predict([review])[0]
    return "Positive" if pred == 1 else "Negative"


In [40]:
predict_sentiment("Very good shuttle, worth the price")

'Positive'

In [41]:
predict_sentiment("Worst quality, breaks easily")

'Negative'