In [None]:
# Load the raw review export (Excel workbook) and preview the first rows.
import pandas as pd

reviews_xlsx = '/content/chatgpt_style_reviews_dataset.xlsx'
df = pd.read_excel(reviews_xlsx)
print(df.head())

In [None]:
# Structural overview of the freshly loaded frame.
# Only the LAST bare expression of a notebook cell is rendered, so the
# describe() table has to be shown explicitly or it is silently discarded.
df.info()               # Column names, non-null counts, data types (prints directly)
display(df.describe())  # Summary for numerical columns
df.isnull().sum()       # Count missing values per column (rendered as the cell output)


In [None]:
# Parse the 'date' column into datetimes; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
import pandas as pd

parsed_dates = pd.to_datetime(df['date'], errors='coerce')
df['date'] = parsed_dates

In [None]:

# Re-check the frame after the 'date' conversion above.
df.info()   # Column names, non-null counts, data types
df.describe()  # Summary for numerical columns (last expression, so it is rendered)

In [None]:
# Fetch the NLTK resources used by the preprocessing cell below:
# the stop-word list, WordNet for the lemmatizer and the POS-tagger model.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): both tagger resource names are requested — presumably to cover
# different NLTK versions; confirm which one the installed version needs.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')

In [None]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet



# Full English stop-word list; negation words are carved out of it below so
# that "not good" / "isn't good" keep their polarity after stop-word removal.
english_stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
negators = {"no","nor","not","don't","ain","aren","couldn","didn","doesn","hadn","hasn","haven","isn","mightn","mustn","needn","shan","shouldn","wasn","weren","won","wouldn"}
# preprocess_text keeps straight apostrophes, so contractions reach the
# stop-word filter as whole tokens ("isn't", "wasn't", ...). Those forms are in
# the NLTK list too and were being dropped; protect every "...n't" stop word
# and its bare stem (this also adds the previously missing "don").
contracted_negators = {word for word in english_stop_words if word.endswith("n't")}
negators |= contracted_negators | {word[:-2] for word in contracted_negators}
stop_words = english_stop_words - negators

def _to_wn(pos):
    """Translate a POS tag (as produced by ``pos_tag``) into a WordNet POS constant.

    Only the first character of the tag is inspected: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    initial = pos[0]
    return wordnet_by_initial.get(initial, wordnet.NOUN)

def preprocess_text(text):
    """Normalise one review into a space-joined string of lemmas.

    Steps: lower-case, replace everything except letters, whitespace and
    apostrophes with spaces, split on whitespace, trim quote marks from token
    edges, drop stop words (negators are kept), POS-tag and lemmatize.

    Args:
        text: Raw review. Missing values (None/NaN) yield ""; any other
            non-string value (e.g. a numeric spreadsheet cell) is converted
            with ``str`` instead of crashing on ``.lower()``.

    Returns:
        str: The cleaned review; "" when nothing survives the filtering.
    """
    if pd.isnull(text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Lower-case, then blank out everything except letters, whitespace and apostrophes.
    text = re.sub(r"[^a-zA-Z\s ']", " ", text.lower())

    # split() already collapses runs of whitespace. Apostrophes are kept for
    # contractions ("don't"), but quote marks wrapped around a word ("'great'")
    # would stop it matching the stop-word list, so trim them from token edges.
    tokens = [token.strip("'") for token in text.split()]

    # Drop empty tokens and stop words. The explicit negator check is kept on
    # purpose: a later cell rebinds the global `stop_words` to the full list.
    tokens = [
        token for token in tokens
        if token and (token not in stop_words or token in negators)
    ]

    # Lemmatize each token with the WordNet POS derived from its tag.
    tagged_tokens = pos_tag(tokens)
    return " ".join(lemmatizer.lemmatize(token, _to_wn(pos)) for token, pos in tagged_tokens)

# Run the cleaning pipeline on every raw review and store the result alongside it.
df['cleaned_reviews']= df['review'].apply (preprocess_text)


In [None]:
# Display the active stop-word set (negators already removed) for a manual check.
stop_words

In [None]:
# Replace missing titles and platforms with explicit placeholder values.
missing_value_defaults = {'title': 'No title', 'platform': 'Unknown'}
df = df.fillna(missing_value_defaults)

In [None]:
# Collapse individual storefronts into coarse channels.
platform_map = {
    "App Store": "Mobile",
    "Google Play": "Mobile",
    "Flipkart": "Web",
    "Amazon": "Web",
}

# Map each platform; anything without an entry in the map becomes 'Other'.
grouped = df['platform'].map(platform_map)
df['platform_grouped'] = grouped.fillna("Other")

# Quick check of the mapping and the resulting group sizes.
preview_columns = ['platform', 'platform_grouped']
print(df[preview_columns].head(10))
print(df['platform_grouped'].value_counts())


In [None]:
# create sentiment labels from ratings
def get_sentiment_label(rating):
    """Convert a numeric star rating into a sentiment label.

    Args:
        rating: Numeric rating; may be missing (None/NaN).

    Returns:
        'positive' for ratings >= 4, 'negative' for ratings <= 2, 'neutral'
        for anything in between, and NaN when the rating is missing. A missing
        rating previously fell through both comparisons and was labelled
        'neutral'.
    """
    if pd.isnull(rating):
        return float("nan")
    if rating >= 4:
        return 'positive'
    if rating <= 2:
        return 'negative'
    return 'neutral'
# Label every row from its star rating.
df['sentiment'] = df['rating'].apply(get_sentiment_label)

In [None]:

# Re-inspect the frame now that the cleaned text, platform group and
# sentiment columns have been added.
df.info()
df.head(3)

In [None]:
# Quick sanity statistics: row count, rating summary, label balance and the
# most frequent values of the optional categorical columns.
print("Rows:", len(df))
print("Rating stats:\n", df["rating"].describe())
print("\nSentiment distribution:\n", df["sentiment"].value_counts(dropna=False))

for column, heading in (("platform", "Platforms"), ("location", "Locations")):
    if column in df.columns:
        print(f"\n{heading}:\n", df[column].value_counts().head())

In [None]:
# Remove the user identifier before the data is saved.
# errors='ignore' makes the cell idempotent: the in-place drop raised a
# KeyError when the cell was run a second time on the same kernel.
df = df.drop(columns=['username'], errors='ignore')


df.head()

In [None]:

# Persist the processed frame (without the index) to the working directory.
processed_csv = "processed_cleaned_reviews.csv"
df.to_csv(processed_csv, index=False)
print(f"Saved: {processed_csv}")


Trained Machine Learning/DL Model for Sentiment Analysis
# **Installing requirements**

In [None]:

!pip -q install pandas numpy scikit-learn matplotlib plotly wordcloud nltk

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, auc, RocCurveDisplay,
    precision_recall_curve, average_precision_score
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize
from sklearn.dummy import DummyClassifier

import matplotlib.pyplot as plt
import plotly.express as px

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Choose the text column: prefer 'cleaned_reviews', otherwise fall back to the
# raw 'review' column; fail loudly when neither exists.
if "cleaned_reviews" in df.columns:
    text_col = "cleaned_reviews"
else:
    text_col = "review"
    if text_col not in df.columns:
        raise ValueError("Need a text column: 'cleaned_reviews' or 'review'")

# A label column is required: use 'sentiment' when present, otherwise derive
# it from 'rating'.
# NOTE(review): this fallback produces capitalized labels
# ("Positive"/"Neutral"/"Negative") while the rating-based cell earlier in the
# notebook produces lowercase ones — confirm which casing downstream code expects.
if "sentiment" not in df.columns:
    if "rating" not in df.columns:
        raise ValueError("Provide 'sentiment' or 'rating' to derive labels.")

    def assign_sentiment(r):
        """Map a rating to "Positive" (>= 4), "Negative" (<= 2) or "Neutral".

        Values that cannot be converted to float, and NaN, yield NaN so the
        row is removed by the later dropna().
        """
        try:
            r = float(r)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; a bare `except`
            # would also swallow KeyboardInterrupt and real bugs.
            return np.nan
        if r >= 4:
            return "Positive"
        if r <= 2:
            return "Negative"
        if 2 < r < 4:
            # Covers fractional ratings such as 2.5 or 3.5, which the former
            # `r == 3` test turned into NaN.
            return "Neutral"
        return np.nan  # r is NaN: every comparison above is False

    df["sentiment"] = df["rating"].apply(assign_sentiment)

# Keep only the text and label columns, discard rows missing either one, and
# make sure the text column holds strings.
model_columns = [text_col, "sentiment"]
df_text = df[model_columns].dropna().copy()
text_as_str = df_text[text_col].astype(str)
df_text[text_col] = text_as_str

# (optional) quick cleaning if you used raw 'review'
# NOTE(review): this rebinds the global `stop_words` to the FULL English list
# (negators included); preprocess_text reads the same global name.
stop_words = set(stopwords.words("english"))
lemm = WordNetLemmatizer()

def light_clean(x):
    """Lower-case `x`, keep letters only, drop stop words and lemmatize.

    Returns the surviving lemmas joined by single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", x.lower())
    kept = (tok for tok in letters_only.split() if tok not in stop_words)
    return " ".join(lemm.lemmatize(tok) for tok in kept)

# light_clean is the fallback for RAW text only. The condition used to be
# inverted: it re-cleaned the already preprocessed 'cleaned_reviews' column
# (light_clean removes the negators and apostrophes that preprocess_text
# deliberately keeps) and left raw 'review' text untouched.
if text_col == "review":
    df_text["text_proc"] = df_text[text_col].apply(light_clean)
else:
    # Already cleaned upstream: reuse as-is.
    df_text["text_proc"] = df_text[text_col]
# 'text_proc' now always exists, so downstream code can rely on a single column name.
use_col = "text_proc"


df_text.head(10)

In [None]:

# Sentiment analysis with NLTK's VADER analyzer
import joblib
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # lexicon required by SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# One polarity-score dict per cleaned review, stored in a new column.
# NOTE(review): the analyzer is fed the preprocessed text (lower-cased,
# punctuation and stop words removed) — confirm that scoring the raw 'review'
# text would not be more appropriate for VADER.
review_scores = df["cleaned_reviews"].apply(sia.polarity_scores)
df['polarity_scores'] = review_scores

# Preview the cleaned text next to its scores.
preview_columns = ['cleaned_reviews', 'polarity_scores']
print(df[preview_columns].head())

# Persist the analyzer object to disk.
joblib.dump(sia, 'sentiment_analyzer.joblib')


In [None]:

# NOTE(review): this redefines get_sentiment_label, shadowing the rating-based
# function of the same name defined earlier in the notebook; re-running that
# earlier apply cell after this one would call this version with a number.
def get_sentiment_label(score):
    """Turn a polarity-score mapping into a label using its 'compound' value.

    compound >= 0.05 -> "Positive", compound <= -0.05 -> "Negative",
    anything in between -> "Neutral".
    """
    compound = score['compound']
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# Label every review from its polarity scores.
# NOTE(review): this overwrites the 'sentiment' column that was derived from
# 'rating' earlier in the notebook.
df['sentiment'] = df['polarity_scores'].apply(get_sentiment_label)


In [None]:

# Preview the frame with the polarity scores and the re-derived sentiment column.
df.head()


In [None]:

# Save the updated frame (now including polarity scores and the new labels).
cleaned_csv = "cleaned_reviews.csv"
df.to_csv(cleaned_csv, index=False)
print(f"Saved: {cleaned_csv}")

In [None]:
# List the columns available in the modelling frame.
print(df_text.columns)







In [None]:
# Report sentiment classes with fewer than two rows.
# Nothing is removed from df_text here; the classes are only listed.
sentiment_counts = df_text["sentiment"].value_counts()
removed_classes = sentiment_counts[sentiment_counts < 2]
print("Removed classes:", removed_classes.index.tolist())

Logistic regression pipeline



In [None]:
# balance_and_train.py
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [None]:

# ---------- LOAD your CSV ----------
# Read the file from the same relative path the preprocessing section wrote it
# to. The previous absolute '/content/...' path only worked when the working
# directory happened to be /content.
df = pd.read_csv('processed_cleaned_reviews.csv')
# Keep the text and label columns ('positive' / 'neutral' / 'negative') and
# drop rows where either is missing.
df = df[['cleaned_reviews','sentiment']].dropna()

In [None]:
# ---------- Inspect counts (optional) ----------
# Class sizes before any balancing is applied.
print("Before:", df['sentiment'].value_counts())


In [None]:
# ---------- UPSAMPLE to balance ----------
# Every class is brought up to the size of the largest one by sampling its
# rows with replacement; classes already at that size are kept unchanged.
max_count = df['sentiment'].value_counts().max()

balanced_parts = [
    group if len(group) >= max_count
    else resample(group, replace=True, n_samples=max_count, random_state=42)
    for _, group in df.groupby('sentiment')
]

# Stack the per-class frames, shuffle the rows and renumber the index.
stacked = pd.concat(balanced_parts)
df_balanced = stacked.sample(frac=1, random_state=42).reset_index(drop=True)

print("After:", df_balanced['sentiment'].value_counts())


In [None]:
# ---------- Train / Test split ----------
# Split the ORIGINAL reviews, not `df_balanced`: that frame was upsampled with
# replacement, so splitting it puts copies of the same review in both train
# and test and inflates every test metric (data leakage).
X = df['cleaned_reviews'].astype(str)
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Balance the classes inside the training split only; the test split keeps the
# real class distribution and contains no duplicated rows.
train_frame = pd.DataFrame({'cleaned_reviews': X_train, 'sentiment': y_train})
train_max_count = train_frame['sentiment'].value_counts().max()
train_parts = [
    part if len(part) >= train_max_count
    else resample(part, replace=True, n_samples=train_max_count, random_state=42)
    for _, part in train_frame.groupby('sentiment')
]
train_balanced = (
    pd.concat(train_parts)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
X_train = train_balanced['cleaned_reviews']
y_train = train_balanced['sentiment']

In [None]:
# ---------- Vectorize ----------
# Unigram + bigram TF-IDF features with the vocabulary capped at 20,000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
X_train_vec = vectorizer.fit_transform(X_train)  # fit on the training text only
X_test_vec = vectorizer.transform(X_test)        # reuse the fitted vocabulary

In [None]:
# ---------- Model (Logistic Regression) ----------
# Logistic regression using the liblinear solver, at most 1000 iterations and
# a fixed random seed.
# NOTE(review): the labels appear to have three classes — confirm that the
# installed scikit-learn version still accepts liblinear for multi-class
# targets without a deprecation warning.
model = LogisticRegression(max_iter=1000, class_weight=None, solver='liblinear', random_state=42)
# (class_weight=None because we've balanced data; if you DON'T balance, use class_weight='balanced')

In [None]:
# Quick 5-fold cross-validation (accuracy) on the training features.
# NOTE(review): the training rows were upsampled with replacement, so copies of
# one review can land in different folds — treat these scores as optimistic.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train_vec,
    y=y_train,
    cv=5,
    scoring='accuracy',
)
print("CV accuracy (train):", cv_scores.mean(), cv_scores)

In [None]:

# fit final model
# Trains on the whole training split; the fitted `model` is what the
# evaluation and save cells below use.
model.fit(X_train_vec, y_train)

In [None]:
# ---------- Evaluation ----------
# Predict on the held-out split, then print per-class metrics and the confusion matrix.
y_pred = model.predict(X_test_vec)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print("Classification report (test):\n", test_report)
print("Confusion matrix:\n", test_confusion)

In [None]:
# ---------- Save model + vectorizer ----------
# Both objects are required at inference time: the vectorizer turns text into
# the feature space the classifier was trained on.
artifacts = (
    ("model", model, "text_classifier_balanced.joblib"),
    ("vectorizer", vectorizer, "vectorizer_balanced.joblib"),
)
for _, fitted_object, file_name in artifacts:
    joblib.dump(fitted_object, file_name)
for label, _, file_name in artifacts:
    print(f"Saved {label} -> {file_name}")


# **Checking: rule-based word- and sentence-level sentiment (VADER + custom word lists)**

In [None]:
# NOTE(review): nltk is already installed and imported earlier in this
# notebook; presumably kept so this section can be run on its own — confirm.
!pip install nltk

In [None]:
# Install libraries
#!pip install nltk

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER needs its lexicon downloaded before the analyzer can be created.
nltk.download('vader_lexicon')

# Analyzer used as the fallback for words not covered by the custom lists.
sid = SentimentIntensityAnalyzer()

# Hand-curated word lists. word_sentiment consults these before VADER.
custom_positive = {
    "amazing", "awesome", "best", "better", "excellent",
    "fantastic", "fast", "good", "great", "love",
    "nice", "perfect", "smooth", "super", "wonderful",
}

custom_negative = {
    "bad", "bug", "bugs", "crash", "crashes", "error",
    "fail", "failed", "issue", "issues", "lag", "laggy",
    "poor", "problem", "problems", "slow", "terrible", "worst",
}

custom_neutral = {
    "acceptable", "average", "decent", "fine", "normal",
    "ok", "okay", "overall", "regular", "usual",
}

# -----------------------------
# WORD SENTIMENT FUNCTION
# -----------------------------
def word_sentiment(text):
    """Label each whitespace-separated word of `text`.

    Each word is lower-cased and stripped of leading/trailing ",.!?" before
    lookup. The custom lists win in the order neutral, positive, negative;
    otherwise the VADER compound score decides (> 0.05 positive,
    < -0.05 negative, else neutral).

    Returns:
        dict: original (unnormalised) word -> "Positive" / "Negative" /
        "Neutral". A word that occurs several times appears once.
    """
    lexicon_order = (
        (custom_neutral, "Neutral"),
        (custom_positive, "Positive"),
        (custom_negative, "Negative"),
    )
    results = {}

    for raw_word in text.split():
        normalised = raw_word.lower().strip(",.!?")

        # First custom list that contains the word decides the label.
        label = next(
            (tag for lexicon, tag in lexicon_order if normalised in lexicon),
            None,
        )

        if label is None:
            # VADER fallback
            compound = sid.polarity_scores(normalised)["compound"]
            if compound > 0.05:
                label = "Positive"
            elif compound < -0.05:
                label = "Negative"
            else:
                label = "Neutral"

        results[raw_word] = label

    return results

# -----------------------------
# FINAL SENTENCE SENTIMENT
# -----------------------------
def final_sentiment(text):
    """Classify a sentence by majority vote over its word-level labels.

    Args:
        text: Sentence to classify.

    Returns:
        "Positive" when more words are positive than negative, "Negative" for
        the reverse, and "Neutral" on a tie (including empty input).
    """
    labels = word_sentiment(text)

    # word_sentiment returns a dict keyed by the raw word, so a repeated word
    # shows up only once there. Count per occurrence in the sentence instead,
    # otherwise "bad bad bad good great" is scored 1 negative vs 2 positive.
    occurrences = [labels.get(word) for word in text.split()]
    pos = sum(1 for label in occurrences if label == "Positive")
    neg = sum(1 for label in occurrences if label == "Negative")

    # Rule-based hybrid
    if pos > neg:
        return "Positive"
    if neg > pos:
        return "Negative"
    return "Neutral"


# -----------------------------
# RUN & ENTER SENTENCE
# -----------------------------
# NOTE: input() blocks until a sentence is typed, so this cell pauses "Run All".
sentence = input("Enter a sentence: ")

reports = (
    ("\nWORD-LEVEL SENTIMENT:", word_sentiment),
    ("\nFINAL SENTENCE SENTIMENT:", final_sentiment),
)
for heading, analyse in reports:
    print(heading)
    print(analyse(sentence))


In [None]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
# NOTE(review): no cell after this one in the visible notebook reads from the mount.
from google.colab import drive
drive.mount('/content/drive')