<a href="https://colab.research.google.com/github/virtikam-commits/vartika-Mishra/blob/main/sentiment_Analysis_on_Product_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Step 1: Hugging Face और Language Detection के लिए ज़रूरी libraries import करना
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from langdetect import detect
import re
import emoji


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m522.2/981.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=1da0fd6ffd10d9e0199dea10c5ce5196b9acfbdc93893b7b38b6911078960e21
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [3]:
pip install emoji

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0


In [5]:
# Step 2: Multilingual Sentiment Analysis के लिए pretrained model का नाम चुनना
MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment"


In [8]:
# Step 3: Text को clean करने के लिए function (unwanted spaces हटाना, emojis handle करना)
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)             # अगर text string नहीं है तो string बना दो
    text = text.strip()              # starting/ending spaces हटाना
    text = re.sub(r"\s+", " ", text) # multiple spaces को single space करना
    text = emoji.demojize(text)      # emojis को text में बदलना (e.g., 😍 -> :smiling_face_with_heart_eyes:)
    return text


'cardiffnlp/twitter-xlm-roberta-base-sentiment'

In [9]:
# Step 4: Safe language detection function (error आए तो 'unknown' return करे)
def detect_lang_safe(text):
    try:
        lang = detect(text)
        return lang
    except:
        return "unknown"


In [11]:
# Step 5: Model और tokenizer load करना
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)


Loading model...


In [12]:
# Step 6: Sentiment analysis pipeline बनाना
sentiment_pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, return_all_scores=True)


Device set to use cpu


In [13]:
# Step 7: Model के label को readable format में map करना
label_map = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive"
}


In [14]:
# Step 8: Reviews analyze करने का function
def analyze_reviews(reviews):
    results = []
    cleaned = [clean_text(r) for r in reviews]          # हर review clean करना
    pipe_out = sentiment_pipe(cleaned, truncation=True) # batch में prediction लेना
    for orig, ctext, pout in zip(reviews, cleaned, pipe_out):
        best = max(pout, key=lambda x: x['score'])      # सबसे high score वाला sentiment चुनना
        label = label_map.get(best['label'], best['label'])
        score = float(best['score'])
        lang = detect_lang_safe(ctext)
        results.append({
            "original": orig,
            "cleaned": ctext,
            "lang_detected": lang,
            "predicted_label": label,
            "score": score,
            "all_scores": { label_map.get(x['label'], x['label']): float(x['score']) for x in pout }
        })
    return results


In [17]:
# Step 9: Sample product reviews (Hindi + English + Hinglish)
sample_reviews = [
    "This phone is amazing, battery life is excellent!",
    "फोन की बैटरी बहुत खराब है, पैसो का नुकसान।",
    "Camera thoda average hai but value for money.",
    "Not satisfied. The product stopped working in 2 days.",
    "बहुत बढ़िया! बिल्कुल recommend करूँगा।",
    "delivery late, but product ok.",
    "Achha hai 👍",
    "Product sahi hai, par packaging kharab thi.",
    "Worst experience ever!!!",
    "बहुत घटिया quality. बिलकुल नहीं लेना चाहिए।",
    "I loved it 😍 best purchase",
    "price zyada hai lekin quality theek hai",
    "ye product mast he, ekdum badiya",
    "ye ulaa is the best web browser hai. ",
    "isko zoho company ne banayi hai kam lagta me .",
    "indai ka first web browser hai .",
    "ye google chrome se fast hai .",
    "google chrome ka ab india se bye bye ."
]


In [18]:
# Step 10: Analysis run करना और output print करना
analysis = analyze_reviews(sample_reviews)

import json
print(json.dumps(analysis, ensure_ascii=False, indent=2))


[
  {
    "original": "This phone is amazing, battery life is excellent!",
    "cleaned": "This phone is amazing, battery life is excellent!",
    "lang_detected": "en",
    "predicted_label": "positive",
    "score": 0.9263970851898193,
    "all_scores": {
      "negative": 0.020413395017385483,
      "neutral": 0.05318951606750488,
      "positive": 0.9263970851898193
    }
  },
  {
    "original": "फोन की बैटरी बहुत खराब है, पैसो का नुकसान।",
    "cleaned": "फोन की बैटरी बहुत खराब है, पैसो का नुकसान।",
    "lang_detected": "hi",
    "predicted_label": "negative",
    "score": 0.9195963740348816,
    "all_scores": {
      "negative": 0.9195963740348816,
      "neutral": 0.0632391944527626,
      "positive": 0.01716446876525879
    }
  },
  {
    "original": "Camera thoda average hai but value for money.",
    "cleaned": "Camera thoda average hai but value for money.",
    "lang_detected": "en",
    "predicted_label": "positive",
    "score": 0.49968525767326355,
    "all_scores": {
 