<a href="https://colab.research.google.com/github/yasaswiyash18/SENTIMENT_ANALYSIS/blob/main/CodTech_Task_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns
import matplotlib.pyplot as plt
import re
import logging

In [2]:
# Fetch the NLTK data packages this notebook asks for. The captured output below
# shows the calls report "already up-to-date" when a package is present locally.
# NOTE(review): no punkt-based tokenizer call is visible in this notebook
# (tokenization uses RegexpTokenizer) — confirm whether 'punkt' is still needed.
nltk.download('punkt')
# Read by stopwords.words('english') in SentimentAnalyzer.__init__.
nltk.download('stopwords')
# Presumably the corpus WordNetLemmatizer depends on — TODO confirm.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
class SentimentAnalyzer:
    """Three-class (negative / neutral / positive) sentiment classifier for short reviews.

    Text is normalised by ``preprocess_text`` and then fed to a scikit-learn
    pipeline (TF-IDF features + logistic regression) built in ``train_model``.

    Attributes:
        logger: Module-level logger obtained in ``__init__``.
        tokenizer: ``RegexpTokenizer`` splitting on runs of word characters.
        lemmatizer: ``WordNetLemmatizer`` applied to every kept token.
        stop_words: Set of English stop words removed during preprocessing.
        model: The fitted pipeline, or ``None`` until ``train_model`` is called.
    """

    # Tokens kept by preprocess_text even when they are listed in
    # ``self.stop_words`` or are shorter than three characters. Negations flip
    # the meaning of a review ("not worth the money" vs. "worth the money"),
    # so discarding them removes exactly the signal the classifier needs.
    NEGATION_WORDS = frozenset({'no', 'nor', 'not'})

    def __init__(self):
        """Set up logging and the NLTK helpers; the model starts untrained."""
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.model = None

    def create_sample_data(self):
        """Return a small demo DataFrame with ``text`` and ``rating`` columns.

        Ten hand-written reviews are repeated five times (50 rows). After
        ``prepare_data`` maps ratings to labels the classes are NOT balanced:
        25 positive, 20 negative and 5 neutral rows. Because the rows are exact
        repeats, a random train/test split of this frame will place copies of
        the same review on both sides.
        """
        reviews = [
            "This movie was fantastic! Great acting and storyline",
            "Terrible waste of time. Poor acting and boring plot",
            "Really enjoyed this film, would watch again",
            "Not worth the money, very disappointed",
            "Average movie, nothing special but okay",
            "One of the best movies I've ever seen",
            "Complete disaster, avoid at all costs",
            "Pretty good entertainment value",
            "Absolutely loved every minute of it",
            "Could have been better, somewhat disappointing"
        ] * 5  # Repeat to enlarge the toy dataset
        ratings = [5, 1, 4, 1, 3, 5, 1, 4, 5, 2] * 5  # Same repetition keeps ratings aligned with reviews
        return pd.DataFrame({'text': reviews, 'rating': ratings})

    def preprocess_text(self, text):
        """Normalise one review into a space-joined string of lemmatised tokens.

        Steps: lower-case, strip every character that is not an ASCII letter
        or whitespace, tokenise, drop stop words and tokens of one or two
        characters (negations in ``NEGATION_WORDS`` are always kept), then
        lemmatise what is left.

        Args:
            text: The raw review. Non-string values are converted with
                ``str``; missing values (``None``, NaN, ``pd.NA``) yield ``''``
                instead of the literal tokens "none" / "nan".

        Returns:
            The cleaned text as a single string (possibly empty).
        """
        # is_scalar guards pd.isna, which returns an array for list-like input.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())
        kept = [
            token for token in self.tokenizer.tokenize(letters_only)
            if token in self.NEGATION_WORDS
            or (token not in self.stop_words and len(token) > 2)
        ]
        return ' '.join(self.lemmatizer.lemmatize(token) for token in kept)

    @staticmethod
    def _rating_to_sentiment(rating):
        """Map a numeric rating to a label: <= 2 negative, >= 4 positive, else neutral."""
        if rating <= 2:
            return 'negative'
        if rating >= 4:
            return 'positive'
        return 'neutral'

    def prepare_data(self, df):
        """Return a copy of ``df`` with ``processed_text`` and ``sentiment`` columns added.

        Args:
            df: DataFrame with a ``text`` column and a numeric ``rating`` column.

        Returns:
            A new DataFrame; the caller's frame is left untouched so the call
            can be repeated safely (e.g. when a notebook cell is re-run).
        """
        prepared = df.copy()
        prepared['processed_text'] = prepared['text'].apply(self.preprocess_text)
        prepared['sentiment'] = prepared['rating'].apply(self._rating_to_sentiment)
        return prepared

    def train_model(self, X_train, y_train):
        """Build and fit the TF-IDF + logistic-regression pipeline.

        The vectoriser uses unigrams and bigrams, at most 5000 features,
        ``min_df=2`` and ``max_df=0.95``; the classifier uses balanced class
        weights and a fixed ``random_state`` of 42.

        Args:
            X_train: Iterable of preprocessed text strings.
            y_train: Matching iterable of sentiment labels.
        """
        self.model = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, min_df=2, max_df=0.95, ngram_range=(1, 2))),
            ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
        ])
        self.model.fit(X_train, y_train)

    def _require_model(self):
        """Return the fitted pipeline or raise a descriptive error if untrained."""
        if self.model is None:
            # NotFittedError derives from both ValueError and AttributeError, so
            # code that caught the former "'NoneType' has no attribute 'predict'"
            # AttributeError keeps working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "SentimentAnalyzer has no trained model; call train_model() first."
            )
        return self.model

    def evaluate_model(self, X_test, y_test):
        """Print a classification report for the held-out data.

        Args:
            X_test: Iterable of preprocessed text strings.
            y_test: Matching iterable of true sentiment labels.

        Raises:
            sklearn.exceptions.NotFittedError: If ``train_model`` has not run.
        """
        y_pred = self._require_model().predict(X_test)
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, zero_division=0))

    def analyze_new_text(self, texts):
        """Predict the sentiment of raw (unprocessed) review texts.

        Args:
            texts: A single string or an iterable of strings. A lone string is
                treated as one review rather than iterated character by
                character.

        Returns:
            DataFrame with the original ``text`` and the predicted
            ``sentiment``, one row per input; empty input gives an empty frame
            with the same two columns.

        Raises:
            sklearn.exceptions.NotFittedError: If ``train_model`` has not run.
        """
        model = self._require_model()
        # Materialise once so generators survive being read twice below.
        texts = [texts] if isinstance(texts, str) else list(texts)
        if not texts:
            return pd.DataFrame({'text': [], 'sentiment': []})
        processed_texts = [self.preprocess_text(text) for text in texts]
        predictions = model.predict(processed_texts)
        return pd.DataFrame({'text': texts, 'sentiment': predictions})

In [4]:
def main():
    """Run the end-to-end demo: build data, train, evaluate, classify new reviews.

    The sample frame from ``create_sample_data`` repeats the same ten reviews,
    so the 20% test split can contain texts that also appear in the training
    split; the printed scores should be read with that in mind.
    """
    sentiment_model = SentimentAnalyzer()
    labelled = sentiment_model.prepare_data(sentiment_model.create_sample_data())

    # Keep only labels with at least two rows: a label with a single row cannot
    # be represented in both the train and the test partition.
    rows_per_label = labelled['sentiment'].value_counts()
    usable_labels = rows_per_label.loc[rows_per_label >= 2].index
    labelled = labelled.loc[labelled['sentiment'].isin(usable_labels)]

    # Stratify only when more than one label is left.
    if len(usable_labels) > 1:
        strata = labelled['sentiment']
    else:
        strata = None

    X_train, X_test, y_train, y_test = train_test_split(
        labelled['processed_text'],
        labelled['sentiment'],
        test_size=0.2,
        random_state=42,
        stratify=strata,
    )

    sentiment_model.train_model(X_train, y_train)
    sentiment_model.evaluate_model(X_test, y_test)

    # Unseen reviews, passed raw; analyze_new_text does its own preprocessing.
    unseen_reviews = [
        "This product is amazing! I absolutely love it!",
        "Terrible experience, would not recommend to anyone.",
        "It's okay, nothing special but gets the job done."
    ]
    verdicts = sentiment_model.analyze_new_text(unseen_reviews)
    print("\nSentiment Analysis Results:")
    print(verdicts.to_string(index=False))


In [5]:
# Entry point: run the demo when this cell/script is executed directly. The
# captured output below shows the guard is satisfied inside the notebook kernel.
if __name__ == "__main__":
    main()


Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         4
     neutral       1.00      1.00      1.00         1
    positive       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10


Sentiment Analysis Results:
                                               text sentiment
     This product is amazing! I absolutely love it!  positive
Terrible experience, would not recommend to anyone.  positive
  It's okay, nothing special but gets the job done.   neutral
