<a href="https://colab.research.google.com/github/yakshteja2004/Oasis-Infobyte-/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import libraries
import io
import re

import nltk
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Download the NLTK stopwords corpus. The call is safe to repeat: when the
# corpus is already present, nltk just reports it as up-to-date.
nltk.download('stopwords')
# Imported after the download so the corpus is available when
# stopwords.words('english') is called later in the notebook.
from nltk.corpus import stopwords

print(" Libraries imported successfully.")


 Libraries imported successfully.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Upload your sentiment dataset (CSV)
uploaded = files.upload()

if uploaded:
    # Parse the bytes of the file that was just uploaded rather than a
    # hard-coded path: when a file with the same name already exists, Colab
    # saves the new upload under a different name (e.g. "Twitter_Data (1).csv"),
    # so reading '/content/Twitter_Data.csv' would silently load the older copy.
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    # Nothing was selected in the file picker: fall back to a copy that is
    # already present in the Colab workspace.
    df = pd.read_csv('/content/Twitter_Data.csv')

# Drop rows with a missing tweet or a missing label. The later cells need
# complete rows: astype(str) would turn a missing tweet into the literal
# string 'nan', and the classifier cannot be fitted on missing labels.
df = df.dropna(subset=['clean_text', 'category'])

print(" Dataset uploaded successfully.")
print(df.head())


Saving Twitter_Data.csv to Twitter_Data (1).csv
 Dataset uploaded successfully.
                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


In [31]:
# Explore dataset: column dtypes / non-null counts, label balance, and a peek at a few rows
df.info()

print("\n Sentiment Distribution:")
print(df['category'].value_counts())

print("\n Sample Rows:")
# Seed the sample so the displayed rows are the same on every run of the
# notebook (same seed value as the train/test split).
print(df.sample(5, random_state=42))

<class 'pandas.core.frame.DataFrame'>
Index: 162973 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162973 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB

 Sentiment Distribution:
category
 1.0    72250
 0.0    55213
-1.0    35510
Name: count, dtype: int64

 Sample Rows:
                                               clean_text  category
162378       thankful nehru letting modi work letting win       1.0
157200  sahi kaha tha tumhare liye wayanad wayanad tri...       1.0
146796  lead deliver evil named modi right hand amit shah      -1.0
82553   modis tenure expect mob lynchers hatemongers r...       0.0
75474   cant digest success modi know achievement modi...       1.0


In [18]:
# Clean and preprocess text

# Stored as a set: clean_text() checks every token of every tweet for
# membership, and a set lookup is constant-time whereas a list is scanned
# linearly on each check.
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalize a raw text string into space-separated lowercase words.

    Steps, in order: lowercase, strip URLs, strip @mentions and '#' symbols,
    drop every character that is not an ASCII letter or whitespace, then
    remove stopwords.

    Args:
        text: The string to clean.
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the notebook-level ``stop_words`` collection is used.
            Passing a custom collection allows e.g. keeping negations such
            as "not" that the default list removes.

    Returns:
        The cleaned text, with the surviving words joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove whole @mentions (handle included) and the '#' of hashtags, keeping
    # the tag word. The earlier pattern `\@w+` only matched a literal "@w...",
    # so user handles leaked into the cleaned text.
    text = re.sub(r'@\w+|#', '', text)
    # Keep ASCII letters and whitespace only (drops digits and punctuation).
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return " ".join(word for word in text.split() if word not in stopword_set)

# Coerce every entry to str so each row goes through the same text pipeline,
# then overwrite the column with the cleaned values.
cleaned_tweets = df['clean_text'].astype(str).apply(clean_text)
df['clean_text'] = cleaned_tweets

print("Text cleaned successfully.")
print(df[['clean_text']].head())

Text cleaned successfully.
                                          clean_text
0  modi promised minimum government maximum gover...
1             talk nonsense continue drama vote modi
2  say vote modi welcome bjp told rahul main camp...
3  asking supporters prefix chowkidar names modi ...
4  answer among powerful world leader today trump...


In [24]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# so both parts keep the same class proportions, and seeded for repeatability.
X, y = df['clean_text'], df['category']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(" Data split into training and testing sets.")


 Data split into training and testing sets.


In [26]:
# TF-IDF Vectorization
# The vocabulary is capped at 5000 features via max_features.
vectorizer = TfidfVectorizer(max_features=5000)
# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer so nothing is learned from held-out tweets.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF transformation completed.")


TF-IDF transformation completed.


In [27]:
# Fit a logistic-regression classifier on the TF-IDF training features.
# fit() returns the fitted estimator, so construction and training are
# chained into a single expression.
model = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train)

print(" Model trained successfully.")


 Model trained successfully.


In [28]:
# Score the model on the held-out test split
y_pred = model.predict(X_test_tfidf)

# Compute every metric first, then report them together
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(" Model Evaluation Results:")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", per_class_report)
print("\nConfusion Matrix:\n", confusion)


 Model Evaluation Results:
Accuracy: 0.8935112747353888

Classification Report:
               precision    recall  f1-score   support

        -1.0       0.88      0.78      0.83      7102
         0.0       0.86      0.97      0.92     11043
         1.0       0.93      0.89      0.91     14450

    accuracy                           0.89     32595
   macro avg       0.89      0.88      0.88     32595
weighted avg       0.90      0.89      0.89     32595


Confusion Matrix:
 [[ 5547   728   827]
 [  110 10759   174]
 [  650   982 12818]]


In [29]:
# Predict sentiment for custom input
def predict_sentiment(text):
    """Return the model's predicted label for a single raw text string.

    The text is passed through ``clean_text``, wrapped in a one-element list
    and transformed by the fitted ``vectorizer``; the first (only) entry of
    ``model.predict`` on that result is returned.
    """
    features = vectorizer.transform([clean_text(text)])
    return model.predict(features)[0]

# A few hand-written sentences to sanity-check the trained model
sample_texts = [
    "I love this movie!",
    "This was the worst experience ever.",
    "It’s okay, not too bad.",
]

for sample in sample_texts:
    print(f" Text: {sample}")
    predicted_label = predict_sentiment(sample)
    print(f" Sentiment: {predicted_label}\n")


 Text: I love this movie!
 Sentiment: 1.0

 Text: This was the worst experience ever.
 Sentiment: -1.0

 Text: It’s okay, not too bad.
 Sentiment: -1.0



In [30]:
# Save model and vectorizer
import pickle

# Serialize each fitted object to its own pickle file, model first
artifacts = (
    (model, 'sentiment_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
)
for fitted_object, pickle_path in artifacts:
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(fitted_object, out_file)

print("Model and vectorizer saved as 'sentiment_model.pkl' and 'tfidf_vectorizer.pkl'")


Model and vectorizer saved as 'sentiment_model.pkl' and 'tfidf_vectorizer.pkl'
