In [5]:
# sentiment_analysis.py

# Import libraries
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import streamlit as st

# 1. Load Dataset
df = pd.read_csv("sentimentdataset.csv")  # Make sure your CSV has 'text' and 'emotion' columns

# 2. Preprocessing Function
def clean_text(Text):
    text = re.sub(r"http\\S+", "", Text)
    text = re.sub(r"@[A-Za-z0-9_]+", "", Text)
    text = re.sub(r"#[A-Za-z0-9_]+", "", Text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df["clean_text"] = df["Text"].apply(clean_text)

# 3. Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["clean_text"])
y = df["Sentiment"]

# 4. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train Logistic Regression Model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# 6. Evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 7. Streamlit Web App
st.title("🧠 Emotion Detector from Social Media Text")
user_input = st.text_area("Enter a tweet or comment:")

if st.button("Analyze Emotion"):
    cleaned = clean_text(user_input)
    vectorized = tfidf.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    st.success(f"Predicted Emotion: **{prediction}**")



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2025-05-12 21:53:36.421 
  command:

    streamlit run C:\Users\SHANGAR\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-12 21:53:36.427 Session state does not function when running a script without `streamlit run`


Classification Report:
                         precision    recall  f1-score   support

         Acceptance          0.00      0.00      0.00         2
      Acceptance             0.00      0.00      0.00         0
           Admiration        0.00      0.00      0.00         1
        Admiration           0.00      0.00      0.00         1
         Affection           0.00      0.00      0.00         1
      Ambivalence            0.00      0.00      0.00         1
         Anger               0.00      0.00      0.00         1
        Anticipation         0.00      0.00      0.00         1
        Arousal              0.00      0.00      0.00         3
                  Awe        0.00      0.00      0.00         1
         Awe                 0.00      0.00      0.00         1
                  Bad        0.00      0.00      0.00         1
             Betrayal        0.00      0.00      0.00         2
        Betrayal             0.00      0.00      0.00         1
         Bitter

In [6]:
import pandas as pd

# Load the dataset
df = pd.read_csv("sentimentdataset.csv")

# Get the size and structure
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")
print("\nColumn Names:", df.columns.tolist())

# Preview the first 5 rows
print("\nFirst 5 records:")
print(df.head())


Number of rows: 732
Number of columns: 15

Column Names: ['Unnamed: 0.1', 'Unnamed: 0', 'Text', 'Sentiment', 'Timestamp', 'User', 'Platform', 'Hashtags', 'Retweets', 'Likes', 'Country', 'Year', 'Month', 'Day', 'Hour']

First 5 records:
   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! 💪          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter   

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Load dataset
df = pd.read_csv("sentimentdataset.csv")

# --- Handle Missing Values ---
print("Missing values before:\n", df.isnull().sum())

# Drop rows with missing text or label
df.dropna(subset=['Text', 'Sentiment'], inplace=True)

# Fill missing numerical columns if any
df.fillna(0, inplace=True)

print("Missing values after:\n", df.isnull().sum())

# --- Remove Duplicates ---
initial_len = len(df)
df.drop_duplicates(inplace=True)
print(f"Removed {initial_len - len(df)} duplicate rows.")

# --- Outlier Handling (if numerical columns exist) ---
# Example: Remove outliers in a column named 'length'
if 'length' in df.columns:
    q1 = df['length'].quantile(0.25)
    q3 = df['length'].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    df = df[(df['length'] >= lower) & (df['length'] <= upper)]
    print("Outliers removed from 'length' column.")

# --- Label Encoding for Target ---
le = LabelEncoder()
df['emotion_encoded'] = le.fit_transform(df['Sentiment'])

# --- Feature Scaling (if numeric features exist) ---
scaler = StandardScaler()

# Example: scaling if numeric features exist
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

if numeric_cols:
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    print("Numerical features scaled.")

# Final structure check
print(df.head())


Missing values before:
 Unnamed: 0.1    0
Unnamed: 0      0
Text            0
Sentiment       0
Timestamp       0
User            0
Platform        0
Hashtags        0
Retweets        0
Likes           0
Country         0
Year            0
Month           0
Day             0
Hour            0
dtype: int64
Missing values after:
 Unnamed: 0.1    0
Unnamed: 0      0
Text            0
Sentiment       0
Timestamp       0
User            0
Platform        0
Hashtags        0
Retweets        0
Likes           0
Country         0
Year            0
Month           0
Day             0
Hour            0
dtype: int64
Removed 0 duplicate rows.
Numerical features scaled.
   Unnamed: 0.1  Unnamed: 0  \
0     -1.733763   -1.741727   
1     -1.729032   -1.737017   
2     -1.724301   -1.732306   
3     -1.719570   -1.727595   
4     -1.714839   -1.722884   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traff

In [11]:
# sentiment_eda_preprocessing.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset
df = pd.read_csv("sentimentdataset.csv")  # Make sure this file is in your working directory

# Rename columns for consistency
df.rename(columns={"Text": "text", "Sentiment": "emotion"}, inplace=True)

# --- Handle Missing Values ---
df.dropna(subset=["text", "emotion"], inplace=True)
df.fillna(0, inplace=True)

# --- Remove Duplicates ---
df.drop_duplicates(inplace=True)

# --- Add Text Length Feature ---
df["text_length"] = df["text"].apply(lambda x: len(str(x).split()))

# --- Handle Outliers (Optional for numeric features like text_length) ---
q1 = df["text_length"].quantile(0.25)
q3 = df["text_length"].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
df = df[(df["text_length"] >= lower) & (df["text_length"] <= upper)]

# --- Encode Labels ---
le = LabelEncoder()
df["emotion_encoded"] = le.fit_transform(df["emotion"])

# --- Scale Numerical Features ---
scaler = StandardScaler()
df[["text_length_scaled"]] = scaler.fit_transform(df[["text_length"]])

# --- EDA Visuals ---

# Emotion Distribution Histogram
plt.figure(figsize=(10, 5))
sns.countplot(x="emotion", data=df, order=df["emotion"].value_counts().index, palette="viridis")
plt.title("Distribution of Emotions")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Boxplot of Text Length by Emotion
plt.figure(figsize=(10, 6))
sns.boxplot(x="emotion", y="text_length", data=df, palette="Set2")
plt.title("Text Length by Emotion")
plt.xlabel("Emotion")
plt.ylabel("Word Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Correlation Heatmap
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
if len(numeric_cols) >= 2:
    plt.figure(figsize=(8, 6))
    sns.heatmap(df[numeric_cols].corr(), annot=True, cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.tight_layout()
    plt.show()
else:
    print("Not enough numeric columns for a heatmap.")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x="emotion", data=df, order=df["emotion"].value_counts().index, palette="viridis")
  plt.show()

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x="emotion", y="text_length", data=df, palette="Set2")
  plt.show()
  plt.show()


In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv("sentimentdataset.csv")
df.rename(columns={"Text": "text", "Sentiment": "emotion"}, inplace=True)

# Drop missing values
df.dropna(subset=["text", "emotion"], inplace=True)

# --- 1. New Feature Creation ---
df["text_length"] = df["text"].apply(lambda x: len(str(x).split()))
df["punctuation_count"] = df["text"].apply(lambda x: sum([1 for char in str(x) if char in "!?."]))
df["capital_word_count"] = df["text"].apply(lambda x: sum([1 for word in str(x).split() if word.isupper()]))

# --- 2. Label Encoding ---
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["emotion_encoded"] = le.fit_transform(df["emotion"])

# --- 3. TF-IDF Vectorization ---
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df["text"]).toarray()

# Combine with new features
X_extra = df[["text_length", "punctuation_count", "capital_word_count"]].values
X_combined = np.hstack((X_tfidf, X_extra))
y = df["emotion_encoded"]

# --- 4. Feature Scaling ---
scaler = StandardScaler()
X_combined[:, -3:] = scaler.fit_transform(X_combined[:, -3:])  # Scale extra features



# --- 6. Visualization of Impactful New Features ---
plt.figure(figsize=(8, 5))
sns.boxplot(x="emotion", y="text_length", data=df)
plt.title("Text Length vs Emotion")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


  plt.show()


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# 1. Load and clean dataset
df = pd.read_csv("sentimentdataset.csv")
df.rename(columns={"Text": "text", "Sentiment": "emotion"}, inplace=True)
df.dropna(subset=["text", "emotion"], inplace=True)

# 2. Filter for selective emotions
selected_emotions = ["joy", "sadness", "anger", "fear"]
df = df[df["emotion"].isin(selected_emotions)]

# 3. Encode labels
le = LabelEncoder()
df["emotion_encoded"] = le.fit_transform(df["emotion"])

# 4. Text vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["text"])
y = df["emotion_encoded"]

# 5. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Models to evaluate
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# 7. Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Classification Report for {name}:\n")
    print(classification_report(y_test, y_pred, target_names=le.classes_))


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# 1. Load dataset
df = pd.read_csv("sentimentdataset.csv")
df.rename(columns={"Text": "text", "Sentiment": "emotion"}, inplace=True)

# 2. Filter only desired emotions
selected_emotions = ["joy", "sadness", "anger", "fear"]
df = df[df["emotion"].isin(selected_emotions)]

# 3. Clean and validate text column
df["text"] = df["text"].astype(str).str.strip()
df = df[df["text"].str.len() > 0]  # Remove empty strings
df = df[~df["text"].isin(ENGLISH_STOP_WORDS)]  # Optional: remove entries that are only stopwords

# 4. Encode target labels
le = LabelEncoder()
df["emotion_encoded"] = le.fit_transform(df["emotion"])

# 5. Vectorize text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["text"])
y = df["emotion_encoded"]

# 6. Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# 8. Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\nClassification Report for {name}:\n")
    print(classification_report(y_test, y_pred, target_names=le.classes_))


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# 1. Load dataset
df = pd.read_csv("sentimentdataset.csv")
df.rename(columns={"Text": "text", "Sentiment": "emotion"}, inplace=True)

# 2. Filter only desired emotions
selected_emotions = ["joy", "sadness", "anger", "fear"]
df = df[df["emotion"].isin(selected_emotions)]

# 3. Clean and validate text column
df["text"] = df["text"].astype(str).str.strip()
df = df[df["text"].apply(lambda x: len(x.split()) > 1)]  # Remove rows with <= 1 word



# 6. Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Define models to test
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# 8. Train and evaluate each model
target_names = sorted(df["emotion"].unique())  # Get unique emotion names in consistent order

for name, model in models.items():
    print(f"\n=== Training: {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Classification Report for {name}:\n")
    print(classification_report(y_test, y_pred, target_names=target_names))



KeyError: 'emotion'

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("sentimentdataset.csv")
df.rename(columns={"Text": "text", "Sentiment": "emotion"}, inplace=True)

# Filter for selective emotions
selected_emotions = ["joy", "sadness", "anger", "fear"]
df = df[df["emotion"].isin(selected_emotions)]

# Clean and validate text column
df["text"] = df["text"].astype(str).str.strip()
df = df[df["text"].apply(lambda x: len(x.split()) > 1)]  # Keep only texts with more than one word

# Encode emotion labels
le = LabelEncoder()
df["emotion_encoded"] = le.fit_transform(df["emotion"])

# Vectorize text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words=None)
X = tfidf.fit_transform(df["text"])
y = df["emotion_encoded"]

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Get class names
target_names = le.classes_

# Train and evaluate models
for name, model in models.items():
    print(f"\n=== Training: {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\nClassification Report for {name}:\n")
    print(classification_report(y_test, y_pred, target_names=target_names))


KeyError: 'emotion'

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    mean_squared_error
)
from sklearn.preprocessing import label_binarize

# Assume df, X_train, X_test, y_train, y_test, models are already defined
# And label encoder was already applied -> le

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    prec = precision_score(y_test, y_pred, average="weighted")
    rec = recall_score(y_test, y_pred, average="weighted")
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1,
        "RMSE": rmse
    })

    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

    # ROC Curve (for models that support predict_proba)
    if y_proba is not None and len(le.classes_) == 2:
        fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1])
        auc = roc_auc_score(y_test, y_proba[:, 1])
        plt.figure()
        plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")
        plt.plot([0, 1], [0, 1], linestyle='--')
        plt.title("ROC Curve")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.legend()
        plt.tight_layout()
        plt.show()

# Model Comparison Table
results_df = pd.DataFrame(results)
print("\n=== Model Comparison Table ===")
print(results_df.sort_values(by="F1-Score", ascending=False))



=== Logistic Regression ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AttributeError: 'LabelEncoder' object has no attribute 'classes_'

In [36]:
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download NLTK data
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv("sentimentdataset.csv")

# Preview the data
print(df.head())

# Assuming the columns are 'text' and 'sentiment'. Adjust if needed.
text_column = 'Text'       # change to the correct column name
label_column = 'Sentiment' # change to the correct column name

# Clean text data
def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@w+|\#','', text)  # Remove mentions and hashtags
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove punctuation/numbers
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

df[text_column] = df[text_column].astype(str).apply(preprocess)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df[text_column], df[label_column], test_size=0.2, random_state=42
)

# Vectorization
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model training
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Prediction and evaluation
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHANGAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! 💪          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   ChefCook        Instagram    

                                     Hashtags  Retweets  Likes     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
import pandas as pd
import string
import re
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    mean_squared_error,
    classification_report,
    roc_curve,
    auc,
    roc_auc_score
)

# Download stopwords
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("sentimentdataset.csv")

# Adjust column names as needed
text_column = 'Text'       # Replace with actual column name
label_column = 'Sentiment' # Replace with actual column name

# Preprocess text
def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

df[text_column] = df[text_column].astype(str).apply(preprocess)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    df[text_column], df[label_column], test_size=0.2, random_state=42
)

# Vectorize text
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Predict
y_pred = model.predict(X_test_vec)
y_proba = model.predict_proba(X_test_vec)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# ROC Curve (only for binary classification)
if len(model.classes_) == 2:
    fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1], pos_label=model.classes_[1])
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()
else:
    print("ROC Curve is not supported for multi-class without binarization.")

# Error Analysis
errors = pd.DataFrame({
    'Text': X_test,
    'Actual': y_test,
    'Predicted': y_pred
})
errors = errors[errors['Actual'] != errors['Predicted']]
print("Sample Misclassifications:\n", errors.head())

# Optional: Model comparison placeholder
# You can extend this if you train more models
results = pd.DataFrame({
    'Model': ['Naive Bayes'],
    'Accuracy': [accuracy],
    'F1 Score': [f1],
    'RMSE': [rmse]
})
print("\nModel Comparison:\n", results)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHANGAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: could not convert string to float: ' Curiosity '

In [40]:
from sklearn.metrics import f1_score, roc_auc_score, mean_squared_error, roc_curve, auc
# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # 'weighted' handles class imbalance
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")




ValueError: could not convert string to float: ' Curiosity '

In [41]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score, confusion_matrix, roc_auc_score)
from sklearn.preprocessing import label_binarize
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # 'weighted' handles imbalance
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=model.classes_, yticklabels=model.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
# ROC Curve (if binary or one-vs-rest)
if len(model.classes_) == 2:
    y_test_bin = label_binarize(y_test, classes=model.classes_)
    y_pred_proba = model.predict_proba(X_test_vec)[:, 1]
    auc = roc_auc_score(y_test_bin, y_pred_proba)
    print(f"ROC AUC Score: {auc:.4f}")
else:
    print("ROC Curve skipped: multi-class ROC not visualized here.")


Accuracy: 0.1088
F1 Score (Weighted): 0.0312

Classification Report:
                         precision    recall  f1-score   support

         Acceptance          0.00      0.00      0.00         2
           Admiration        0.00      0.00      0.00         1
        Admiration           0.00      0.00      0.00         1
         Affection           0.00      0.00      0.00         1
      Ambivalence            0.00      0.00      0.00         1
         Anger               0.00      0.00      0.00         1
        Anticipation         0.00      0.00      0.00         1
        Arousal              0.00      0.00      0.00         3
                  Awe        0.00      0.00      0.00         1
         Awe                 0.00      0.00      0.00         1
                  Bad        0.00      0.00      0.00         1
             Betrayal        0.00      0.00      0.00         2
        Betrayal             0.00      0.00      0.00         1
         Bitter              0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ROC Curve skipped: multi-class ROC not visualized here.


  plt.show()


In [42]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import numpy as np

# Binarize the labels
classes = model.classes_  # or: np.unique(y)
y_test_bin = label_binarize(y_test, classes=classes)
n_classes = y_test_bin.shape[1]

# Predict probabilities for each class
y_score = model.predict_proba(X_test_vec)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves
plt.figure(figsize=(10, 7))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f"Class {classes[i]} (AUC = {roc_auc[i]:.2f})")

plt.plot([0, 1], [0, 1], "k--", label="Random Guess")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Multi-Class ROC Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()


  plt.show()


In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# 1. Load dataset
df = pd.read_csv("sentimentdataset.csv")
df.rename(columns={"Text": "text", "Sentiment": "emotion"}, inplace=True)

# 2. Select specific emotions
selected_emotions = ["joy", "sadness", "anger", "fear"]
df = df[df["emotion"].isin(selected_emotions)]

# 3. Clean text column
df["text"] = df["text"].astype(str).str.strip()
df = df[df["text"].apply(lambda x: len(x.split()) > 1)]

# 4. Encode emotion labels
le = LabelEncoder()
df["emotion_encoded"] = le.fit_transform(df["emotion"])

# 5. TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["text"])
y = df["emotion_encoded"]

# 6. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Define models with reasons
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),  # 🔹 Baseline model: fast, interpretable
    "Naive Bayes": MultinomialNB(),                           # 🔹 Probabilistic, good for text frequency
    "Random Forest": RandomForestClassifier(n_estimators=100),# 🔹 Handles non-linear patterns well
    "Gradient Boosting": GradientBoostingClassifier(),        # 🔹 Advanced ensemble, often very accurate
    "Support Vector Machine": SVC(kernel='linear')            # 🔹 Great for high-dimensional TF-IDF text
}

# 8. Train and evaluate each model
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))


KeyError: 'emotion'