1.1  Data Preparation 

In [2]:
# IMPORTING LIBRARIES 
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import pickle
import streamlit as st

# Fetch the NLTK resources this notebook relies on: the stopword lists and the
# WordNet corpus (WordNetLemmatizer is imported above). Packages that are
# already installed are reported as up-to-date rather than downloaded again.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tahsi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tahsi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

1.2 Load the Dataset

In [4]:
# Load the dataset
# The path is a raw string (r'...'): in a normal string literal the backslash
# sequences \P, \p, \S and \L are invalid escapes and trigger a SyntaxWarning.
# The resulting path is character-for-character the same as before.
data = pd.read_excel(r'F:\Projects\project_7_sentiment_analysis\Sentiment_analysis\LabeledText.xlsx')  # Replace with the path

# Display the first few rows
print(data.head())


  data = pd.read_excel('F:\Projects\project_7_sentiment_analysis\Sentiment_analysis\LabeledText.xlsx')  # Replace with the path


  File Name                                            Caption     LABEL
0     1.txt      How I feel today #legday #jelly #aching #gym   negative
1    10.txt  @ArrivaTW absolute disgrace two carriages from...  negative
2   100.txt  This is my Valentine's from 1 of my nephews. I...  positive
3  1000.txt  betterfeelingfilms: RT via Instagram: First da...   neutral
4  1001.txt         Zoe's first love #Rattled @JohnnyHarper15   positive


1.3 Text Preprocessing

In [6]:
# Build the stopword set and the lemmatizer once for the whole notebook.
# stopwords.words('english') returns a list; evaluating it inside the token
# filter re-created that list and scanned it linearly for every single token.
_STOPWORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one caption for TF-IDF vectorization.

    Steps: lowercase, replace every run of non-word characters with a single
    space, drop English stopwords, lemmatize the remaining tokens.

    Args:
        text: The raw caption. Missing values (None / NaN) yield an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Spreadsheet cells can be empty (NaN) or numeric; neither has .lower().
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Replace punctuation/symbol runs with a space. Note that \W keeps letters,
    # digits and underscores, so numbers are NOT removed by this step.
    text = re.sub(r'\W+', ' ', text)
    # Tokenize on whitespace and remove stopwords
    tokens = [word for word in text.split() if word not in _STOPWORDS]
    # Lemmatization
    return ' '.join(_LEMMATIZER.lemmatize(word) for word in tokens)

# Apply preprocessing to the 'Caption' column
data['cleaned_caption'] = data['Caption'].apply(preprocess_text)


1.4 Encode the Labels

In [7]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary first, then map each label to its integer code.
# With this dataset's labels the codes are: negative = 0, neutral = 1, positive = 2.
label_encoder = LabelEncoder().fit(data['LABEL'])
data['encoded_label'] = label_encoder.transform(data['LABEL'])


2: Split the Data

In [8]:
# Features are the cleaned captions; targets are the integer-encoded labels.
X, y = data['cleaned_caption'], data['encoded_label']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


3: Text Vectorization

In [9]:
# TF-IDF features, capped at 100,000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=100_000)

# Fit the vocabulary/IDF weights on the training captions only, then reuse the
# fitted vectorizer for the test captions so no test data leaks into fitting.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


4: Model Training

In [13]:
# Level-0 learners: five different classifier families, each given a display name.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
linear_svm = SVC(kernel='linear', probability=True, random_state=42)
logistic_regression = LogisticRegression(max_iter=500, random_state=42)
naive_bayes = MultinomialNB()
knn = KNeighborsClassifier(n_neighbors=5)

base_models = [
    ('Random Forest', random_forest),
    ('SVM', linear_svm),
    ('Logistic Regression', logistic_regression),
    ('Naive Bayes', naive_bayes),
    ('KNN', knn),
]

# Level-1 learner: gradient boosting combines the base models' predictions.
meta_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Stack them; the base-model predictions fed to the meta-model come from
# 5-fold stratified cross-validation.
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=StratifiedKFold(n_splits=5),
)

# Fit the whole ensemble on the vectorized training captions.
print("Training Stacked Model...")
stacked_model.fit(X_train_vectorized, y_train)

Training Stacked Model...


 5: Model Evaluation

In [14]:
# Predict sentiment for the held-out test captions.
y_pred = stacked_model.predict(X_test_vectorized)

# Overall accuracy first, then per-class precision / recall / F1 with the
# original label names instead of the integer codes.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {test_accuracy}')

report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report)


Accuracy: 0.7094455852156057
              precision    recall  f1-score   support

    negative       0.70      0.76      0.73       284
     neutral       0.67      0.62      0.64       367
    positive       0.75      0.77      0.76       323

    accuracy                           0.71       974
   macro avg       0.71      0.72      0.71       974
weighted avg       0.71      0.71      0.71       974



6: Save the Model and Vectorizer

In [15]:
# Persist the fitted stacking model and the fitted TF-IDF vectorizer as pickle
# files in the current working directory (the label encoder is not saved here).
artifacts = {
    'sentiment_model.pkl': stacked_model,
    'tfidf_vectorizer.pkl': vectorizer,
}

for filename, fitted_object in artifacts.items():
    with open(filename, 'wb') as handle:
        pickle.dump(fitted_object, handle)


7.1: Create the Streamlit App


Install Streamlit first: pip install --user streamlit

7.2 Create the app.py File

7.3 Run the Streamlit App
streamlit run app.py


In [12]:
# Environment check: show which Python interpreter this kernel is running on.
import sys
print(sys.executable)


c:\Python312\python.exe


In [22]:
import matplotlib.pyplot as plt

# Count the occurrences of each sentiment class
class_counts = data['LABEL'].value_counts()

# Draw on a dedicated figure/axes instead of relying on pyplot's implicit
# "current figure": if an earlier figure is still open (plt.show() does not
# close figures under a non-interactive backend), implicit plotting can end up
# layered on top of it.
fig, ax = plt.subplots()

# Plot the bar chart
class_counts.plot(kind='bar', color=['blue', 'green', 'orange'], ax=ax)
ax.set_title('Sentiment Class Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', labelrotation=0)  # keep the class names horizontal

# For Jupyter Notebook
# %matplotlib inline

# Save first, then show: the file is written from this exact figure object.
fig.savefig('sentiment_distribution.png')  # Saves the chart as a file
plt.show()  # Displays the chart


  plt.show()  # Displays the chart


In [24]:
import matplotlib.pyplot as plt

# Calculate sentiment class counts (the pie chart converts them to proportions)
class_counts = data['LABEL'].value_counts()

# Use a fresh figure/axes so the pie is never drawn onto a figure that a
# previous cell left open (pyplot's implicit current figure).
fig, ax = plt.subplots()

# Plot the pie chart
class_counts.plot(kind='pie', autopct='%1.1f%%', colors=['blue', 'green', 'orange'], ax=ax)
ax.set_title('Sentiment Class Proportions')
ax.set_ylabel('')  # Remove default y-axis label

# Save and display the chart
fig.savefig('sentiment_pie_chart.png')  # Save as PNG file
plt.show()  # Try displaying the plot


  plt.show()  # Try displaying the plot


In [33]:
from collections import Counter
from nltk.tokenize import word_tokenize

# Split every raw caption on whitespace (no NLTK tokenization is applied here).
# dropna()/astype(str) keep the join from failing on empty or non-text cells.
captions = data['Caption'].dropna().astype(str)
all_words = ' '.join(captions).split()  # Split words
word_counts = Counter(all_words).most_common(10)  # Get top 10 most common words

# Extract words and their counts; zip(*[]) cannot be unpacked, so fall back to
# empty sequences when there are no words at all.
words, counts = zip(*word_counts) if word_counts else ((), ())

# Plot bar chart on its own figure/axes rather than pyplot's implicit current
# figure, which may still hold a chart from an earlier cell.
fig, ax = plt.subplots()
ax.bar(words, counts, color='purple')
ax.set_title('Top 10 Most Common Words')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()  # keep the rotated tick labels inside the saved image

# Save and display the chart
fig.savefig('common_words_bar_chart.png')  # Save as PNG file
plt.show()


  plt.show()


In [35]:
import matplotlib.pyplot as plt

# Per-class scores, copied by hand from the classification report of the
# evaluation cell (order: negative, neutral, positive).
categories = ['Negative', 'Neutral', 'Positive']
precision = [0.70, 0.67, 0.75]
recall = [0.76, 0.62, 0.77]
f1_score = [0.73, 0.64, 0.76]

# Grouped bar chart: one group per class, three bars (one per metric) per group.
x = range(len(categories))
plt.figure(figsize=(8, 6))

# Each metric is shifted by one bar width so the bars sit side by side.
metric_series = [
    (0, precision, 'Precision', 'blue'),
    (0.25, recall, 'Recall', 'green'),
    (0.5, f1_score, 'F1-score', 'orange'),
]
for shift, scores, metric_label, bar_color in metric_series:
    positions = [p + shift for p in x]
    plt.bar(positions, scores, width=0.25, label=metric_label, color=bar_color)

# Axis labels, title, and class names centred under the middle bar of each group.
plt.xlabel('Sentiment Classes')
plt.ylabel('Scores')
plt.title('Performance Metrics by Sentiment Class')
group_centres = [p + 0.25 for p in x]
plt.xticks(group_centres, categories)
plt.legend()
plt.tight_layout()

# Write the PNG before showing the figure.
plt.savefig('Accuracy_chart.png')  # Save as PNG file
plt.show()


  plt.show()
