In [2]:
# Sentiment Analysis on Tweets
import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk

ModuleNotFoundError: No module named 'kagglehub'

In [None]:
# Download the dataset and report the local path returned by kagglehub
path = kagglehub.dataset_download("saurabhshahane/twitter-sentiment-dataset")
print("Path to dataset files:", path)
# Load and Inspect Data
df = pd.read_csv(path + "/Twitter_Data.csv")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())
# Check for missing values
print("Missing values:\n", df.isnull().sum())

In [None]:
# EDA - Sentiment Distribution
# Keep only the rows that carry a sentiment label, working on a copy so the
# raw frame stays untouched, and store the label as an integer.
df_cleaned = df[df['category'].notna()].copy()
df_cleaned['category'] = df_cleaned['category'].astype(int)
sns.countplot(data=df_cleaned, x='category').set_title("Sentiment Distribution")
plt.show()
# Text preprocessing resources: the NLTK English stopword list (downloaded
# on demand) and a Porter stemmer, both used by clean_text().
nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every clean_text() call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def clean_text(text):
    """Normalise a tweet into a space-joined string of stemmed tokens.

    Steps, in order: strip URLs, @mentions and '#' signs (the hashtag word
    itself is kept), drop digits, replace ASCII punctuation with spaces,
    lowercase, split on whitespace, remove English stopwords and Porter-stem
    the remaining words.

    Args:
        text: The raw tweet. Non-string values are converted with str();
            None and float NaN are treated as missing.

    Returns:
        The cleaned text, or "" when the input is missing or no token
        survives filtering.
    """
    # Missing values would otherwise be stringified to "None"/"nan" and leak
    # into the vocabulary as tokens. NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    # Remove URLs (`http\S+` already covers https links), mentions, '#', digits
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'\d+', '', text)
    # Replace punctuation with a space rather than deleting it: deletion fuses
    # neighbouring words ("good,bad" -> "goodbad") and turns contractions into
    # tokens the stopword list does not contain ("don't" -> "dont"). Splitting
    # yields fragments such as "don" and "t", which the NLTK English list
    # includes.
    text = text.translate(_PUNCT_TO_SPACE)
    # Lowercase
    text = text.lower()
    # Tokenization + stopword removal + stemming
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]

    return " ".join(tokens)
# Apply cleaning
# Read the text from the untouched `df` (df_cleaned shares its index) so that
# re-running this cell never re-cleans already-stemmed text. The missing-value
# check must happen before any string conversion: astype(str) would turn NaN
# into the literal "nan", making the notna() guard unreachable and feeding
# "nan" to the model as a token.
df_cleaned['clean_text'] = df.loc[df_cleaned.index, 'clean_text'].apply(
    lambda x: clean_text(x) if pd.notna(x) else ""
)
print(df_cleaned[['clean_text','category']].head())

In [None]:
# Train on the preprocessed frame (`df_cleaned`), not the raw `df`: building
# the training set from `df` discards the clean_text() output, so the model
# would be fitted on text that never went through clean_text() while
# predict_sentiment() feeds it cleaned, stemmed text. `df_cleaned` also
# carries the integer-typed labels.
# Rows whose text is missing or empty after cleaning carry no signal; drop them.
has_text = df_cleaned['clean_text'].fillna("").astype(str).str.strip().ne("")
df_processed = df_cleaned[has_text & df_cleaned['category'].notna()].copy()
# Train-Test Split
X = df_processed['clean_text']
y = df_processed['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build Pipeline (TF-IDF + Naive Bayes)
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('nb', MultinomialNB())
])
# Train
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# Pin the label order so the heatmap ticks match the matrix rows/columns
# (rows are true labels, columns are predicted labels).
class_labels = model.named_steps['nb'].classes_
ax = sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    annot=True, fmt="d", cmap="Blues",
    xticklabels=class_labels, yticklabels=class_labels,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Sentiment Prediction Function
def predict_sentiment(text):
    """Return the predicted sentiment label for a single raw tweet.

    The tweet is normalised with clean_text() and handed to the fitted
    `model` pipeline as a one-element batch; the single prediction is
    returned.
    """
    (label,) = model.predict([clean_text(text)])
    return label
# Test on Sample Tweets
sample_tweet1 = "I love this product, it is amazing!"
sample_tweet2 = "This is the worst service ever, totally disappointed."
# Run each sample through the classifier and show the tweet next to its label
for tweet in (sample_tweet1, sample_tweet2):
    print("Tweet:", tweet, "=> Sentiment:", predict_sentiment(tweet))