In [None]:
import pandas as pd


In [None]:
# The file is read with header=None, so pandas assigns integer column labels
# instead of consuming the first data row as a header.
raw_csv_path = "data/twitter_training.csv"
df = pd.read_csv(raw_csv_path, header=None)

# Preview the first rows
df.head()


In [None]:
# Name the four positional columns of the headerless CSV.
df.columns = ['tweet_id', 'entity', 'sentiment', 'sentence']

# Keep only the text and its label (this drops tweet_id and entity).
# The explicit .copy() makes the result an independent frame rather than a
# selection pandas may track as a copy of a slice, so later in-place edits and
# column assignments on `df` do not raise SettingWithCopyWarning.
df = df.loc[:, ['sentence', 'sentiment']].copy()

# Show first 5 rows
df.head()


In [None]:
# Count missing entries per column
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

# Count how many rows carry each sentiment label
class_counts = df['sentiment'].value_counts()
print("\nClass distribution:\n", class_counts)


In [None]:
# Build the cleaned frame as one non-mutating chain instead of editing `df`
# in place. Each step returns a new object, and reset_index hands back a
# fresh frame, so later column assignments on `df` are not flagged by pandas
# as writes to a copy of a slice (SettingWithCopyWarning).
df = (
    df
    # Remove rows with missing sentence text
    .dropna(subset=['sentence'])
    # Keep only Positive, Negative, and Neutral classes
    .loc[lambda frame: frame['sentiment'].isin(['Positive', 'Negative', 'Neutral'])]
    # Renumber rows 0..n-1 and discard the old index
    .reset_index(drop=True)
)

# Show updated class distribution
df['sentiment'].value_counts()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Bar chart of how many tweets fall into each sentiment class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=df,
    x='sentiment',
    hue='sentiment',   # colour each bar by its own class
    palette='pastel',
    legend=False,      # the x-axis already names the classes
    ax=ax,
)
ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Tweets")
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus, then materialize the English list as a set
# so that the membership checks in clean_text are cheap.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


In [None]:
import re
import string

# Patterns are compiled once here instead of being rebuilt on every call.
_URL_RE = re.compile(r"http\S+")
_MENTION_RE = re.compile(r"@\w+")
_HASHTAG_RE = re.compile(r"#\w+")
_PUNCT_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGIT_RE = re.compile(r"\d+")


def clean_text(text, stop_word_set=None):
    """Normalize a raw tweet into a lowercase, space-joined string of tokens.

    Steps, in order: lowercase; strip ``http...`` URLs, ``@mentions`` and
    ``#hashtags`` (the tag word is removed together with the ``#``); strip
    ASCII punctuation; strip digits; drop stopwords; collapse whitespace.

    Args:
        text: The raw text. ``None`` and float NaN are treated as empty text;
            any other non-string value is converted with ``str()`` first.
        stop_word_set: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text; an empty string if nothing survives the cleaning.
    """
    if stop_word_set is None:
        stop_word_set = stop_words

    if not isinstance(text, str):
        # `text != text` is only true for NaN; avoids calling .lower() on a float.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    text = _URL_RE.sub("", text)       # Remove URLs
    text = _MENTION_RE.sub("", text)   # Remove mentions
    text = _HASHTAG_RE.sub("", text)   # Remove hashtags
    text = _PUNCT_RE.sub("", text)     # Remove punctuation
    text = _DIGIT_RE.sub("", text)     # Remove digits

    # NOTE(review): punctuation is stripped before this filter, so any stopword
    # that contains an apostrophe (e.g. "don't", if the list has such entries)
    # can no longer match here — confirm this is intended.
    return " ".join(word for word in text.split() if word not in stop_word_set)


In [None]:
# Run every raw sentence through clean_text and store the result alongside it
raw_sentences = df['sentence']
df['cleaned'] = raw_sentences.map(clean_text)

# Compare original and cleaned text side by side
df.loc[:, ['sentence', 'cleaned']].head()


In [None]:
from sklearn.preprocessing import LabelEncoder


In [None]:
# Turn the string sentiment labels into integer class ids
sentiment_labels = df['sentiment']
le = LabelEncoder()
df['label'] = le.fit_transform(sentiment_labels)

# One row per distinct (sentiment, label) pair shows the mapping
df.loc[:, ['sentiment', 'label']].drop_duplicates()


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label is meant to
# keep class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
# NOTE(review): tweet_id is discarded earlier in the notebook, so duplicate or
# near-duplicate tweets (if the dataset contains any) could land on both sides
# of the split — confirm.
features, targets = df['cleaned'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

# Report how many rows ended up in each partition
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorizer limited to a vocabulary of at most MAX_TFIDF_FEATURES terms
MAX_TFIDF_FEATURES = 5000
tfidf = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# The vocabulary and IDF weights come from the training texts only;
# the test texts are projected with the already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Report (n_documents, n_features) for each split
train_shape, test_shape = X_train_tfidf.shape, X_test_tfidf.shape
print("TF-IDF shape (train):", train_shape)
print("TF-IDF shape (test):", test_shape)


In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Configure a 100-tree forest with class-balanced weights and a fixed seed,
# then fit it on the TF-IDF training matrix.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_model.fit(X_train_tfidf, y_train)

In [None]:
# Predict an encoded sentiment label for every row of the held-out TF-IDF matrix
rf_pred = rf_model.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Fraction of held-out tweets whose predicted label matches the true label
rf_accuracy = accuracy_score(y_test, rf_pred)
print("Accuracy:", rf_accuracy)


In [None]:
# Encoded class ids 0..k-1, in the same order as le.classes_, so matrix
# rows/columns line up with the display labels.
class_ids = list(range(len(le.classes_)))
cm = confusion_matrix(y_test, rf_pred, labels=class_ids)

# Render integer counts on a blue colour scale, labelled with the class names
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap="Blues", values_format="d")
plt.show()

In [None]:
print("Classification Report:\n")

# Pass the encoded class ids explicitly, as the confusion-matrix cell does, so
# each entry of target_names is tied to its own id even if some class happens
# to be absent from y_test / rf_pred.
report = classification_report(
    y_test,
    rf_pred,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
)
print(report)

In [None]:
import pickle
import os


In [None]:
# Ensure the models folder exists before anything is written into it;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs("models", exist_ok=True)

In [None]:
# Persist the fitted TF-IDF vectorizer so the same vocabulary and weights
# can be reused outside this notebook.
with open("models/tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [None]:
# Persist the trained Random Forest classifier.
# NOTE(review): the LabelEncoder `le` is not saved in the visible cells; a
# consumer of this model would need the id -> sentiment mapping — confirm.
with open("models/sentiment_rf_model.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)
