In [1]:
import pandas as pd

# Load both datasets
train = pd.read_csv("twitter_training.csv", lineterminator='\n')
test = pd.read_csv("twitter_validation.csv", lineterminator='\n')

print("🔍 Train columns:", train.columns.tolist())
print("🔍 Test columns:", test.columns.tolist())

train.head()

🔍 Train columns: ['2401', 'Borderlands', 'Positive', 'im getting on borderlands and i will murder you all ,\r']
🔍 Test columns: ['3364', 'Facebook', 'Irrelevant', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣\r']


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,\r"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [2]:
# Step 2: Load and Clean Data
import pandas as pd
import re

# Load data
# The dataframes were already loaded in the previous cell, reusing them
train_df = train.copy()
test_df = test.copy()

# Rename columns for clarity
train_df.columns = ['tweet_id', 'entity', 'sentiment', 'tweet']
test_df.columns = ['tweet_id', 'entity', 'sentiment', 'tweet']


# Clean text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|@\w+|#\w+|[^a-z\s]", "", text)
    return text

train_df['cleaned_text'] = train_df['tweet'].apply(clean_text)
test_df['cleaned_text'] = test_df['tweet'].apply(clean_text)

display(train_df.head())
display(test_df.head())

Unnamed: 0,tweet_id,entity,sentiment,tweet,cleaned_text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder y...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...


Unnamed: 0,tweet_id,entity,sentiment,tweet,cleaned_text
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claim...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,why do i pay for word when it functions so po...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking is so full of closet hacking ...
3,4433,Google,Neutral,Now the President is slapping Americans in the...,now the president is slapping americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,hi ive had madeleine mccann in my cellar for ...


In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['sentiment'])
test_df['label'] = le.transform(test_df['sentiment'])


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['cleaned_text'])
X_test = vectorizer.transform(test_df['cleaned_text'])

y_train = train_df['label']
y_test = test_df['label']


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

models = {
    "Naive Bayes": MultinomialNB(),
    "SVM (Linear SVC)": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results[name] = acc * 100

    print(f"\n🔍 {name} Accuracy: {acc * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))



🔍 Naive Bayes Accuracy: 70.77%
Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.81      0.50      0.62       171
    Negative       0.64      0.85      0.73       266
     Neutral       0.82      0.57      0.67       285
    Positive       0.68      0.84      0.75       277

    accuracy                           0.71       999
   macro avg       0.74      0.69      0.69       999
weighted avg       0.73      0.71      0.70       999


🔍 SVM (Linear SVC) Accuracy: 82.88%
Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.78      0.74      0.76       171
    Negative       0.80      0.91      0.85       266
     Neutral       0.91      0.76      0.83       285
    Positive       0.83      0.88      0.85       277

    accuracy                           0.83       999
   macro avg       0.83      0.82      0.82       999
weighted avg       0.83      0.83      0.83       999


🔍 Random Fores

In [10]:
print("\n📈 Final Accuracy Comparison:")
for model_name, acc in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f"{model_name}: {acc:.2f}%")



📈 Final Accuracy Comparison:
Random Forest: 97.20%
SVM (Linear SVC): 82.88%
Naive Bayes: 70.77%
