In [1]:
!pip install pandas scikit-learn nltk datasets xgboost



In [1]:
import re
import string
import pandas as pd
import nltk
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Fetch the NLTK resources needed for tokenizing, stop-word removal and lemmatizing.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the Twitter sentiment dataset.
dataset = load_dataset('carblacac/twitter-sentiment-analysis', trust_remote_code=True)

# Materialize the train and test splits as pandas DataFrames.
df_train, df_test = (pd.DataFrame(dataset[split]) for split in ('train', 'test'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
# WordNet lemmatizer instance reused for every token during preprocessing
lemmatizer = WordNetLemmatizer()
# English stop words, kept in a set for fast membership checks
stop_words = {*stopwords.words('english')}

In [15]:
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
# Fixed seed so the stochastic model gives the same scores on every run.
SEED = 42

# Candidate models, keyed by the display name used in the printed reports.
classifiers = {
    'Ridge Classifier': RidgeClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    # Random forests are built with random sampling; without a seed the
    # reported accuracy changes from run to run.
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=SEED),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Store results: filled in by the training loop, one entry per classifier name.
results = {}

In [4]:
# preprocessing
# Built once at definition time rather than on every call: the function is
# applied to each tweet individually, so per-call setup is repeated needlessly.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_MENTION_OR_HASH_RE = re.compile(r'\@\w+|\#')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize a raw tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and the ``#``
    character (the hashtag word itself is kept), delete every character in
    ``string.punctuation``, tokenize, drop stop words, lemmatize.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives the filtering.
    """
    text = text.lower()  # lowercase
    text = _URL_RE.sub('', text)  # del urls
    text = _MENTION_OR_HASH_RE.sub('', text)  # del mentions and hashtag signs
    # Punctuation is removed before the stop-word filter, so a contraction
    # such as "don't" reaches the filter as "dont".
    text = text.translate(_STRIP_PUNCTUATION)  # del punctuation
    tokens = word_tokenize(text)  # tokenize
    # del stopwords & lemmatize
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

In [7]:
# Add a cleaned-text column to both splits using the same preprocessing function.
for split_frame in (df_train, df_test):
    split_frame['clean_text'] = split_frame['text'].apply(preprocess_text)

In [16]:
# The test labels are the same for every model, so look them up once.
y_test = df_test['feeling']

for name, clf in classifiers.items():
    print(f"Training with {name}...")

    # Each classifier is wrapped in its own pipeline with a fresh TF-IDF step.
    model = Pipeline([
        ('vectorizer', TfidfVectorizer()),  # TF-IDF feature extraction
        ('classifier', clf)  # Classifier
    ])

    # train
    model.fit(df_train['clean_text'], df_train['feeling'])

    # predict on the test split
    y_pred = model.predict(df_test['clean_text'])

    accuracy = accuracy_score(y_test, y_pred)
    # Build the report a single time; it is both stored and printed below.
    report = classification_report(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'report': report
    }
    print(f"Accuracy with {name}: {accuracy}")
    print(report)
    print("\n")

Training with Ridge Classifier...
Accuracy with Ridge Classifier: 0.7559114810155166
              precision    recall  f1-score   support

           0       0.76      0.74      0.75     30969
           1       0.75      0.77      0.76     31029

    accuracy                           0.76     61998
   macro avg       0.76      0.76      0.76     61998
weighted avg       0.76      0.76      0.76     61998



Training with Logistic Regression...
Accuracy with Logistic Regression: 0.7671053904964676
              precision    recall  f1-score   support

           0       0.77      0.75      0.76     30969
           1       0.76      0.78      0.77     31029

    accuracy                           0.77     61998
   macro avg       0.77      0.77      0.77     61998
weighted avg       0.77      0.77      0.77     61998



Training with Naive Bayes...
Accuracy with Naive Bayes: 0.7515081131649408
              precision    recall  f1-score   support

           0       0.74      0.78   

In [17]:
# Summary: one line per model, accuracy shown to four decimal places.
print("\nFinal Accuracy Comparison:")
for model_name in results:
    model_accuracy = results[model_name]['accuracy']
    print(f"{model_name}: {model_accuracy:.4f}")


Final Accuracy Comparison:
Ridge Classifier: 0.7559
Logistic Regression: 0.7671
Naive Bayes: 0.7515
Random Forest: 0.7535
KNN: 0.5920
