<a href="https://colab.research.google.com/github/wyciszone/twitter_sentiment_analysis/blob/main/twitter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas scikit-learn nltk datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import re
import string
import pandas as pd
import nltk
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# NLTK resources needed by the tokenizer, the stop-word list and the lemmatizer
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Fetch the tweet sentiment dataset.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script
# to run locally; keep it only for sources you trust.
dataset = load_dataset(
    'carblacac/twitter-sentiment-analysis',
    trust_remote_code=True,
)

# Materialise the train and test splits as DataFrames
df_train, df_test = (pd.DataFrame(dataset[split]) for split in ('train', 'test'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Lemmatizer shared by every preprocessing call
lemmatizer = WordNetLemmatizer()
# English stop words, held in a set for fast membership checks
stop_words = {*stopwords.words('english')}

In [None]:
# preprocessing
# Built once instead of on every call: the patterns and the translation table never change.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
_MENTION_HASH_PATTERN = re.compile(r'\@\w+|\#')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise a raw tweet into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip URLs, strip @mentions and '#' characters (the
    hashtag word itself is kept), strip ASCII punctuation, tokenize, drop
    English stop words and lemmatize what remains.

    Args:
        text: The raw tweet. Non-string values (e.g. None or NaN coming from
            a missing cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        return ''  # a missing value would otherwise crash on .lower()

    text = text.lower()
    text = _URL_PATTERN.sub('', text)            # del urls
    text = _MENTION_HASH_PATTERN.sub('', text)   # del mentions and '#' signs
    text = text.translate(_PUNCTUATION_TABLE)    # del punctuation

    if not text.strip():
        return ''  # nothing left to tokenize; same result as tokenizing blank text

    tokens = word_tokenize(text)
    # del stopwords & lemmatize
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)

In [None]:
# Add a cleaned-text column to both splits
for frame in (df_train, df_test):
    frame['clean_text'] = frame['text'].apply(preprocess_text)

In [None]:
# TF-IDF features feeding a multinomial Naive Bayes classifier
pipeline_steps = [
    ('vectorizer', TfidfVectorizer()),  # text -> numerical TF-IDF features
    ('classifier', MultinomialNB()),    # Naive Bayes classifier
]
model = Pipeline(steps=pipeline_steps)


In [None]:
# Fit on the cleaned training tweets, then label the cleaned test tweets
train_texts, train_labels = df_train['clean_text'], df_train['feeling']
test_texts = df_test['clean_text']
y_pred = model.fit(train_texts, train_labels).predict(test_texts)

In [None]:
# Per-class metrics followed by overall accuracy on the test split
y_true = df_test['feeling']
report = classification_report(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
print(report)
print(f"Accuracy: {accuracy}")

              precision    recall  f1-score   support

           0       0.74      0.78      0.76     30969
           1       0.77      0.73      0.75     31029

    accuracy                           0.75     61998
   macro avg       0.75      0.75      0.75     61998
weighted avg       0.75      0.75      0.75     61998

Accuracy: 0.7515081131649408
