# Naive Bayes classifier

![Alt texts](1.png)

## Task 0. 
Execute the notebook.

### 1. Prepare models

### 1.1. Reuse the preprocessing function from lab 3

---

In [3]:
# Imports
import nltk
from nltk.corpus import twitter_samples
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
import re
import string
from nltk.corpus import wordnet

In [4]:
stop_words = stopwords.words('english')
# Frozen snapshot of the stop-word list for constant-time membership tests.
# The list itself is kept under its original name for any other cell that reads it.
_stop_word_set = frozenset(stop_words)
lemmatizer = WordNetLemmatizer()

# POS-tag prefix (as produced by pos_tag) -> WordNet POS constant for the lemmatizer.
_TAG_PREFIX_TO_WORDNET_POS = (
    ('NN', wordnet.NOUN),
    ('VB', wordnet.VERB),
    ('JJ', wordnet.ADJ),
    ('RB', wordnet.ADV),
)


def _get_wordnet_pos(tag):
    """Return the WordNet POS constant matching ``tag``; unknown tags default to noun."""
    for prefix, wordnet_pos in _TAG_PREFIX_TO_WORDNET_POS:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun


def process_tokens(tweet_tokens):
    """Clean and lemmatize one tokenized tweet.

    Args:
        tweet_tokens: list of token strings for a single tweet.

    Returns:
        A list of lower-cased, lemmatized tokens with URLs, @mentions,
        English stop words and punctuation tokens removed.
    """
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token_lower = token.lower()
        # Delete URLs and mentions
        if re.match(r'^https?://', token_lower) or token_lower.startswith('@'):
            continue
        # Delete stop words and punctuation.
        # NOTE: `in string.punctuation` is a substring test, so it drops single
        # punctuation characters (and any token that is a contiguous slice of
        # that string) while keeping emoticons such as ':)'.
        if token_lower in _stop_word_set or token_lower in string.punctuation:
            continue
        # Lemmatize the token using the POS derived from its tag
        cleaned_tokens.append(lemmatizer.lemmatize(token_lower, _get_wordnet_pos(tag)))

    return cleaned_tokens

In [5]:
# Load the pre-tokenized positive and negative tweets from the twitter_samples corpus.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Run every tweet through the cleaning/lemmatization pipeline.
positive_cleaned_tokens_list = list(map(process_tokens, positive_tweet_tokens))
negative_cleaned_tokens_list = list(map(process_tokens, negative_tweet_tokens))

---

In [6]:
# Preview the first cleaned tweet (index directly instead of copying the whole list first).
positive_cleaned_tokens_list[0]

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']

In [7]:
def get_token_dict(tokens):
    """Turn an iterable of tokens into a ``{token: True}`` feature dictionary.

    Repeated tokens collapse into a single key.
    """
    return {token: True for token in tokens}
    
def get_tweets_for_model(cleaned_tokens_list):
    """Build one feature dictionary per tweet from a list of cleaned token lists."""
    return list(map(get_token_dict, cleaned_tokens_list))

# Convert both cleaned corpora into per-tweet feature dictionaries (positive first, then negative).
positive_tokens_for_model, negative_tokens_for_model = (
    get_tweets_for_model(cleaned_tokens_list)
    for cleaned_tokens_list in (positive_cleaned_tokens_list, negative_cleaned_tokens_list)
)

In [8]:
import random

# Fixed seed so the shuffle, and therefore the train/test split and the
# reported accuracy, is the same on every run.
SEED = 42
# Number of shuffled examples used for training; the rest is held out for testing.
TRAIN_SIZE = 7000

# Attach the class label to every feature dictionary.
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# A dedicated generator keeps the shuffle deterministic without reseeding the global RNG.
random.Random(SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

### 2. Run the Naive Bayes classifier

In [9]:
from nltk import classify
from nltk import NaiveBayesClassifier
# Train Naive Bayes on the training split and score it on the held-out split.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
classifier.show_most_informative_features(10)

Accuracy is: 0.9973333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2067.6 : 1.0
                      :) = True           Positi : Negati =    979.9 : 1.0
                follower = True           Positi : Negati =     38.1 : 1.0
                     sad = True           Negati : Positi =     37.2 : 1.0
                    glad = True           Positi : Negati =     20.9 : 1.0
                     via = True           Positi : Negati =     16.6 : 1.0
                followed = True           Negati : Positi =     15.5 : 1.0
                    miss = True           Negati : Positi =     14.5 : 1.0
                  arrive = True           Positi : Negati =     13.8 : 1.0
               community = True           Positi : Negati =     13.6 : 1.0
None


In [10]:
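# One-time setup: uncomment to download the 'punkt_tab' tokenizer data
# (needed by word_tokenize in recent NLTK releases).
# nltk.download('punkt_tab')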

In [11]:
from nltk.tokenize import word_tokenize

custom_tweet = "the service was so bad"

# Tokenize and clean the text with the same pipeline as the training tweets,
# then classify the resulting feature dictionary.
custom_tokens = process_tokens(word_tokenize(custom_tweet))
custom_features = get_token_dict(custom_tokens)

print(classifier.classify(custom_features))

Negative


In [12]:
def get_sentiment(text):
    """Return the label the current ``classifier`` assigns to raw ``text``."""
    features = get_token_dict(process_tokens(word_tokenize(text)))
    return classifier.classify(features)

# Sample phrases for a quick sanity check of the classifier.
texts = [
    "bad",
    "service is bad",
    "service is really bad",
    "service is so terrible",
    "great service",
    "they stole my money",
]
for t in texts:
    label = get_sentiment(t)
    print(t, ": ", label)

bad :  Negative
service is bad :  Negative
service is really bad :  Negative
service is so terrible :  Negative
great service :  Positive
they stole my money :  Negative


## Task 1.
Re-train the classifier on a different set of data. For instance, use a dataset from HuggingFace or Kaggle.

In [13]:
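# One-time setup: uncomment to install the Hugging Face `datasets` package
# into the kernel's environment.
# %pip install datasets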

In [14]:
from datasets import load_dataset

# Load the dataset from the Hugging Face Hub
DATASET_ID = "gxb912/large-twitter-tweets-sentiment"
dataset = load_dataset(DATASET_ID)


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# NOTE: these two helpers are re-declared in this cell; they behave the same
# as the definitions of the same names earlier in the notebook.
def get_token_dict(tokens):
    """Map each token to ``True``; duplicate tokens yield a single key."""
    return dict.fromkeys(tokens, True)


def get_tweets_for_model(cleaned_tokens_list):
    """Convert a list of cleaned token lists into a list of feature dictionaries."""
    feature_dicts = []
    for tweet_tokens in cleaned_tokens_list:
        feature_dicts.append(get_token_dict(tweet_tokens))
    return feature_dicts


In [16]:
# Process the texts: tokenize and clean every training example, bucketed by label
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for example in dataset['train']:
    cleaned_tokens = process_tokens(word_tokenize(example['text']))
    # A sentiment value of 1 goes to the positive bucket; any other value goes to the negative one.
    target_list = (
        positive_cleaned_tokens_list
        if example['sentiment'] == 1
        else negative_cleaned_tokens_list
    )
    target_list.append(cleaned_tokens)

# Prepare the data for the model
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)


In [18]:
import random

# Fixed seed so the shuffle, and therefore the train/test split and the
# reported accuracies, is the same on every run.
SEED = 42
# Number of shuffled examples used for training; everything after it is the test set.
# NOTE(review): this leaves the large majority of the examples in the test set — confirm that is intended.
TRAIN_SIZE = 7000

# Attach the class label to every feature dictionary.
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

dataset_combined = positive_dataset + negative_dataset
# A dedicated generator keeps the shuffle deterministic without reseeding the global RNG.
random.Random(SEED).shuffle(dataset_combined)

train_data = dataset_combined[:TRAIN_SIZE]
test_data = dataset_combined[TRAIN_SIZE:]


In [19]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Re-train Naive Bayes on the new training split (this replaces the earlier `classifier`).
classifier = NaiveBayesClassifier.train(train_data)

nb_accuracy = classify.accuracy(classifier, test_data)
print("Accuracy is:", nb_accuracy)
classifier.show_most_informative_features(10)


Accuracy is: 0.7049221075753634
Most Informative Features
                     vip = True           Positi : Negati =     18.0 : 1.0
                 welcome = True           Positi : Negati =     13.4 : 1.0
                     sad = True           Negati : Positi =     12.3 : 1.0
                    hurt = True           Negati : Positi =     10.9 : 1.0
                   sadly = True           Negati : Positi =     10.9 : 1.0
                   upset = True           Negati : Positi =     10.9 : 1.0
               wonderful = True           Positi : Negati =     10.5 : 1.0
                     gah = True           Negati : Positi =     10.0 : 1.0
                    grrr = True           Negati : Positi =     10.0 : 1.0
                    poor = True           Negati : Positi =     10.0 : 1.0


In [20]:
def get_sentiment(text):
    """Tokenize and clean ``text``, then return the current ``classifier``'s label for it."""
    feature_dict = get_token_dict(process_tokens(word_tokenize(text)))
    return classifier.classify(feature_dict)

# Examples
texts = [
    "bad",
    "service is bad",
    "service is really bad",
    "service is so terrible",
    "great service",
    "they stole my money",
]
for t in texts:
    print(t, ":", get_sentiment(t))


bad : Negative
service is bad : Negative
service is really bad : Negative
service is so terrible : Negative
great service : Positive
they stole my money : Negative


## Task 2. 
Try using a Logistic Regression classifier instead and compare the results with Naive Bayes.

In [21]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Extract the features (X) and labels (y)
X_dict = [features for features, _ in dataset_combined]
y = [label for _, label in dataset_combined]

# Reuse the Naive Bayes split boundary (train_data is the leading slice of
# dataset_combined) so both models train and test on exactly the same rows.
n_train = len(train_data)

# Vectorize the features. The vocabulary is learned from the training rows only,
# so nothing from the test rows leaks into the fitted vectorizer; tokens that
# appear only in test rows are ignored by transform().
vectorizer = DictVectorizer(sparse=True)
vectorizer.fit(X_dict[:n_train])
X = vectorizer.transform(X_dict)

# Split into train/test
X_train = X[:n_train]
X_test = X[n_train:]
y_train = y[:n_train]
y_test = y[n_train:]


In [22]:
# Create and train the model (fit() returns the estimator itself)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", log_reg_accuracy)

report = classification_report(y_test, y_pred)
print(report)


Logistic Regression Accuracy: 0.7319922541113905
              precision    recall  f1-score   support

    Negative       0.72      0.60      0.65     72980
    Positive       0.74      0.83      0.78    100015

    accuracy                           0.73    172995
   macro avg       0.73      0.71      0.72    172995
weighted avg       0.73      0.73      0.73    172995



In [23]:
# Score Naive Bayes on its held-out split for a side-by-side comparison with Logistic Regression.
naive_bayes_accuracy = classify.accuracy(classifier, test_data)
print("Naive Bayes Accuracy:", naive_bayes_accuracy)


Naive Bayes Accuracy: 0.7049221075753634


In [24]:
def get_sentiment_logreg(text):
    """Return the Logistic Regression label for raw ``text``.

    The text goes through the same tokenize/clean/feature-dict steps as the
    training data and is encoded with the already-fitted ``vectorizer``.
    """
    features = get_token_dict(process_tokens(word_tokenize(text)))
    predictions = log_reg.predict(vectorizer.transform([features]))
    return predictions[0]

# Same sample phrases, now classified with Logistic Regression.
texts = [
    "bad",
    "service is bad",
    "service is really bad",
    "service is so terrible",
    "great service",
    "they stole my money",
]
for t in texts:
    print(t, ":", get_sentiment_logreg(t))


bad : Negative
service is bad : Negative
service is really bad : Negative
service is so terrible : Negative
great service : Positive
they stole my money : Negative
