In [None]:
import nltk

# Fetch only the NLTK resources this script needs. Calling nltk.download()
# with no arguments opens the interactive downloader, which blocks a
# non-interactive run waiting for user input.
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize (which of
#     the two is required depends on the installed NLTK release; an unknown
#     id is reported by the downloader and skipped)
#   - 'stopwords': the English stop-word list
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
import numpy as np
import re
import csv
import html
import math

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import LancasterStemmer
from string import punctuation
from nltk.corpus import stopwords
from nltk import FreqDist

class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier over bag-of-words vectors.

    Attributes set by ``fit``:
        priors: array of shape (nb_classes,), the fraction of training
            examples belonging to each class.
        like: array of shape (nb_classes, nb_words), the smoothed likelihood
            of each word given each class.
    """

    # Small constant added before taking logarithms so that a zero prior or a
    # zero likelihood (possible when pseudocount == 0) yields a large negative
    # score instead of -inf. This is NOT the Laplace smoothing; that is done
    # in ``fit`` through ``pseudocount``.
    _LOG_EPS = 1e-10

    def __init__(self, nb_classes, nb_words, pseudocount):
        """Store the model dimensions and the smoothing pseudocount.

        Args:
            nb_classes: number of target classes (labels are 0..nb_classes-1).
            nb_words: size of the vocabulary (length of a feature vector).
            pseudocount: value added to every per-class word count.
        """
        self.nb_classes = nb_classes
        self.nb_words = nb_words
        self.pseudocount = pseudocount

    def fit(self, X, Y):
        """Estimate class priors and per-class word likelihoods.

        Prints the priors, the raw per-class word counts and the likelihoods.

        Args:
            X: 2-D array, one row per example; only the first ``nb_words``
                columns are used.
            Y: 1-D array of non-negative integer class labels, one per row
                of ``X``.

        Raises:
            ValueError: if ``X`` and ``Y`` have a different number of examples.
        """
        X = np.asarray(X)[:, :self.nb_words]
        Y = np.asarray(Y)
        nb_examples = X.shape[0]
        if Y.shape[0] != nb_examples:
            raise ValueError(
                f'X has {nb_examples} examples but Y has {Y.shape[0]} labels'
            )

        # minlength guarantees one prior per class even when the highest
        # class label never occurs in Y; without it predict() would index
        # past the end of the priors array.
        self.priors = np.bincount(Y, minlength=self.nb_classes) / nb_examples

        print('\nPriors:')
        print(self.priors)

        # occs[c][w] = total count of word w over all examples of class c.
        # Accumulate in float64 regardless of the dtype of X.
        occs = np.zeros((self.nb_classes, self.nb_words))
        for c in range(self.nb_classes):
            occs[c] = X[Y == c].sum(axis=0, dtype=np.float64)

        print('\nOccurences:')
        print(occs)

        # like[c][w] = (occs[c][w] + pseudocount)
        #              / (sum_w occs[c][w] + nb_words * pseudocount)
        # Rows whose denominator is zero keep a likelihood of 0.
        numerators = occs + self.pseudocount
        denominators = (
            occs.sum(axis=1, keepdims=True) + self.nb_words * self.pseudocount
        )
        self.like = np.divide(
            numerators,
            denominators,
            out=np.zeros_like(numerators),
            where=denominators != 0,
        )

        print('\nLikelihoods:')
        print(self.like)
        print()

    def predict(self, bow):
        """Return the most probable class for one bag-of-words vector.

        Scores are computed in log space,
        ``log(prior_c) + sum_w bow[w] * log(like[c][w])``, which avoids the
        numeric underflow of multiplying many small probabilities.

        Args:
            bow: 1-D array of word counts; only the first ``nb_words``
                entries are used.

        Returns:
            Index of the class with the highest log score.
        """
        bow = np.asarray(bow)[:self.nb_words]
        log_priors = np.log(self.priors[:self.nb_classes] + self._LOG_EPS)
        log_likelihoods = np.log(self.like + self._LOG_EPS)

        probs = log_priors + log_likelihoods @ bow
        return np.argmax(probs)

    def predict_multiply(self, bow):
        """Return the most probable class using plain probability products.

        Computes ``prior_c * prod_w like[c][w] ** bow[w]`` for every class
        and prints the resulting scores. Multiplying many probabilities can
        underflow to 0 for long vectors; ``predict`` is the log-space variant.

        Args:
            bow: 1-D array of word counts; only the first ``nb_words``
                entries are used.

        Returns:
            Index of the class with the highest score.
        """
        bow = np.asarray(bow)[:self.nb_words]
        probs = self.priors[:self.nb_classes] * np.prod(self.like ** bow, axis=1)

        print(probs)
        return np.argmax(probs)


# Patterns used by clean_tweet.
# http/https URL up to the next whitespace
regex_url = r'https?://\S+'
# sentence/clause punctuation that is replaced by a space
regex_dots = r'[.:;]'
# A currency symbol together with the token attached to it ($100, $AAPL, ...).
# No leading \b here: a word boundary cannot match between whitespace (or the
# start of the string) and a currency symbol, so with \b the pattern would
# only fire when the symbol is glued to a preceding word character.
regex_currencies = r'[$€¥£]\S+'
# anything that is not an ASCII letter, digit or whitespace
regex_spec_chars = r'[^a-zA-Z0-9\s]'
# d/d/d dates or standalone numbers; the longer date form is listed first so
# that the plain-number alternative cannot shadow it
regex_numeric = r'\b\d+/\d+/\d+\b|\b\d+\b'
# any word that contains at least one digit
regex_containing_digits = r'\b\w*\d+\w*\b'


np.set_printoptions(precision = 2, linewidth = 200)

stopwords_punc = set(stopwords.words('english')).union(set(punctuation))
lancaster = LancasterStemmer()

def clean_tweet(tweet):
    """Strip URLs, prices, special characters and numeric tokens from a tweet.

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned text; removed spans may leave extra whitespace behind.
    """
    # decode html entities such as &amp; into the characters they stand for
    tweet = html.unescape(tweet)
    # removes urls
    tweet = re.sub(regex_url, '', tweet)
    # removes stock market info, prices, etc.; this has to run before the
    # special-character pass below, which would otherwise delete the currency
    # symbols this pattern looks for
    tweet = re.sub(regex_currencies, '', tweet)
    # replaces . : or ; with a space
    tweet = re.sub(regex_dots, ' ', tweet)
    # removes special characters and keeps the words attached to them
    tweet = re.sub(regex_spec_chars, '', tweet)
    # removes numbers, dates
    tweet = re.sub(regex_numeric, '', tweet)
    # removes entire words that contain a digit
    tweet = re.sub(regex_containing_digits, '', tweet)

    return tweet

def tokenize_word(word):
    """Reduce a single word to its stem using the shared Lancaster stemmer."""
    stem = lancaster.stem(word)
    return stem

def tokenize_tweet(tweet):
    """Turn a raw tweet into a list of stemmed, lower-cased tokens.

    The tweet is cleaned with ``clean_tweet``, split with ``word_tokenize``,
    lower-cased, filtered against ``stopwords_punc`` and each surviving word
    is stemmed with ``tokenize_word``.
    """
    stems = []
    for raw_word in word_tokenize(clean_tweet(tweet)):
        lowered = raw_word.lower()
        # drop stopwords and punctuation tokens
        if lowered in stopwords_punc:
            continue
        stems.append(tokenize_word(lowered))

    return stems


tadija_path = '/content/drive/MyDrive/ML2024_D1/disaster-tweets.csv'
mina_path = '/content/drive/MyDrive/6003 ML/data/disaster-tweets.csv'

# tokenized tweets, one list of stems per CSV row
dataset = []
tweet_column_idx, target_column_idx = 3, 4
# upper bound on the number of rows read from the CSV
sample_nb = 7614

labels = []

# NOTE(review): utf-8 is assumed for the CSV - confirm against the data file.
with open(tadija_path, 'r', newline='', encoding='utf-8') as csvfile:
    csv_reader = csv.reader(csvfile)

    # skip the header at 1st line (tolerates an empty file)
    next(csv_reader, None)

    for idx, row in enumerate(csv_reader):

        if idx >= sample_nb:
            break

        dataset.append(tokenize_tweet(row[tweet_column_idx]))
        labels.append(int(row[target_column_idx]))

# The file may hold fewer rows than the requested limit. Size everything from
# what was actually read so that no all-zero phantom samples (with label 0)
# end up in the training data.
sample_nb = len(dataset)
Y_targets = np.array(labels, dtype=np.int64)



# Per-token corpus statistics:
#   token_count[token] -> total number of occurrences over all tweets
#   lr_map[token]      -> [occurrences in target-0 tweets, occurrences in target-1 tweets]
# The printed reports call target 0 "positive" and target 1 "negative".
token_count = dict()
lr_map = dict()

for sample, target in zip(dataset, Y_targets):

    for token in sample:

        if token not in token_count:
            token_count[token] = 0
            lr_map[token] = [0, 0]

        token_count[token] += 1
        lr_map[token][target] += 1

most_common_pos = sorted(lr_map.items(), key=lambda item: item[1][0], reverse=True)[:5]
most_common_neg = sorted(lr_map.items(), key=lambda item: item[1][1], reverse=True)[:5]

print('Five most commonly used words in positive tweets: ', most_common_pos)
print('Five most commonly used words in negative tweets: ', most_common_neg, '\n')


# LR score = positive mentions / negative mentions. Only tokens seen more than
# 10 times in each class are kept, which also rules out a zero denominator.
lr_map = {key: value for key, value in lr_map.items() if value[0] > 10 and value[1] > 10}
# ascending by LR score: lowest scores first, biggest scores last
lr_map_sorted = dict(sorted(lr_map.items(), key=lambda item: item[1][0] / item[1][1]))

first_five = dict(list(lr_map_sorted.items())[:5])
last_five = dict(list(lr_map_sorted.items())[-5:])

print('Tokens with biggest LR scores')

# last_five holds the five largest scores in ascending order; print them
# from the largest down
for token, counts in reversed(list(last_five.items())):
    print(f'Token: "{token}" | positive mentions: {counts[0]} | negative mentions: {counts[1]} | LR score = {(counts[0]/counts[1]):.5f}')


print('\nTokens with lowest LR scores')

for token, counts in first_five.items():
    print(f'Token: "{token}" | positive mentions: {counts[0]} | negative mentions: {counts[1]} | LR score = {(counts[0]/counts[1]):.5f}')

print('\nThe entire LR matrix: ', lr_map, '\n')

# vocabulary ordered from the most to the least frequent token
token_count = dict(sorted(token_count.items(), key=lambda entry: entry[1], reverse=True))
print('Vocabulary: ', token_count)


def score_token(token, sample, token_count):
    """Return the feature value of ``token`` for one tokenized tweet.

    The feature is binary presence: 1 when the token occurs in the sample,
    0 otherwise.

    Args:
        token: the vocabulary token being scored.
        sample: list of tokens of one tweet (may be empty).
        token_count: mapping token -> corpus frequency. It is not needed for
            the binary feature and is accepted only so that existing callers
            keep working.

    Returns:
        1 if ``token`` is in ``sample``, else 0.
    """
    # The earlier version also computed an inverse document frequency and a
    # local term frequency here without ever returning them. Those unused
    # computations raised ZeroDivisionError for an empty sample and KeyError
    # for a token missing from token_count, so they were removed.
    return 1 if token in sample else 0

# feature vector dimension limit
feat_max = 10000

# if nb of unique tokens in vocab exceeds feat_max we trim the excess
# (keeps the first feat_max entries in the dict's current order)
if len(token_count) > feat_max:
    token_count = dict(list(token_count.items())[:feat_max])

# column of each vocabulary token in the feature matrix
token_index = {token: column for column, token in enumerate(token_count)}

# one row per tweet that was actually loaded, one column per vocabulary token
X_features = np.zeros((len(dataset), len(token_count)), dtype=np.float32)

for sample_idx, sample in enumerate(dataset):

    # Only tokens that occur in the sample can have a non-zero score, so the
    # remaining columns keep their initial 0 and the vocabulary does not have
    # to be scanned for every tweet.
    for token in set(sample):

        token_idx = token_index.get(token)
        if token_idx is None:
            # token was trimmed out of the vocabulary
            continue

        X_features[sample_idx, token_idx] = score_token(token=token, sample=sample, token_count=token_count)

sum_accuracy = 0
avg_accuracy = 0.0

# spliting the data 80:20
train_test_split = int(len(X_features) * 0.8)

# number of independent shuffle / train / test rounds
n = 3
for i in range(1, n + 1):

    # shuffling data (features and targets with the same permutation)
    indices = np.random.permutation(len(X_features))

    X_features = X_features[indices]
    Y_targets = Y_targets[indices]

    X_train = X_features[:train_test_split]
    Y_train = Y_targets[:train_test_split]

    X_test = X_features[train_test_split:]
    Y_test = Y_targets[train_test_split:]

    model = MultinomialNaiveBayes(nb_classes=2, nb_words=len(token_count), pseudocount=1)
    # Fit on the training split only. Fitting on the full data would let the
    # model see the test samples and inflate the reported accuracy.
    model.fit(X_train, Y_train)

    hit_count = 0

    for idx, (bow, target) in enumerate(zip(X_test, Y_test)):

        # log-space prediction; model.predict_multiply is the plain-product variant
        prediction = model.predict(bow)

        if prediction == target:
            hit_count += 1

        # progress report every 100 test samples
        if idx % 100 == 0:
            print(f'Prediction for sample number: {str(idx)} / {len(X_test)} is {str(prediction)}')

    curr_accuracy = hit_count / len(X_test)
    print('Accuracy: ', str(curr_accuracy), 'for try number', str(i))

    sum_accuracy += curr_accuracy

avg_accuracy = sum_accuracy / n
print('\nAverage accuracy for all three tries: {:.5f}'.format(avg_accuracy))


Five most commonly used words in positive tweets:  [('lik', [308, 102]), ('im', [262, 69]), ('new', [228, 195]), ('get', [223, 88]), ('dont', [154, 55])]
Five most commonly used words in negative tweets:  [('fir', [92, 272]), ('bomb', [51, 242]), ('new', [228, 195]), ('kil', [27, 163]), ('via', [99, 121])] 

Tokens with biggest LR scores
Token: "kil" | positive mentions: 27 | negative mentions: 163 | LR score = 0.16564
Token: "train" | positive mentions: 19 | negative mentions: 109 | LR score = 0.17431
Token: "report" | positive mentions: 14 | negative mentions: 72 | LR score = 0.19444
Token: "bomb" | positive mentions: 51 | negative mentions: 242 | LR score = 0.21074
Token: "near" | positive mentions: 14 | negative mentions: 63 | LR score = 0.22222

Tokens with lowest LR scores
Token: "fuck" | positive mentions: 77 | negative mentions: 14 | LR score = 5.50000
Token: "ful" | positive mentions: 88 | negative mentions: 16 | LR score = 5.50000
Token: "yo" | positive mentions: 69 | negativ

Metrika LR = br_poz_koriscenja / br_neg_koriscenja daje to veci broj sto je broj koriscenja reci u pozitivnim tvitovima veci u odnosu na broj koriscenja iste reci u negativnim tvitovima; obrnuto, sto se rec cesce koristi u negativnim nego u pozitivnim tvitovima, to je vrednost metrike bliza nuli. Vrednost 1 znaci da se rec podjednako cesto javlja u obe klase. Metrika primenjena na konkretnu rec moze nam sluziti kao vid aproksimacije verovatnoce da je tvit koji sadrzi tu rec pozitivan ili negativan.

Na osnovu broja pojavljivanja reci, metrika LR = br_poz_koriscenja / br_neg_koriscenja moze predstavljati "tezinu" uticaja konkretne reci na to da li je tvit pozitivan ili negativan. Kao i Naive Bayes model, ova metrika daje aproksimaciju zasnovanu iskljucivo na broju koriscenja svake reci u tvitovima, bez uzimanja u obzir konteksta u kome je rec upotrebljena.