In [1]:
!pip install nltk



In [2]:
import csv
import random
import re
import math
from collections import Counter
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import  word_tokenize
from nltk.tag import pos_tag
import requests
import io
from google.colab import files

In [3]:
# Fetch the NLTK resources this notebook relies on: tokenizer models
# (punkt / punkt_tab), POS-tagger models, the English stop-word list and
# WordNet data (wordnet / omw-1.4) for lemmatization.
# NOTE(review): both the legacy and the newer "_tab" / "_eng" package names are
# requested, presumably so the cell works across NLTK versions - confirm.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
# Open the Colab file-upload dialog (google.colab.files). The uploaded CSV is
# later opened by file name ("disaster-tweets.csv") in the final cell; the
# returned `uploaded` object itself is not used elsewhere in the visible cells.
uploaded = files.upload()

Saving disaster-tweets.csv to disaster-tweets.csv


In [5]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with "J", "V", "N" or "R" map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag (including an
    empty string) falls back to wordnet.NOUN.
    """
    prefix_to_pos = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    # Only the first character of the tag decides the category.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)

In [6]:
def load_and_clean_data(path):
    """Load the tweet CSV at ``path`` and return cleaned, lemmatized tokens per row.

    For every row the "text" column is lower-cased, URLs are removed, every
    character other than a-z / whitespace is replaced by a space, and the result
    is tokenized, POS-tagged and lemmatized. Tokens that are stop words or
    shorter than three characters are dropped (the check runs on the raw token,
    before lemmatization).

    Args:
        path: path to a UTF-8 CSV file with "text" and "target" columns.

    Returns:
        A list of ``(words, target)`` tuples, where ``words`` is the list of
        lemmatized tokens and ``target`` is the integer value of the "target"
        column.

    Raises:
        requests.RequestException: if the custom stop-word list cannot be
            downloaded (network error, timeout or non-2xx HTTP status).
    """
    stopwords_url = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"

    # A timeout keeps the cell from hanging forever on a dead connection, and
    # raise_for_status() prevents an HTTP error page from being silently used
    # as the stop-word list.
    response = requests.get(stopwords_url, timeout=30)
    response.raise_for_status()
    custom_stopwords = set(response.content.decode().splitlines())
    # Presumably the residue of the HTML entity "&amp;" once punctuation has
    # been stripped - verify against the data.
    custom_stopwords.add("amp")

    all_stopwords = set(stopwords.words("english")).union(custom_stopwords)
    lemmatizer = WordNetLemmatizer()

    data = []
    # newline="" is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(path, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            text = row["text"].lower()
            text = re.sub(r"http\S+|www\S+", "", text)   # drop URLs
            text = re.sub(r"[^a-z\s]", " ", text)        # keep only letters and whitespace
            text = re.sub(r"\s+", " ", text).strip()     # collapse whitespace runs

            tagged = pos_tag(word_tokenize(text))

            words = [
                lemmatizer.lemmatize(w, get_wordnet_pos(pos))
                for w, pos in tagged
                if w not in all_stopwords and len(w) > 2
            ]
            data.append((words, int(row["target"])))
    return data

In [7]:
def build_vocab(data, max_words=10000):
    """Build a word -> index vocabulary from the most frequent tokens.

    Args:
        data: iterable of ``(words, label)`` pairs; only ``words`` is used.
        max_words: maximum vocabulary size (the ``max_words`` most common
            tokens are kept).

    Returns:
        A dict mapping each kept token to its frequency rank (0 = most common).
    """
    token_counts = Counter(token for tokens, _label in data for token in tokens)
    ranked = token_counts.most_common(max_words)
    return {token: rank for rank, (token, _count) in enumerate(ranked)}

def vectorize(data, vocab):
    """Convert tokenized samples into dense bag-of-words count vectors.

    Args:
        data: iterable of ``(words, label)`` pairs.
        vocab: dict mapping token -> column index.

    Returns:
        ``(vectors, labels)``: ``vectors`` holds one list of ``len(vocab)``
        counts per sample, and ``labels`` the labels in the same order.
        Tokens missing from ``vocab`` are ignored.
    """
    width = len(vocab)
    vectors, labels = [], []
    for tokens, target in data:
        row = [0] * width
        for token in tokens:
            column = vocab.get(token)
            if column is not None:
                row[column] += 1
        vectors.append(row)
        labels.append(target)
    return vectors, labels

In [8]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier over bag-of-words count vectors.

    After ``fit`` the instance exposes:
        priors: list of log class priors, one per class.
        likelihoods: ``likelihoods[c][i]`` is the smoothed log-probability of
            word ``i`` under class ``c``.
    """

    def __init__(self, num_classes, num_words, pseudocount=1):
        """Store the model dimensions and the additive-smoothing pseudocount.

        Args:
            num_classes: number of classes; labels are expected in
                ``range(num_classes)``.
            num_words: vocabulary size (length of each count vector).
            pseudocount: value added to every word count before normalizing.
        """
        self.num_classes = num_classes
        self.num_words = num_words
        self.pseudocount = pseudocount

    @staticmethod
    def _log_ratio(num, denom):
        """Return ``log(num / denom)``, or ``-inf`` when ``num`` is zero."""
        # math.log(0) raises ValueError; a zero probability is represented as
        # -inf so that the class simply can never win in predict().
        if num == 0:
            return -math.inf
        return math.log(num / denom)

    def fit(self, X, Y):
        """Estimate log priors and smoothed log likelihoods from count vectors.

        Args:
            X: sequence of count vectors, each of length ``num_words``.
            Y: sequence of integer class labels aligned with ``X``.

        Raises:
            ValueError: if ``Y`` is empty.
        """
        total = len(Y)
        if total == 0:
            raise ValueError("cannot fit on an empty training set")

        samples_per_class = [0] * self.num_classes
        class_counts = [0] * self.num_classes
        word_counts = [[0] * self.num_words for _ in range(self.num_classes)]

        for x, y in zip(X, Y):
            samples_per_class[y] += 1
            row = word_counts[y]
            for i, count in enumerate(x):
                row[i] += count
            class_counts[y] += sum(x)

        # A class with no training samples gets a prior of -inf instead of
        # crashing on log(0).
        self.priors = [self._log_ratio(n, total) for n in samples_per_class]

        self.likelihoods = []
        for c in range(self.num_classes):
            denom = class_counts[c] + self.num_words * self.pseudocount
            self.likelihoods.append([
                self._log_ratio(word_counts[c][i] + self.pseudocount, denom)
                for i in range(self.num_words)
            ])

    def predict(self, x):
        """Return the index of the highest-scoring class for count vector ``x``.

        Ties are resolved in favour of the lowest class index.
        """
        scores = []
        for c in range(self.num_classes):
            score = self.priors[c]
            log_probs = self.likelihoods[c]
            for i in range(self.num_words):
                count = x[i]
                # Zero counts contribute nothing; skipping them also avoids
                # 0 * -inf = nan when a likelihood is -inf.
                if count:
                    score += count * log_probs[i]
            scores.append(score)
        return scores.index(max(scores))

In [9]:
def run_part_a(path, runs=3, seed=None):
    """Train and evaluate the Naive Bayes classifier on repeated random 80/20 splits.

    For each run the data is shuffled in place, split 80% train / 20% test, a
    vocabulary is built from the training part only, and the test accuracy is
    printed. The mean accuracy over all runs is printed at the end.

    Args:
        path: CSV file passed to ``load_and_clean_data``.
        runs: number of shuffle/train/evaluate repetitions (must be >= 1).
        seed: optional seed. When given, shuffling uses a dedicated
            ``random.Random(seed)`` so the reported accuracies are
            reproducible; when ``None`` the module-level ``random`` generator
            is used, as before.

    Raises:
        ValueError: if ``runs`` is less than 1 or the dataset has fewer than
            two samples (no non-empty train/test split is possible).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    data = load_and_clean_data(path)
    if len(data) < 2:
        raise ValueError("need at least two samples for a train/test split")

    shuffle = random.Random(seed).shuffle if seed is not None else random.shuffle

    accuracies = []
    for _ in range(runs):
        shuffle(data)
        split = int(0.8 * len(data))
        train_data = data[:split]
        test_data = data[split:]

        # The vocabulary comes from the training split only, so test-only
        # words are ignored by vectorize().
        vocab = build_vocab(train_data)
        X_train, Y_train = vectorize(train_data, vocab)
        X_test, Y_test = vectorize(test_data, vocab)

        model = MultinomialNaiveBayes(2, len(vocab))
        model.fit(X_train, Y_train)

        correct = sum(1 for x, y in zip(X_test, Y_test) if model.predict(x) == y)
        acc = correct / len(Y_test)
        print(f"Accuracy: {acc*100:.2f}%")
        accuracies.append(acc)

    avg = sum(accuracies) / len(accuracies)
    print(f"Prosečna tačnost u {runs} pokretanja: {avg*100:.2f}%")

In [10]:
def analyze_words(path):
    """Print the most frequent words per class and the LR (likelihood-ratio) rankings.

    Words are counted separately for tweets with label 1 ("positive") and all
    other tweets ("negative"). LR(word) is the positive count divided by the
    negative count, computed only for words occurring at least 10 times in
    both groups. The five highest and five lowest LR words are printed.
    """
    def print_counts(header, pairs):
        print(header)
        for word, count in pairs:
            print(f"{word}: {count}")

    def print_scores(header, pairs):
        print(header)
        for word, score in pairs:
            print(f"{word}: {score:.2f}")

    def by_score(item):
        return item[1]

    tallies = {True: Counter(), False: Counter()}
    for tokens, target in load_and_clean_data(path):
        tallies[target == 1].update(tokens)
    pos_counter, neg_counter = tallies[True], tallies[False]

    print_counts("\nTop 5 reči u pozitivnim tvitovima:", pos_counter.most_common(5))
    print_counts("\nTop 5 reči u negativnim tvitovima:", neg_counter.most_common(5))

    # LR metric: ratio of positive to negative occurrences, restricted to
    # words seen at least 10 times in each group.
    lr_scores = {
        word: pos_total / neg_counter[word]
        for word, pos_total in pos_counter.items()
        if pos_total >= 10 and neg_counter[word] >= 10
    }

    ranked_pairs = list(lr_scores.items())
    top5_high = sorted(ranked_pairs, key=by_score, reverse=True)[:5]
    top5_low = sorted(ranked_pairs, key=by_score)[:5]

    print_scores("\nTop 5 reči sa NAJVEĆOM LR metrikom:", top5_high)
    print_scores("\nTop 5 reči sa NAJMANJOM LR metrikom:", top5_low)


In [11]:
# Entry point: run the classification experiment, then the word-frequency / LR
# analysis, both on the uploaded CSV. Each call loads and cleans the file
# independently via load_and_clean_data.
if __name__ == "__main__":
    run_part_a("disaster-tweets.csv")
    analyze_words("disaster-tweets.csv")

Accuracy: 79.19%
Accuracy: 78.66%
Accuracy: 79.51%
Prosečna tačnost u 3 pokretanja: 79.12%

Top 5 reči u pozitivnim tvitovima:
kill: 159
news: 151
bomb: 132
disaster: 122
california: 115

Top 5 reči u negativnim tvitovima:
body: 119
love: 117
time: 109
bag: 109
day: 104

Top 5 reči sa NAJVEĆOM LR metrikom:
kill: 8.37
train: 5.61
report: 5.50
fire: 5.06
fatal: 4.58

Top 5 reči sa NAJMANJOM LR metrikom:
love: 0.13
scream: 0.17
feel: 0.19
play: 0.20
wreck: 0.22


Najčešće korišćene reči u pozitivnim i negativnim tvitovima pokazuju jasnu razliku u temama koje se obrađuju.
U pozitivnim tvitovima (relevantnim za katastrofe) preovlađuju reči poput "kill", "news", "bomb", "disaster" i "california", koje uglavnom asociraju na hitne slučajeve, nasilje i vanredne događaje.
Nasuprot tome, u negativnim (nerelevantnim) tvitovima najzastupljenije su reči iz svakodnevne komunikacije, kao što su "body", "love", "time", "bag" i "day", što sugeriše da ti tvitovi ne sadrže elemente povezane sa katastrofama.

LR metrika se koristi kao mera koja pokazuje koliko je neka reč indikativna za pozitivne ili negativne tvitove.
Računa se po formuli: LR(reč) = broj pojavljivanja reči u pozitivnim tvitovima / broj pojavljivanja reči u negativnim tvitovima.
U obzir se uzimaju samo reči koje se pojavljuju najmanje deset puta u oba korpusa, kako bi se izbegla pristrasnost izazvana retkim rečima.

Reči sa NAJVEĆOM LR metrikom: 'kill', 'train', 'report', 'fire', 'fatal' su reči koje snažno ukazuju na katastrofe, nesreće i vanredne situacije. Značajno se češće pojavljuju u pozitivnim tvitovima (relevantnim za katastrofe) nego u negativnim, pa imaju visoke LR vrednosti.
Reči sa NAJMANJOM LR metrikom: 'love', 'scream', 'feel', 'play', 'wreck' predstavljaju svakodnevne izraze ili izraze emocija koji se češće koriste u opštim, nerelevantnim tvitovima. Njihovo prisustvo sugeriše da tvit verovatno nije vezan za katastrofu, pa su korisni negativni indikatori.
LR metrika je korisna jer ne meri samo učestalost reči, već i koliko doprinosi razdvajanju klasa. Reči sa visokom LR vrednošću pomažu modelu da tačnije klasifikuje tvit kao pozitivan, dok one sa niskom vrednošću mogu doprineti negativnoj klasifikaciji. U poređenju sa prostom učestalošću, LR pruža bolji uvid u informativnost reči za klasifikaciju.
