In [12]:
import os
import re
import string
from collections import defaultdict, Counter
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize

# Initialize the stemmer for stemming version
stemmer = PorterStemmer()

def load_data_from_files(directory):
    """
    Load text files from a directory.
    """
    data = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            data.append(file.read())
    return data

def preprocess_text(text, stem=False):
    """
    Preprocesses text by tokenizing, handling punctuation, emoticons, and stemming if specified.
    """
    # Tokenization pattern that handles words, punctuation, and emoticons
    pattern = r"""(?x)                  
                  (?:[A-Z]\.)+            # Initials like U.S.A.
                  |\$?\d+(?:\.\d+)?%?     # Currency/Percentages
                  |\w+(?:[-']\w+)*        # Words with optional internal hyphens/apostrophes
                  |[.,;"'?():-_`]+        # Punctuation
                  |[:;=8][\-o\*\']?[)\](\d]  # Basic emoticons (like :) or :-))
              """
    tokens = regexp_tokenize(text.lower(), pattern)
    tokens = [re.sub(r'<.*?>', '', word) for word in tokens]  # Remove HTML tags
    tokens = [stemmer.stem(word) if stem else word for word in tokens]
    return tokens

def build_vocabulary(data, stem=False):
    """
    Build vocabulary from the training data.
    """
    vocabulary = defaultdict(int)
    for text in data:
        tokens = preprocess_text(text, stem=stem)
        for word in tokens:
            vocabulary[word] += 1
    return vocabulary

# Load Data
train_positive_data = load_data_from_files('/Users/saivaruntanjoreraghavendra/Downloads/tweet 2/train/positive')
train_negative_data = load_data_from_files('/Users/saivaruntanjoreraghavendra/Downloads/tweet 2/train/negative')
test_positive_data = load_data_from_files('/Users/saivaruntanjoreraghavendra/Downloads/tweet 2/test/positive')
test_negative_data = load_data_from_files('/Users/saivaruntanjoreraghavendra/Downloads/tweet 2/test/negative')

# Combine training data
train_data = train_positive_data + train_negative_data
train_labels = [1] * len(train_positive_data) + [0] * len(train_negative_data)
test_data = test_positive_data + test_negative_data
test_labels = [1] * len(test_positive_data) + [0] * len(test_negative_data)

# Build Vocabulary (With and Without Stemming)
vocabulary_without_stemming = build_vocabulary(train_data, stem=False)
vocabulary_with_stemming = build_vocabulary(train_data, stem=True)

# Convert Documents to Feature Vectors (Binary and Frequency)
def document_to_vector(text, vocabulary, stem=False, binary=False):
    tokens = preprocess_text(text, stem=stem)
    word_counts = Counter(tokens)
    vector = np.zeros(len(vocabulary))
    
    for word, count in word_counts.items():
        if word in vocabulary:
            index = list(vocabulary.keys()).index(word)
            vector[index] = 1 if binary else count
    return vector

# Convert all training and test data into vectors
train_vectors_freq = [document_to_vector(text, vocabulary_without_stemming, stem=False, binary=False) for text in train_data]
train_vectors_bin = [document_to_vector(text, vocabulary_without_stemming, stem=False, binary=True) for text in train_data]
test_vectors_freq = [document_to_vector(text, vocabulary_without_stemming, stem=False, binary=False) for text in test_data]
test_vectors_bin = [document_to_vector(text, vocabulary_without_stemming, stem=False, binary=True) for text in test_data]

# Naive Bayes Model
class NaiveBayesClassifier:
    def __init__(self):
        self.class_word_counts = {0: defaultdict(int), 1: defaultdict(int)}
        self.class_totals = {0: 0, 1: 0}
        self.class_priors = {}

    def fit(self, data, labels):
        """
        Train Naive Bayes by calculating word frequencies for each class.
        """
        for text_vector, label in zip(data, labels):
            self.class_totals[label] += np.sum(text_vector)
            for i, count in enumerate(text_vector):
                self.class_word_counts[label][i] += count
        self.class_priors[0] = sum(1 for lbl in labels if lbl == 0) / len(labels)
        self.class_priors[1] = sum(1 for lbl in labels if lbl == 1) / len(labels)

    def predict(self, data):
        """
        Predict class labels for test data.
        """
        predictions = []
        for text_vector in data:
            log_prob_0 = np.log(self.class_priors[0])
            log_prob_1 = np.log(self.class_priors[1])
            for i, count in enumerate(text_vector):
                log_prob_0 += count * np.log((self.class_word_counts[0][i] + 1) / (self.class_totals[0] + len(self.class_word_counts[0])))
                log_prob_1 += count * np.log((self.class_word_counts[1][i] + 1) / (self.class_totals[1] + len(self.class_word_counts[1])))
            predictions.append(1 if log_prob_1 > log_prob_0 else 0)
        return predictions

    def accuracy(self, data, labels):
        predictions = self.predict(data)
        accuracy = np.mean(np.array(predictions) == np.array(labels))
        return accuracy

    def confusion_matrix(self, data, labels):
        predictions = self.predict(data)
        tp = sum(1 for p, l in zip(predictions, labels) if p == l == 1)
        tn = sum(1 for p, l in zip(predictions, labels) if p == l == 0)
        fp = sum(1 for p, l in zip(predictions, labels) if p == 1 and l == 0)
        fn = sum(1 for p, l in zip(predictions, labels) if p == 0 and l == 1)
        return np.array([[tn, fp], [fn, tp]])

# Train and Evaluate
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(train_vectors_freq, train_labels)

# Evaluate on test data
accuracy = nb_classifier.accuracy(test_vectors_freq, test_labels)
conf_matrix = nb_classifier.confusion_matrix(test_vectors_freq, test_labels)

print(f"Model Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

# Optional: Save results to a log file
with open('/Users/saivaruntanjoreraghavendra/Desktop/fall 2024/results.log', 'w') as log_file:
    log_file.write(f"Model Accuracy: {accuracy:.4f}\n")
    log_file.write("Confusion Matrix:\n")
    log_file.write(str(conf_matrix))


Model Accuracy: 0.8950
Confusion Matrix:
[[2922   78]
 [ 361  821]]


In [14]:
import os
import re
import string
from collections import defaultdict, Counter
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import regexp_tokenize

# Initialize the stemmer for stemming version
stemmer = PorterStemmer()

def load_data_from_files(directory):
    """
    Load text files from a directory.
    """
    data = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            data.append(file.read())
    return data

def preprocess_text(text, stem=False):
    """
    Preprocesses text by tokenizing, handling punctuation, emoticons, and stemming if specified.
    """
    pattern = r"""(?x)                  
                  (?:[A-Z]\.)+            # Initials like U.S.A.
                  |\$?\d+(?:\.\d+)?%?     # Currency/Percentages
                  |\w+(?:[-']\w+)*        # Words with optional internal hyphens/apostrophes
                  |[.,;"'?():-_`]+        # Punctuation
                  |[:;=8][\-o\*\']?[)\](\d]  # Basic emoticons (like :) or :-))
              """
    tokens = regexp_tokenize(text.lower(), pattern)
    tokens = [re.sub(r'<.*?>', '', word) for word in tokens]  # Remove HTML tags
    tokens = [stemmer.stem(word) if stem else word for word in tokens]
    return tokens

def build_vocabulary(data, stem=False):
    """
    Build vocabulary from the training data.
    """
    vocabulary = defaultdict(int)
    for text in data:
        tokens = preprocess_text(text, stem=stem)
        for word in tokens:
            vocabulary[word] += 1
    return vocabulary

# Load Data
train_positive_data = load_data_from_files('/Users/saivaruntanjoreraghavendra/Downloads/tweet 2/train/positive')
train_negative_data = load_data_from_files('/Users/saivaruntanjoreraghavendra/Downloads/tweet 2/train/negative')
test_positive_data = load_data_from_files('/Users/saivaruntanjoreraghavendra/Downloads/tweet 2/test/positive')
test_negative_data = load_data_from_files('/Users/saivaruntanjoreraghavendra/Downloads/tweet 2/test/negative')

# Combine training data
train_data = train_positive_data + train_negative_data
train_labels = [1] * len(train_positive_data) + [0] * len(train_negative_data)
test_data = test_positive_data + test_negative_data
test_labels = [1] * len(test_positive_data) + [0] * len(test_negative_data)

# Build Vocabulary (With and Without Stemming)
vocabulary = build_vocabulary(train_data, stem=False)

# Calculate IDF for each term in the vocabulary
def compute_idf(data, vocabulary):
    """
    Calculate IDF for each word in the vocabulary.
    """
    doc_count = defaultdict(int)
    num_docs = len(data)
    
    for text in data:
        tokens = set(preprocess_text(text))
        for token in tokens:
            if token in vocabulary:
                doc_count[token] += 1

    idf = {word: np.log(num_docs / (1 + doc_count[word])) for word in vocabulary}
    return idf

idf_values = compute_idf(train_data, vocabulary)

# Convert Documents to TF-IDF Vectors
def document_to_tfidf_vector(text, vocabulary, idf_values, stem=False):
    """
    Convert a document into a TF-IDF vector.
    """
    tokens = preprocess_text(text, stem=stem)
    word_counts = Counter(tokens)
    vector = np.zeros(len(vocabulary))
    
    for word, count in word_counts.items():
        if word in vocabulary:
            tf = count / len(tokens)  # Term Frequency
            tf_idf = tf * idf_values[word]  # TF-IDF
            index = list(vocabulary.keys()).index(word)
            vector[index] = tf_idf
    return vector

# Convert all training and test data into TF-IDF vectors
train_vectors_tfidf = [document_to_tfidf_vector(text, vocabulary, idf_values, stem=False) for text in train_data]
test_vectors_tfidf = [document_to_tfidf_vector(text, vocabulary, idf_values, stem=False) for text in test_data]

# Naive Bayes Model
class NaiveBayesClassifier:
    def __init__(self):
        self.class_word_counts = {0: defaultdict(int), 1: defaultdict(int)}
        self.class_totals = {0: 0, 1: 0}
        self.class_priors = {}

    def fit(self, data, labels):
        """
        Train Naive Bayes by calculating word frequencies for each class.
        """
        for text_vector, label in zip(data, labels):
            self.class_totals[label] += np.sum(text_vector)
            for i, count in enumerate(text_vector):
                self.class_word_counts[label][i] += count
        self.class_priors[0] = sum(1 for lbl in labels if lbl == 0) / len(labels)
        self.class_priors[1] = sum(1 for lbl in labels if lbl == 1) / len(labels)

    def predict(self, data):
        """
        Predict class labels for test data.
        """
        predictions = []
        for text_vector in data:
            log_prob_0 = np.log(self.class_priors[0])
            log_prob_1 = np.log(self.class_priors[1])
            for i, count in enumerate(text_vector):
                log_prob_0 += count * np.log((self.class_word_counts[0][i] + 1) / (self.class_totals[0] + len(self.class_word_counts[0])))
                log_prob_1 += count * np.log((self.class_word_counts[1][i] + 1) / (self.class_totals[1] + len(self.class_word_counts[1])))
            predictions.append(1 if log_prob_1 > log_prob_0 else 0)
        return predictions

    def accuracy(self, data, labels):
        predictions = self.predict(data)
        accuracy = np.mean(np.array(predictions) == np.array(labels))
        return accuracy

    def confusion_matrix(self, data, labels):
        predictions = self.predict(data)
        tp = sum(1 for p, l in zip(predictions, labels) if p == l == 1)
        tn = sum(1 for p, l in zip(predictions, labels) if p == l == 0)
        fp = sum(1 for p, l in zip(predictions, labels) if p == 1 and l == 0)
        fn = sum(1 for p, l in zip(predictions, labels) if p == 0 and l == 1)
        return np.array([[tn, fp], [fn, tp]])

# Train and Evaluate
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(train_vectors_tfidf, train_labels)

# Evaluate on test data
accuracy = nb_classifier.accuracy(test_vectors_tfidf, test_labels)
conf_matrix = nb_classifier.confusion_matrix(test_vectors_tfidf, test_labels)

print(f"Model Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

# Optional: Save results to a log file
with open('/Users/saivaruntanjoreraghavendra/Desktop/fall 2024/results-tfidf.log', 'w') as log_file:
    log_file.write(f"Model Accuracy: {accuracy:.4f}\n")
    log_file.write("Confusion Matrix:\n")
    log_file.write(str(conf_matrix))


Model Accuracy: 0.7944
Confusion Matrix:
[[2997    3]
 [ 857  325]]
