In [1]:
## Importing Libraries and Data
import pandas as pd
import numpy as np
import math
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [18]:
# One Porter stemmer instance, shared by the preprocessing function
stemmer = PorterStemmer()
# Fetch the NLTK stop-word corpus; quiet=True hides the download information
nltk.download('stopwords', quiet=True)
# Keep the English stop words in a set for fast membership tests
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

In [21]:
def generate_ngrams(words, n):
    """
    Generate n-gram sequences
    Each n-gram is the space-joined run of n consecutive items of ``words``,
    listed in order of starting position.
    :param words: word list
    :param n: The value of n for n-grams; must be at least 1
    :return: generate n-gram list (empty when ``words`` has fewer than n items)
    :raises ValueError: if n is smaller than 1
    """
    # Without this guard n == 0 returns len(words) + 1 empty strings and a
    # negative n returns overlapping slices; both would silently pollute the
    # feature list instead of signalling the mistake.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    last_start = len(words) - n
    return [' '.join(words[start:start + n]) for start in range(last_start + 1)]

In [23]:
def preprocess_with_trigrams(text):
    """
    Text preprocessing function: conversion to lowercase, removal of every
    character that is not a-z or whitespace, stop word filtering, stemming,
    and generation of unigrams, bigrams and trigrams
    :param text: Original text. ``None`` and float NaN are treated as an empty
                 document; any other non-string value is converted with ``str()``
    :return: List of features: all unigrams, then all bigrams, then all trigrams
    """
    # A missing value (None, or the float NaN pandas uses for empty cells) has
    # no .lower() method; treat it as a document without features instead of
    # crashing the whole training / prediction run.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove non-alphabetic characters (they are deleted, not replaced by a
    # space, so "well-known" becomes the single token "wellknown")
    text = re.sub(r'[^a-z\s]', '', text)
    # Word segmentation, stop word filtering, stemming
    words = [stemmer.stem(w) for w in text.split() if w not in stop_words]
    # The stemmed words are the unigrams; bigrams and trigrams are built from
    # the same filtered sequence
    bigrams = generate_ngrams(words, 2)
    trigrams = generate_ngrams(words, 3)
    # Returns the combined features as a new list
    return words + bigrams + trigrams

In [25]:
def train_naive_bayes_trigrams(x_train, y_train):
    """
    Train the Naive Bayes model to count word frequency and category information
    Only the data passed in is counted, so pass the training split and keep the
    test data out.
    :param x_train: training text data (pandas Series or any sized iterable of texts)
    :param y_train: training label data, aligned by position with x_train
    :return: word frequency dictionary, category count, total number of words in each category, vocabulary
             The three dictionaries are keyed by label and always contain
             'A', 'G', 'S' and 'W' (possibly with zero counts); any other label
             found in y_train is added when first seen.
    :raises ValueError: if x_train and y_train have different lengths
    """
    if len(x_train) != len(y_train):
        raise ValueError(
            f"x_train and y_train must have the same length, "
            f"got {len(x_train)} and {len(y_train)}"
        )

    default_labels = ('A', 'G', 'S', 'W')
    # Number of training documents per category
    class_counts = {label: 0 for label in default_labels}
    # Per category: feature -> number of occurrences
    word_freqs = {label: {} for label in default_labels}
    # Per category: total number of feature occurrences. Kept as a running
    # count so the features themselves do not have to be stored.
    total_words = {label: 0 for label in default_labels}
    # Every distinct feature seen in any category
    vocabulary = set()

    # zip pairs texts and labels by position, so a Series whose index is no
    # longer contiguous after a train/test split is handled correctly.
    for text, label in zip(x_train, y_train):
        # Preprocess text to obtain features
        features = preprocess_with_trigrams(text)
        # Register labels outside the default set instead of raising KeyError
        if label not in class_counts:
            class_counts[label] = 0
            word_freqs[label] = {}
            total_words[label] = 0
        class_counts[label] += 1
        total_words[label] += len(features)
        # Update word frequency and vocabulary
        label_freqs = word_freqs[label]
        for feature in features:
            label_freqs[feature] = label_freqs.get(feature, 0) + 1
        vocabulary.update(features)

    return word_freqs, class_counts, total_words, list(vocabulary)

In [27]:
def naive_bayes_predict_trigrams(test_texts, word_freqs, class_counts, total_words, vocabulary, alpha=0.5):
    """
    Predict a category for each text with the trained Naive Bayes counts
    Each category is scored as log2(prior) plus the sum over the text's
    features of log2((count + alpha) / (total_words + alpha * vocabulary size)).
    :param test_texts: iterable of raw texts to classify
    :param word_freqs: per-category feature frequency dictionaries
    :param class_counts: number of training documents per category; its keys
                         define the candidate labels and their tie-break order
    :param total_words: total number of feature occurrences per category
    :param vocabulary: collection of all known features (only its size is used)
    :param alpha: additive smoothing constant
    :return: list with one predicted label per input text
    :raises ValueError: if a text has to be scored but no category has any
                        training document
    """
    # Calculate the total number of documents
    total_docs = sum(class_counts.values())
    # Calculating vocabulary size
    vocab_size = len(vocabulary)

    # Prior and denominator depend only on the category, so compute them once
    # instead of once per text. Categories without training documents are
    # skipped: their prior would be log(0), which math.log rejects, and they
    # could never win anyway.
    scored_classes = []
    for label, doc_count in class_counts.items():
        if doc_count <= 0:
            continue
        prior = math.log(doc_count / total_docs, 2)
        denom = total_words.get(label, 0) + alpha * vocab_size
        scored_classes.append((label, prior, denom, word_freqs.get(label, {})))

    result = []
    for text in test_texts:
        if not scored_classes:
            raise ValueError("class_counts contains no training documents; train the model first")
        # Processing text to obtain features
        words = preprocess_with_trigrams(text)
        log_probs = {}
        # Calculate the log probability for each class
        for label, prior, denom, freqs in scored_classes:
            likelihood = 0
            for word in words:
                smoothed = freqs.get(word, 0) + alpha
                if smoothed <= 0 or denom <= 0:
                    # Zero probability (only possible without smoothing):
                    # log(0) is -inf rather than a math domain error.
                    likelihood = float('-inf')
                    break
                likelihood += math.log(smoothed / denom, 2)
            # Store log probability
            log_probs[label] = prior + likelihood
        # Select the category with the highest probability; on ties the
        # category listed first in class_counts wins.
        result.append(max(log_probs, key=log_probs.get))
    return result

In [29]:
def cross_validate_naive_bayes_trigrams(x, y, k=5, alpha=0.5, random_state=42):
    """
    Run shuffled k-fold cross validation of the trigram Naive Bayes model
    For every fold the model is trained on the training part, evaluated on the
    held-out part, and the fold accuracy is printed; mean and standard
    deviation of the fold accuracies are printed at the end.
    :param x: text data (pandas Series; rows are selected with .iloc)
    :param y: label data, aligned by position with x
    :param k: number of folds
    :param alpha: smoothing constant passed to the predictor, so the value
                  used for the final model can be cross-validated as well
    :param random_state: seed for the fold shuffling
    :return: list with the accuracy of each fold, in fold order
    """
    # Initialize cross validation
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    acc_list = []
    # Go through each fold (numbered from 1 for the printed report)
    for fold, (train_idx, val_idx) in enumerate(kf.split(x), 1):
        # The fold indices are positions, hence .iloc
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        # Training the model
        word_freqs, class_counts, total_words, vocabulary = train_naive_bayes_trigrams(x_train, y_train)
        # prediction
        y_pred = naive_bayes_predict_trigrams(x_val, word_freqs, class_counts, total_words, vocabulary, alpha=alpha)
        # Accuracy
        acc = accuracy_score(y_val, y_pred)
        acc_list.append(acc)
        print(f"{fold} fold accuracy (Trigrams): {acc:.4f}")

    print(f"\n Average accuracy (Trigrams): {np.mean(acc_list):.4f} ± {np.std(acc_list):.4f}")
    return acc_list

In [31]:
# Read the labelled data and hold out 20% of it for evaluation
# (fixed random_state so the split is reproducible)
data = pd.read_csv("train.csv")
x, y = data["Description"], data["Class"]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit the trigram Naive Bayes counts on the training split only,
# then classify the held-out split with smoothing alpha=0.3
word_freqs, class_counts, total_words, vocabulary = train_naive_bayes_trigrams(
    x_train, y_train
)
y_pred = naive_bayes_predict_trigrams(
    x_test, word_freqs, class_counts, total_words, vocabulary, alpha=0.3
)

# Report overall accuracy and the per-class metrics
holdout_accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy after using Trigrams:", holdout_accuracy)
print("\nDetailed evaluation report after using Trigrams:")
print(classification_report(y_test, y_pred))

# Optional: uncomment to run 5-fold cross validation on the full data
#cross_validate_naive_bayes_trigrams(x, y, k=5)


Accuracy after using Trigrams: 0.9818181818181818

Detailed evaluation report after using Trigrams:
              precision    recall  f1-score   support

           A       0.98      0.99      0.99       346
           G       0.97      0.91      0.94        43
           S       0.81      0.96      0.88        23
           W       0.99      0.98      0.99       468

    accuracy                           0.98       880
   macro avg       0.94      0.96      0.95       880
weighted avg       0.98      0.98      0.98       880



In [32]:
# Classify the descriptions in test.csv with the trigram model trained above
test_set = pd.read_csv("test.csv")
test = test_set["Description"]
predict = naive_bayes_predict_trigrams(
    test, word_freqs, class_counts, total_words, vocabulary, alpha=0.3
)
# Attach the predicted labels as a new "Class" column
test_set["Class"] = predict
# Save everything except the raw description text to a CSV file
submission = test_set.drop(columns=["Description"])
submission.to_csv("tst_kaggle_trigrams.csv", index=False)