In [282]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter
from sklearn.model_selection import train_test_split

In [283]:
# Load datasets
df_train = pd.read_csv("../data/datav6.csv")
df_test = pd.read_csv("../data/test.csv")

# Extract features and labels
X_train = df_train["comment"]
y_train = df_train["label"].map(
    {"complaint": 0, "inquiry": 1, "suggestion": 2, "compliment": 3})
X_test = df_test["comment"]
y_test = df_test["label"].map(
    {"complaint": 0, "inquiry": 1, "suggestion": 2, "compliment": 3})


In [284]:
# df = pd.read_csv("../data/claude.csv")
# X = df["comment"]
# y = df["label"].map({"complaint": 0, "inquiry": 1, "suggestion": 2, "compliment": 3})
#
# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [285]:
def compute_tfidf(corpus, vocab_index=None):
    word_counts = [Counter(text.split()) for text in corpus]

    if vocab_index is None:  # Create vocabulary only for training data
        doc_freq = Counter(word for doc in word_counts for word in doc)
        vocab = list(doc_freq.keys())
        vocab_index = {word: i for i, word in enumerate(vocab)}
    else:
        vocab = list(vocab_index.keys())

    num_docs = len(corpus)
    tfidf_matrix = np.zeros((num_docs, len(vocab)))

    for i, doc in enumerate(word_counts):
        for word, count in doc.items():
            if word in vocab_index:  # Use existing vocab
                tf = count / sum(doc.values())  # Term Frequency
                idf = np.log((len(vocab) + 1) / (
                        1 + sum(1 for d in word_counts if word in d)))  # IDF
                tfidf_matrix[i, vocab_index[word]] = tf * idf

    return tfidf_matrix, vocab_index

In [286]:
# Convert text to TF-IDF features
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# X_train_tfidf, vocab_index = compute_tfidf(X_train)
# X_test_tfidf, _ = compute_tfidf(X_test)


In [287]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # Numerical stability
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)


def train_logistic_regression(X, y, lr=0.01, epochs=1000):
    m, n = X.shape
    k = len(np.unique(y))  # Number of classes
    weights = np.zeros((n, k))
    bias = np.zeros(k)
    y_one_hot = np.eye(k)[y]  # One-hot encoding

    for _ in range(epochs):
        linear_model = np.dot(X, weights) + bias
        predictions = softmax(linear_model)

        error = predictions - y_one_hot

        weights -= lr * np.dot(X.T, error) / m
        bias -= lr * np.mean(error, axis=0)

    return weights, bias


def predict(X, weights, bias):
    linear_model = np.dot(X, weights) + bias
    predictions = softmax(linear_model)
    return np.argmax(predictions, axis=1)



In [288]:
# Train model
weights, bias = train_logistic_regression(X_train_tfidf, y_train.to_numpy(), epochs=100)

In [289]:
# Predictions
y_pred = predict(X_test_tfidf, weights, bias)


In [290]:
# Convert numerical predictions back to labels
label_map = {0: "Complaint", 1: "Inquiry", 2: "Suggestion", 3: "Compliment"}
y_pred_labels = np.array([label_map[val] for val in y_pred])
y_test_labels = np.array([label_map[val] for val in y_test])

# Print results
print("Accuracy on Training Data:",
      accuracy_score(y_train, predict(X_train_tfidf, weights, bias)))
print("Accuracy on Testing Data:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test_labels, y_pred_labels, zero_division=0))


Accuracy on Training Data: 0.26666666666666666
Accuracy on Testing Data: 0.17647058823529413
Classification Report:
              precision    recall  f1-score   support

   Complaint       0.00      0.00      0.00        18
  Compliment       0.00      0.00      0.00        18
     Inquiry       0.00      0.00      0.00        20
  Suggestion       0.18      1.00      0.30        12

    accuracy                           0.18        68
   macro avg       0.04      0.25      0.07        68
weighted avg       0.03      0.18      0.05        68

