# NMSU CSCI-5435 Assignment 2

## Relevant Information

In [48]:
#Name:               Tianjie Chen
#Email:              tvc5586@nmsu.edu
#File Creation Date: Feb/7/2025
#Purpose of File:    NMSU CSCI-5435 Assignment 2
#Last Edit Date:     Feb/7/2025
#Last Edit Note:     File creation
#GenAI used:         False

## Load libraries

In [49]:
import string

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

## Preprocess data

### Load data

In [50]:
# Read the tab-separated review file; it has no header row, so column names are supplied here
df = pd.read_csv(
    "yelp_labelled.txt",
    sep='\t',
    names=["Sentence", "Score"],
)

In [51]:
# Split the frame into two parallel Python lists:
# first column -> sentences (X), second column -> scores (y)
X = df.iloc[:, 0].tolist()
y = df.iloc[:, 1].tolist()

### Remove Punctuation

Referenced the [Remove punctuation Tutorial](https://www.geeksforgeeks.org/python-remove-punctuation-from-string/)

In [52]:
# Remove punctuation
# Translation table that deletes every character listed in string.punctuation
translator = str.maketrans('', '', string.punctuation)

# Slice assignment keeps the same list object while replacing each sentence
# with its punctuation-free version
X[:] = [sentence.translate(translator) for sentence in X]

In [53]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Word tokenization

Referenced the [Remove Stopwords Tutorial](https://www.geeksforgeeks.org/removing-stop-words-nltk-python/)

In [54]:
# Create CountVectorizer
# Build the English stop-word list first (set() drops duplicate entries),
# then hand it to the vectorizer.
# NOTE(review): the sentences had punctuation stripped earlier, while this list
# is used as-is; any stop words containing apostrophes would no longer match
# the cleaned text — confirm whether that is intended.
english_stop_words = list(set(stopwords.words('english')))
word_vectorizer = CountVectorizer(stop_words = english_stop_words)

In [55]:
# Vectorization
# The vocabulary is learned from the training sentences only;
# the test sentences are encoded with that same fitted vocabulary.
word_train = word_vectorizer.fit_transform(X_train)
word_test = word_vectorizer.transform(X_test)

### Sub-word tokenization

In [56]:
from transformers import BertTokenizerFast

# Load the pretrained cased BERT tokenizer used for the sub-word experiments
old_tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-cased'
)

In [57]:
# Get tokens
# Run the sub-word tokenizer over every sentence of each split,
# producing one token sequence per sentence
train_tokens = [old_tokenizer.tokenize(sentence) for sentence in X_train]
test_tokens = [old_tokenizer.tokenize(sentence) for sentence in X_test]

In [58]:
# As demonstrated during the Feb 6 class
def dummy(doc):
    """Return ``doc`` unchanged.

    Acts as a no-op callable for places that require a function but where
    the input (e.g. an already-tokenized document) must pass through untouched.
    """
    return doc

In [59]:
# Create CountVectorizer for the pre-tokenized sub-word lists.
# `dummy` is used as both preprocessor and tokenizer so that each document
# (already a list of tokens) reaches the counting step unchanged.
# token_pattern = None: the regex is never consulted once a custom tokenizer is
# supplied, and leaving the default in place makes scikit-learn warn on fit
# that the parameter is unused.
# NOTE(review): the tokens come from the cased BERT tokenizer and nothing here
# lowercases them, so capitalised tokens presumably never match the stop-word
# list — confirm whether that is intended.
subword_vectorizer = CountVectorizer(
    tokenizer = dummy, 
    preprocessor = dummy, 
    token_pattern = None,
    stop_words = list(set(stopwords.words('english')))
)

In [60]:
# Vectorization
# Fit the vocabulary on the training token lists only,
# then encode the test token lists with that fitted vocabulary.
subword_train = subword_vectorizer.fit_transform(train_tokens)
subword_test = subword_vectorizer.transform(test_tokens)



## Training

### Naive Bayes

In [61]:
from sklearn.naive_bayes import MultinomialNB

In [62]:
# Fit one Multinomial Naive Bayes model per feature set.
# The count matrices are passed exactly as the vectorizers produced them:
# MultinomialNB accepts sparse input, so converting to a dense array only costs
# memory, and the evaluation step already predicts on the sparse matrices.
NB_word = MultinomialNB().fit(word_train, y_train)
NB_subword = MultinomialNB().fit(subword_train, y_train)

### Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression

In [64]:
# Fit one Logistic Regression model (default settings) per feature set.
# The count matrices are passed exactly as the vectorizers produced them:
# LogisticRegression accepts sparse input, so converting to a dense array only
# costs memory, and the evaluation step already predicts on the sparse matrices.
LR_word = LogisticRegression().fit(word_train, y_train)
LR_subword = LogisticRegression().fit(subword_train, y_train)

## Evaluation

In [65]:
from sklearn.metrics import accuracy_score

In [66]:
# Get score
# Step 1: predict the held-out split with each fitted model
NB_word_pred    = NB_word.predict(word_test)
NB_subword_pred = NB_subword.predict(subword_test)
LR_word_pred    = LR_word.predict(word_test)
LR_subword_pred = LR_subword.predict(subword_test)

# Step 2: compare each set of predictions with the true test labels
NB_word_acc    = accuracy_score(y_test, NB_word_pred)
NB_subword_acc = accuracy_score(y_test, NB_subword_pred)
LR_word_acc    = accuracy_score(y_test, LR_word_pred)
LR_subword_acc = accuracy_score(y_test, LR_subword_pred)

In [67]:
# Show score
# One line per (classifier, tokenization) pair; "Naive" was previously misspelled
print(f"""
    Naive Bayes Accuracy on word-based tokens: {NB_word_acc}
    Naive Bayes Accuracy on subword-based tokens: {NB_subword_acc}
    Logistic Regression Accuracy on word-based tokens: {LR_word_acc}
    Logistic Regression Accuracy on subword-based tokens: {LR_subword_acc}
    """
)


    Navie Bayes Accuracy on word-based tokens: 0.79
    Navie Bayes Accuracy on subword-based tokens: 0.725
    Logistic Regression Accuracy on word-based tokens: 0.79
    Logistic Regression Accuracy on subword-based tokens: 0.75
    


Based on these scores, word tokenization performs better than sub-word tokenization on this Yelp dataset, regardless of the classifier chosen (0.79 vs. 0.725 for Naive Bayes and 0.79 vs. 0.75 for Logistic Regression).

## Export Datasets

In [68]:
# Recombine datasets
# Pair every sentence with its label as a [sentence, score] row, one list per split
train_set = [[sentence, score] for sentence, score in zip(X_train, y_train)]
test_set = [[sentence, score] for sentence, score in zip(X_test, y_test)]

In [69]:
# Export to txt files
# Both splits share the same two column names; files are written with a header
# row, without the index, and with to_csv's default separator.
export_columns = ['sentence', 'score']

train_frame = pd.DataFrame(train_set, columns = export_columns)
train_frame.to_csv("yelp_train.txt", index = False, header = True)

test_frame = pd.DataFrame(test_set, columns = export_columns)
test_frame.to_csv("yelp_test.txt", index = False, header = True)