# **CS72 Final Project: A comparison of sentiment analysis methods on Nüshu** <br>
Authors: Youmi Ji (youmi.ji.28@dartmouth.edu), Alice Liu (lingxiao.liu.28@dartmouth.edu) <br>
Date: 06/03/25 <br>
Purpose: Using multiple methods to conduct sentiment analysis on Nushu characters, and comparing their effectiveness <br>

# Step 1: Label sentiment categories in Nushu500 <br>

As native speakers of Mandarin, we were able to manually annotate the 500 simplified Chinese sentences in Nushu500. We classified them into 0 (negative), 1 (neutral), and 2 (positive), based on certain combinations of characters that contained sentiment.


# Step 2 : Fine-tune bert-base-chinese on ancient Chinese poetry <br>

We used the code from HW 6.1 and changed the model to bert-base-chinese to yield more accurate results.

In [None]:
!pip install transformers



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('chineseemo.tsv', delimiter='\t', header=None)

In [None]:
batch_1 = df[:500]

In [None]:
batch_1[1].value_counts()

Unnamed: 0_level_0,count
1,Unnamed: 1_level_1
2,225
1,172
0,103


In [None]:
# Load the pre-trained bert model

from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenization
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
# Padding ensures that the tokenized sequences have the same length
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [None]:
np.array(padded).shape

(500, 20)

In [None]:
# Masking
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(500, 20)

In [None]:
# With a CPU, it takes about 4 minutes

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
labels = batch_1[1]

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=42)

In [None]:
# Logistics Regression
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
# Evaluating new model
lr_clf.score(test_features, test_labels)

0.56

In [None]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.448 (+/- 0.01)


In [None]:
# https://huggingface.co/transformers/model_doc/distilbert.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
new_input_ids = torch.tensor(tokenizer.encode("独在异乡为异客", add_special_tokens=True)).unsqueeze(0) # Translation: being a stranger in a foreign land
new_outputs = model(new_input_ids)
new_last_hidden_states = [new_outputs[0].detach().numpy()[0][0]]
lr_clf.predict_proba(new_last_hidden_states)

array([[0.59768565, 0.15241737, 0.24989699]])

In [None]:
# Classification report and confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
y_pred = lr_clf.predict(test_features)
print(classification_report(test_labels, y_pred))
print(confusion_matrix(test_labels, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.57      0.54        23
           1       0.49      0.49      0.49        45
           2       0.64      0.61      0.62        57

    accuracy                           0.56       125
   macro avg       0.55      0.56      0.55       125
weighted avg       0.56      0.56      0.56       125

[[13  4  6]
 [ 9 22 14]
 [ 3 19 35]]


# Step 3: Transfer the sentiment labels onto their Nushu counterparts using the parallel corpus

# Step 4: Conduct sentiment anlaysis on Nushu with 3 methods:

# Method 1: Fine-tuning a BERT on Nushu+MandarinEmotions <br>


In [None]:
df = pd.read_csv('nushuemo.tsv', delimiter='\t', header=None)

In [None]:
batch_1 = df[:500]

In [None]:
batch_1[1].value_counts()

Unnamed: 0_level_0,count
1,Unnamed: 1_level_1
2,225
1,172
0,103


In [None]:
# Load the pre-trained bert model

from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenization
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
# Padding ensures that the tokenized sequences have the same length
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [None]:
np.array(padded).shape

(500, 5)

In [None]:
# Masking
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(500, 5)

In [None]:
# With a CPU, it takes about 4 minutes

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
labels = batch_1[1]

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels,random_state = 42)

In [None]:
# Logistics Regression
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
# Evaluating new model
lr_clf.score(test_features, test_labels)

0.456

In [None]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.448 (+/- 0.01)


In [None]:
# https://huggingface.co/transformers/model_doc/distilbert.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
new_input_ids = torch.tensor(tokenizer.encode("𛉻𛊼𛇍𛆹𛇤𛉚𛆸𛇾𛆹𛇹𛆬𛇩𛆹𛇤𛆳𛆷𛇃𛇲", add_special_tokens=True)).unsqueeze(0) # translation: but this is my truest feeling at the time I can't escape it
new_outputs = model(new_input_ids)
new_last_hidden_states = [new_outputs[0].detach().numpy()[0][0]]
lr_clf.predict_proba(new_last_hidden_states)

array([[0.21389725, 0.33690667, 0.44919609]])

In [None]:
# Classification report and confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
y_pred = lr_clf.predict(test_features)
print(classification_report(test_labels, y_pred))
print(confusion_matrix(test_labels, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        23
           1       0.00      0.00      0.00        45
           2       0.46      1.00      0.63        57

    accuracy                           0.46       125
   macro avg       0.15      0.33      0.21       125
weighted avg       0.21      0.46      0.29       125

[[ 0  0 23]
 [ 0  0 45]
 [ 0  0 57]]


# Method 2: Fine-tuning a CNN on Nushu+MandarinEmotions <br>
In this method, we will use Keras to train a character-level convolutional neural network. The code is based on the template for "Homework 5.2: Convolutional Neural Networks and ASL." Some major changes that I made to the code include changing from Conv2D to Conv1D,

Description: the following program uses convolutional neural network to recognize patterns in Nushu text (database 'Nushu500' provided by Ivory Yang, a graduate student at Dartmouth College).

*A convolutional neural network (ConvNet/CNN) is optimized to understand visual data. This code in particular comes from this URL:
https://github.com/samurainote/CNN_for_Sign_Language_Images/blob/master/CNN_for_Sign_Language_Images.ipynb*


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Load and prepare data
train_df = pd.read_csv("nvshu_train.tsv", sep="\t")
test_df = pd.read_csv("nvshu_test.tsv", sep="\t")
train_df.columns = ["Chinese", "Nushu", "Label"]
test_df.columns = ["Chinese", "Nushu", "Label"]

train_texts = train_df["Nushu"]
test_texts = test_df["Nushu"]
train_labels = train_df["Label"]
test_labels = test_df["Label"]

In [None]:
# Compute Class Weights to Handle Imbalance
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

train_labels_int = train_df["Label"].values

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels_int),
    y=train_labels_int
)

class_weight_dict = dict(enumerate(class_weights))
print("Class Weights:", class_weight_dict)



Class Weights: {0: np.float64(1.41869918699187), 1: np.float64(0.9943019943019943), 2: np.float64(0.7755555555555556)}


In [None]:
# Tokenize text
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_texts)
x_train_seq = tokenizer.texts_to_sequences(train_texts)
x_test_seq = tokenizer.texts_to_sequences(test_texts)

maxlen = max(max(len(seq) for seq in x_train_seq), max(len(seq) for seq in x_test_seq))
x_train_pad = pad_sequences(x_train_seq, maxlen=maxlen, padding='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=maxlen, padding='post')

# Encode labels
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(train_labels)
y_test = label_binarizer.transform(test_labels)

In [None]:
# CNN Model
vocab_size = len(tokenizer.word_index) + 1
num_classes = y_train.shape[1]

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalMaxPooling1D())
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train/validation split and model training
x_train, x_val, y_train_split, y_val = train_test_split(x_train_pad, y_train, test_size=0.2, random_state=42)

history = model.fit(x_train, y_train_split, validation_data=(x_val, y_val), epochs=10, batch_size=64)

Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 132ms/step - accuracy: 0.3457 - loss: 1.0950 - val_accuracy: 0.2714 - val_loss: 1.0959
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.4626 - loss: 1.0694 - val_accuracy: 0.3571 - val_loss: 1.0911
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.5529 - loss: 1.0564 - val_accuracy: 0.4429 - val_loss: 1.0834
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.4583 - loss: 1.0257 - val_accuracy: 0.4429 - val_loss: 1.0819
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.4492 - loss: 1.0203 - val_accuracy: 0.4429 - val_loss: 1.0842
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5287 - loss: 0.9953 - val_accuracy: 0.4429 - val_loss: 1.0838
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━

In [None]:
# Evaluation
test_loss, test_accuracy = model.evaluate(x_test_pad, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

y_pred = model.predict(x_test_pad)
y_true_labels = np.argmax(y_test, axis=1)
y_pred_labels = np.argmax(y_pred, axis=1)

print("Classification Report:")
print(classification_report(y_true_labels, y_pred_labels))
print("Confusion Matrix:")
print(confusion_matrix(y_true_labels, y_pred_labels))

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5411 - loss: 0.9864 
Test Accuracy: 0.6390
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.09      0.17        64
           1       0.65      0.58      0.61       110
           2       0.63      0.87      0.73       175

    accuracy                           0.64       349
   macro avg       0.76      0.52      0.50       349
weighted avg       0.70      0.64      0.59       349

Confusion Matrix:
[[  6  13  45]
 [  0  64  46]
 [  0  22 153]]


# Method 3: Tokenize Nüshu and do Naive Bayes classification <br>

Count Vectorizer converts a collection of text documents to a matrix of token counts.
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

On the other hand, the Keras Tokenizer is usually used on deep learning models such as RNN, which works better on a larger dataset. Given that we only have 500 sentences, using the count vectorizer would yield stronger results.

We use scikit-learn's Multinomial Naive Bayes classfier, which is suitable for classification with discrete features: https://scikit-learn.org/1.6/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [None]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
df = pd.read_csv('nushuemo.tsv', delimiter='\t', header=None)

In [None]:
print(df.head())

            0  1
0       𛆂𛅽𛈁𛈍𛉾  2
1   𛊌𛊢𛆠𛉤𛈣𛊤𛆽𛇮𛆑  2
2  𛉩𛋡𛊨𛈣𛊟𛊚𛆰𛇪𛊗𛆹  2
3  𛊄𛈰𛋂𛇫𛅼𛆞𛆄𛋓𛉾𛋙  2
4  𛆳𛈷𛋂𛉂𛆱𛊩𛇆𛆤𛆈𛅳  2


In [None]:
# Train test split 80-20
X_train, X_test, y_train, y_test = train_test_split(
    df[0], df[1],
    test_size=0.2, stratify=df[1], random_state=42)

In [None]:
# Use character-level n-gram vectorization
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 3))  # unigrams to trigrams
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Train the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

In [None]:
# Predict on test set
y_pred = clf.predict(X_test_vec)

# Print evaluation results
print("Classification Report:\n")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# Show most informative character n-grams
feature_names = vectorizer.get_feature_names_out()
class_labels = clf.classes_
topn = 10

print("\nTop informative features per class:")
for i, class_label in enumerate(class_labels):
    topn_indices = clf.feature_log_prob_[i].argsort()[-topn:]
    print(f"\nClass: {class_label}")
    for j in reversed(topn_indices):
        print(f"{feature_names[j]}: {clf.feature_log_prob_[i][j]:.4f}")

Classification Report:

              precision    recall  f1-score   support

           0       0.75      0.14      0.24        21
           1       0.50      0.29      0.37        34
           2       0.51      0.87      0.64        45

    accuracy                           0.52       100
   macro avg       0.59      0.43      0.42       100
weighted avg       0.56      0.52      0.47       100

Confusion Matrix:

[[ 3  4 14]
 [ 1 10 23]
 [ 0  6 39]]

Top informative features per class:

Class: 0
𛆳: -6.2849
𛇃: -6.3455
𛅳: -6.4791
𛈌: -6.5532
𛇯: -6.5532
𛇏: -6.6332
𛇁: -6.7202
𛆭: -6.7202
𛈒: -6.7202
𛇄: -6.7202

Class: 1
𛈌: -6.0367
𛆱: -6.1646
𛇏: -6.1646
𛆳: -6.2599
𛋆: -6.3112
𛆑: -6.3112
𛇈: -6.3652
𛈤: -6.3652
𛆈: -6.4224
𛇯: -6.4224

Class: 2
𛆳: -5.6353
𛆱: -5.7935
𛅳: -5.9149
𛈌: -5.9816
𛇯: -6.0167
𛇏: -6.0167
𛈤: -6.0167
𛊤: -6.2134
𛇁: -6.2578
𛊝: -6.2578


# Method 4: Random Forest Algorithm
Description: This program performs sentiment analysis on a dataset of Nüshu script text using a Random Forest classifier. It begins by reading a TSV file (`nvshu_500.tsv`) that contains rows of Chinese phrases, their Nüshu script versions, and sentiment labels. Since machine learning models like Random Forest require numerical input, the program extracts simple character-based features from each Nüshu text, such as the total number of characters and the number of unique characters. It then splits the data into training and testing sets, trains the Random Forest model, and evaluates its accuracy using standard metrics like accuracy score and classification report. Finally, it makes a prediction on one sample from the test set and prints the predicted sentiment label.

Documentation: https://www.geeksforgeeks.org/random-forest-algorithm-in-machine-learning/ <br>


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load the Nüshu dataset: assumed columns = [Chinese, Nushu, Label]
df = pd.read_csv("nvshu_500.tsv", sep="\t", header=None)
df.columns = ["Chinese", "Nushu", "Label"]

# Encode labels to integers
label_encoder = LabelEncoder()
df["LabelEncoded"] = label_encoder.fit_transform(df["Label"])

# Use a simple character-level feature representation (length, counts, etc.)
# You can later swap this with real embeddings if desired
df["NushuLength"] = df["Nushu"].apply(len)
df["NumUniqueChars"] = df["Nushu"].apply(lambda x: len(set(x)))

# You can engineer more features if helpful
X = df[["NushuLength", "NumUniqueChars"]]
y = df["LabelEncoded"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# Initialize and train Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=label_encoder.classes_.astype(str))

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)

# Predict a sample row
sample = X_test.iloc[0:1]
prediction = rf_classifier.predict(sample)
sample_dict = sample.iloc[0].to_dict()

print(f"\nSample Features: {sample_dict}")
print(f"Predicted Sentiment: {label_encoder.inverse_transform(prediction)[0]}")


Accuracy: 0.48

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.15      0.21        20
           1       0.38      0.19      0.25        32
           2       0.52      0.81      0.63        48

    accuracy                           0.48       100
   macro avg       0.41      0.38      0.36       100
weighted avg       0.44      0.48      0.43       100


Sample Features: {'NushuLength': 7, 'NumUniqueChars': 7}
Predicted Sentiment: 2
