In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score, accuracy_score
import numpy as np
import re
import jieba

# Load only the review text and the sentiment label from the corpus file
df = pd.read_csv(
    'ChnSentiCorp_htl_all_translated.csv',
    usecols=['review', 'label'],
)
# Force the review column to str (non-string values such as NaN become text)
df = df.assign(review=df['review'].astype(str))

# Define a function to keep only Chinese characters in a string
def keep_chinese(text):
    """Return `text` with everything except Chinese characters and whitespace removed.

    Characters outside U+4E00-U+9FFF that are not whitespace are deleted, then
    leading and trailing whitespace is stripped from the result.
    """
    return re.sub(r'[^\u4e00-\u9fff\s]', '', text).strip()

# Segment each review with jieba (cut_all=False), clean every token with
# keep_chinese, and drop tokens that are empty after cleaning
df['words-chinese'] = df['review'].apply(
    lambda review: [
        cleaned
        for cleaned in map(keep_chinese, jieba.cut(review, cut_all=False))
        if cleaned != ''
    ]
)

# Fit 100-dimensional Word2Vec embeddings on the tokenized reviews
word2vec_model = Word2Vec(
    sentences=df['words-chinese'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def mean_word2vec(words, model):
    """Average the embeddings of the tokens in `words` that `model.wv` contains.

    Returns the element-wise mean of the matching vectors, or a zero vector of
    length `model.vector_size` when none of the tokens is found.
    """
    found = []
    for token in words:
        if token in model.wv:
            found.append(model.wv[token])
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)


# Fit TF-IDF on the reviews, each rebuilt as a space-separated string of its tokens
vectorizer = TfidfVectorizer(use_idf=True)
X_tfidf = vectorizer.fit_transform(
    [' '.join(tokens) for tokens in df['words-chinese']]
)

# Feature matrix: the dense TF-IDF columns followed by each review's
# averaged Word2Vec embedding
X = np.hstack((
    X_tfidf.toarray(),
    np.vstack([mean_word2vec(tokens, word2vec_model) for tokens in df['words-chinese']]),
))



In [1]:
# PCA
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import numpy as np
import re
import jieba

# Read the review and label columns of the corpus, then coerce reviews to str
corpus_columns = ['review', 'label']
df = pd.read_csv('ChnSentiCorp_htl_all_translated.csv', usecols=corpus_columns)
df['review'] = df['review'].map(str)

# Define a function to keep only Chinese characters in a string
def keep_chinese(text):
    """Delete every character that is neither in U+4E00-U+9FFF nor whitespace.

    The cleaned string is returned with surrounding whitespace stripped.
    """
    non_chinese = r'[^\u4e00-\u9fff\s]'
    cleaned = re.sub(non_chinese, '', text)
    return cleaned.strip()

# Tokenize with jieba, keep only the Chinese part of each token, and skip
# tokens that contain no Chinese characters at all
df['words-chinese'] = df['review'].apply(
    lambda review: [
        token
        for token in (keep_chinese(piece) for piece in jieba.cut(review, cut_all=False))
        if token != ''
    ]
)

# Train Word2Vec (100 dimensions, window 5, every token kept) on the token lists
word2vec_model = Word2Vec(
    sentences=df['words-chinese'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def mean_word2vec(words, model):
    """Return the mean embedding of the tokens of `words` present in `model.wv`.

    When no token has an embedding, a zero vector of `model.vector_size`
    entries is returned instead.
    """
    known = [token for token in words if token in model.wv]
    if len(known) == 0:
        return np.zeros(model.vector_size)
    return np.mean([model.wv[token] for token in known], axis=0)

# Calculate the TF-IDF values for each tokenized sentence
# (tokens are re-joined with spaces because TfidfVectorizer expects strings)
vectorizer = TfidfVectorizer(use_idf=True)
X_tfidf = vectorizer.fit_transform(df['words-chinese'].apply(lambda x: ' '.join(x)))

# Combine the dense TF-IDF matrix with the mean Word2Vec vector of each review
X = np.hstack((X_tfidf.toarray(), np.vstack(df['words-chinese'].apply(lambda x: mean_word2vec(x, word2vec_model)))))

# Apply PCA to reduce the dimensionality of the features.
# random_state is fixed (same seed as the rest of the notebook) so that the
# projection is repeatable if PCA selects a randomized SVD solver for this input.
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X)

# Print the fraction of variance explained by each principal component
print(pca.explained_variance_ratio_)

# Print the new shape of the feature matrix after applying PCA
print(X_pca.shape)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/_9/2z47shtd34x8z5gsxhf75_jr0000gn/T/jieba.cache
Loading model cost 0.447 seconds.
Prefix dict has been built successfully.


[0.3036845  0.10010424 0.06963339 0.06195853 0.04571151 0.02953455
 0.02483473 0.0168398  0.01310509 0.0106093  0.00924079 0.00741225
 0.00638246 0.00516653 0.00369271 0.00349248 0.00322177 0.00288083
 0.00235013 0.00223343 0.00176105 0.00146854 0.00144397 0.0012046
 0.00113593 0.00109908 0.00094081 0.00091306 0.00086346 0.00082774
 0.00077604 0.00074911 0.00073865 0.00073007 0.00071538 0.0006869
 0.00067211 0.00065493 0.00063193 0.00062415 0.00061302 0.00060007
 0.00057928 0.00056785 0.00055777 0.00054662 0.000542   0.0005278
 0.00052325 0.00052204]
(7766, 50)


In [2]:
# Rebind X to the PCA-reduced matrix: any cell executed after this one that
# reads X gets X_pca rather than the full TF-IDF + Word2Vec matrix.
X = X_pca

In [None]:
# Use the following cell to check whether a training fraction of 0.7 or 0.6 gives better scores

In [4]:
# Compare hold-out splits: each ratio is the training fraction, the rest is the test set
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score, accuracy_score

ratios = [0.7, 0.6]
results = []
for ratio in ratios:
    X_train, X_test, y_train, y_test = train_test_split(
        X, df['label'].values, test_size=(1-ratio), random_state=42
    )

    # Fit a linear-kernel SVM on the training part and predict the held-out part
    svm_model = SVC(kernel='linear', random_state=42)
    y_pred = svm_model.fit(X_train, y_train).predict(X_test)

    # One row per ratio: (ratio, weighted F1, weighted recall, accuracy)
    results.append((
        ratio,
        f1_score(y_test, y_pred, average='weighted'),
        recall_score(y_test, y_pred, average='weighted'),
        accuracy_score(y_test, y_pred),
    ))

# Tab-separated summary table
print('Train/Test Ratio\tF1 Score\tRecall\tAccuracy')
for result in results:
    print('{}\t{:.4f}\t\t{:.4f}\t{:.4f}'.format(*result))

Train/Test Ratio	F1 Score	Recall	Accuracy
0.7	0.8468		0.8481	0.8481
0.6	0.8399		0.8416	0.8416


In [None]:
# Use the following cell to check whether k = 5 or k = 10 folds gives better cross-validation scores

In [None]:
# Compare 5-fold and 10-fold cross-validation of the linear SVM

from sklearn.model_selection import cross_validate

ks = [5, 10]
results = []
for k in ks:
    svm_model = SVC(kernel='linear', random_state=42)
    scores = cross_validate(
        svm_model,
        X,
        df['label'].values,
        cv=k,
        scoring=('f1_weighted', 'recall_weighted', 'accuracy'),
    )

    # Per-fold test scores returned by cross_validate
    f1_scores = scores['test_f1_weighted']
    recall_scores = scores['test_recall_weighted']
    accuracy_scores = scores['test_accuracy']

    # One row per k: (k, F1 mean, F1 std, recall mean, recall std, accuracy mean, accuracy std)
    results.append((
        k,
        np.mean(f1_scores), np.std(f1_scores),
        np.mean(recall_scores), np.std(recall_scores),
        np.mean(accuracy_scores), np.std(accuracy_scores),
    ))

# Tab-separated summary table
print('K\tF1 Score (mean ± std)\tRecall (mean ± std)\tAccuracy (mean ± std)')
for result in results:
    print('{}\t{:.4f} ± {:.4f}\t\t{:.4f} ± {:.4f}\t\t{:.4f} ± {:.4f}'.format(*result))

In [1]:
import pandas as pd
import random

# Load the corpus with the original review, its label and the translated review
df = pd.read_csv(
    'ChnSentiCorp_htl_all_translated.csv',
    usecols=['review', 'label', 'translated_review'],
)

# Draw a seeded random sample of 5000 rows; the rows not drawn form the remainder
df_5000 = df.sample(n=5000, random_state=42)
df_rest = df.drop(index=df_5000.index)

# Write the sample and the remainder to separate CSV files (without the index column)
df_5000.to_csv('ChnSentiCorp_htl_first_5000.csv', index=False)
df_rest.to_csv('ChnSentiCorp_htl_rest.csv', index=False)

In [20]:
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score, accuracy_score
import numpy as np
import re
import jieba

# Read the original CSV file
df = pd.read_csv('ChnSentiCorp_htl_all_translated.csv', usecols=['review', 'label'])

# Load the 5000-row sample and the remaining rows written by the sampling cell.
# Rows without review text are dropped from BOTH frames: the tokenization step
# passes each review straight to jieba.cut, which expects a string, so a missing
# (NaN) review in either frame would fail there.
df_5000 = pd.read_csv('ChnSentiCorp_htl_first_5000.csv', usecols=['review', 'label'])
df_5000 = df_5000.dropna(subset=['review'])

df_rest = pd.read_csv('ChnSentiCorp_htl_rest.csv', usecols=['review', 'label'])
df_rest = df_rest.dropna(subset=['review'])

# Define a function to keep only Chinese characters in a string
def keep_chinese(text):
    """Keep only characters in U+4E00-U+9FFF and whitespace, then strip the ends."""
    stripped_of_other_chars = re.compile(r'[^\u4e00-\u9fff\s]').sub('', text)
    return stripped_of_other_chars.strip()

# Tokenize the Chinese words and keep only the Chinese characters in each word.
# keep_chinese is applied once per token; tokens that end up empty are dropped.
df_5000['words-chinese'] = df_5000['review'].apply(lambda x: [w for w in map(keep_chinese, jieba.cut(x, cut_all=False)) if w != ''])
df_rest['words-chinese'] = df_rest['review'].apply(lambda x: [w for w in map(keep_chinese, jieba.cut(x, cut_all=False)) if w != ''])

# Train a Word2Vec model on the 5000-row sample, then extend its vocabulary
# with the remaining rows and continue training on them
word2vec_model = Word2Vec(sentences=df_5000['words-chinese'], vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.build_vocab(df_rest['words-chinese'], update=True)
word2vec_model.train(df_rest['words-chinese'], total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)

# Calculate the TF-IDF values for each tokenized sentence: the vocabulary and
# IDF weights are fitted on the sample only and reused to transform the remainder
vectorizer = TfidfVectorizer(use_idf=True)
X_tfidf_5000 = vectorizer.fit_transform(df_5000['words-chinese'].apply(lambda x: ' '.join(x)))
X_tfidf_rest = vectorizer.transform(df_rest['words-chinese'].apply(lambda x: ' '.join(x)))

# Dense TF-IDF columns followed by the mean Word2Vec vector of each review
# (mean_word2vec is the helper defined in an earlier cell of this notebook)
X_5000 = np.hstack((X_tfidf_5000.toarray(), np.vstack(df_5000['words-chinese'].apply(lambda x: mean_word2vec(x, word2vec_model)))))
X_rest = np.hstack((X_tfidf_rest.toarray(), np.vstack(df_rest['words-chinese'].apply(lambda x: mean_word2vec(x, word2vec_model)))))

# Hold out part of the sample for testing
X_train, X_test, y_train, y_test = train_test_split(X_5000, df_5000['label'].values, test_size=(1 - 0.7), random_state=42)

# Train a linear-kernel SVM on the training part of the sample
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Metrics on the held-out part of the sample. The size is taken from y_test
# instead of being hard-coded: 1 - 0.7 is not exactly 0.3 in floating point,
# so the number of test rows chosen by train_test_split need not be 1500.
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
print(f"Metrics for SVM model on testing set (n={len(y_test)}):\nF1 Score: {f1:.4f}\nRecall: {recall:.4f}\nAccuracy: {accuracy:.4f}")

# Metrics on the rows that were not part of the 5000-row sample
y_pred_rest = svm_model.predict(X_rest)
f1_rest = f1_score(df_rest['label'].values, y_pred_rest, average='weighted')
recall_rest = recall_score(df_rest['label'].values, y_pred_rest, average='weighted')
accuracy_rest = accuracy_score(df_rest['label'].values, y_pred_rest)
print(f"\nMetrics for SVM model on remaining records (n={len(df_rest)}):\nF1 Score: {f1_rest:.4f}\nRecall: {recall_rest:.4f}\nAccuracy: {accuracy_rest:.4f}")

Metrics for SVM model on testing set (n=1500):
F1 Score: 0.8779
Recall: 0.8787
Accuracy: 0.8787

Metrics for SVM model on remaining records (n=2765):
F1 Score: 0.8747
Recall: 0.8745
Accuracy: 0.8745


In [17]:
# Length of the first row of X, i.e. how many feature columns X currently has.
# NOTE(review): the value depends on which cell last assigned X — confirm the
# intended feature matrix is in scope before relying on it.
len(X[0])

15106

In [None]:
# k-fold Chinese

In [None]:
# Chinese k-fold k = [5, 10]

# Bring in everything this cell uses so it also runs on a fresh kernel executed
# top to bottom (KFold and precision_score are otherwise only imported in a
# later cell).
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

# Labels for the rows of X, taken from the same column the other Chinese cells
# pass to the classifier.
# NOTE(review): assumes df is the full corpus frame that X was built from — confirm.
y = df['label'].values

# Test different k-values
k_values = [5, 10]
results = []
for k in k_values:
    # Initialize the KFold cross-validator with the current k-value
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)

    # Initialize lists to store performance metrics for each fold
    f1_scores = []
    recall_scores = []
    accuracy_scores = []
    precision_scores = []

    # Iterate over the k folds
    for train_index, test_index in kfold.split(X):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the SVM classifier
        svm_model = SVC(kernel='linear', random_state=42)
        svm_model.fit(X_train, y_train)

        # Predict on the testing set
        y_pred = svm_model.predict(X_test)

        # Calculate and store the performance metrics for the current fold
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))

    # Calculate and store the average performance metrics across all k folds for the current k-value
    average_f1 = np.mean(f1_scores)
    average_recall = np.mean(recall_scores)
    average_accuracy = np.mean(accuracy_scores)
    average_precision = np.mean(precision_scores)

    results.append((k, average_f1, average_recall, average_accuracy, average_precision))

    # Print the results for the current k-value
    print(f'k={k}, F1 Score: {average_f1:.4f}, Recall: {average_recall:.4f}, Accuracy: {average_accuracy:.4f}, Precision: {average_precision:.4f}')

In [None]:
# English below

In [None]:
# Language: English
# k-fold comparison

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
import numpy as np
import re
import nltk

# Load every column of the corpus, including the English translation
df_English = pd.read_csv('ChnSentiCorp_htl_all_translated.csv')
# Coerce both text columns to str. The review column is read from df_English
# itself rather than from the unrelated global `df`, which may be undefined or
# hold a different frame depending on which cells were run before this one.
df_English['review'] = df_English['review'].apply(str)
df_English['translated_review'] = df_English['translated_review'].apply(str)

# Define a function to keep only English characters in a string
def keep_english(text):
    """Return `text` with everything except ASCII letters and whitespace removed.

    Leading and trailing whitespace is stripped from the result.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text).strip()

# Tokenize each translated review with nltk, keep only the letters of every
# token, and drop tokens that are empty after cleaning
df_English['words-english'] = df_English['translated_review'].apply(
    lambda sentence: [
        cleaned
        for cleaned in map(keep_english, nltk.word_tokenize(sentence))
        if cleaned != ''
    ]
)

# Fit TF-IDF on the space-joined tokens and reduce each review's row to a
# single number: the mean of its TF-IDF values across the whole vocabulary
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(
    [' '.join(tokens) for tokens in df_English['words-english']]
)
df_English['tf-idf'] = np.mean(X_tfidf.toarray(), axis=1)

# Fit 100-dimensional Word2Vec embeddings on the English token lists
word2vec_model = Word2Vec(
    sentences=df_English['words-english'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def mean_word2vec(words, model):
    """Mean embedding of the tokens in `words` that have a vector in `model.wv`.

    Falls back to a zero vector with `model.vector_size` entries when no token
    in `words` is found.
    """
    embeddings = []
    for token in words:
        if token in model.wv:
            embeddings.append(model.wv[token])
    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(model.vector_size)

# Mean Word2Vec embedding of every review, stored as one array per row
df_English['word2vec'] = df_English['words-english'].apply(
    lambda tokens: mean_word2vec(tokens, word2vec_model)
)

# Feature matrix: the scalar TF-IDF mean as the first column, followed by the
# embedding columns; labels come from the 'label' column
X = np.hstack((
    df_English['tf-idf'].values.reshape(-1, 1),
    np.vstack(df_English['word2vec'].values),
))
y = df_English['label'].values

# Fold counts to compare, and the summary rows collected for them
k_values = [5, 10]
results = []

for k in k_values:
    # Shuffled, seeded k-fold splitter for the current fold count
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)

    # Per-fold metric accumulators
    f1_scores, recall_scores, accuracy_scores, precision_scores = [], [], [], []

    for train_index, test_index in kfold.split(X):
        # Rows and labels of the current fold
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # Fit a linear-kernel SVM on the training folds and predict the held-out fold
        svm_model = SVC(kernel='linear', random_state=42)
        y_pred = svm_model.fit(X_train, y_train).predict(X_test)

        # Weighted F1 / recall / precision and plain accuracy for this fold
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))

    # Average each metric over the folds and keep one summary row per k
    average_f1 = np.mean(f1_scores)
    average_recall = np.mean(recall_scores)
    average_accuracy = np.mean(accuracy_scores)
    average_precision = np.mean(precision_scores)
    results.append([k, average_f1, average_recall, average_accuracy, average_precision])

    # Report the averages for this k
    print(f'k={k}, F1 Score: {average_f1:.4f}, Recall: {average_recall:.4f}, Accuracy: {average_accuracy:.4f}, Precision: {average_precision:.4f}')

In [None]:
# train/test ratio for English
# train_test_split is imported here because the English section's own import
# cell does not import it.
from sklearn.model_selection import train_test_split

# Define the train-test split ratios (each value is the TRAINING fraction,
# matching the Chinese comparison cell)
ratios = [0.7, 0.6]

# Start from an empty result list so rows from the k-fold cell are not mixed in
results = []

# Initialize lists to store performance metrics for each ratio
f1_scores = []
recall_scores = []
accuracy_scores = []
precision_scores = []
# Iterate over the ratios
for ratio in ratios:
    # Split the data into training and testing sets using the current ratio.
    # test_size is the complement of the training fraction; passing `ratio`
    # directly would hold out 70% / 60% of the data for testing instead.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - ratio), random_state=42)

    # Train the SVM classifier
    svm_model = SVC(kernel='linear', random_state=42)
    svm_model.fit(X_train, y_train)

    # Predict on the testing set
    y_pred = svm_model.predict(X_test)

    # Calculate and store the performance metrics for the current ratio
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')

    f1_scores.append(f1)
    recall_scores.append(recall)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    results.append((ratio, f1, recall, accuracy, precision))

# Print the performance metrics for each ratio
print(f'Ratio\t\tF1 Score\tRecall\t\tAccuracy\tPrecision')
for i in range(len(ratios)):
    print(f'{ratios[i]:.1f}\t\t{f1_scores[i]:.4f}\t\t{recall_scores[i]:.4f}\t\t{accuracy_scores[i]:.4f}\t\t{precision_scores[i]:.4f}')