In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score, accuracy_score
import numpy as np
import re
import jieba


df = pd.read_csv('ChnSentiCorp_htl_all_translated.csv')
df['review'] = df['review'].apply(str)
df['translated_review'] = df['translated_review'].apply(str)

import re
def clean_sentence(sen):
    clean_sen = re.sub(r'[^\w\s]', '', sen)
    return clean_sen

df['review'] = df['review'].apply(clean_sentence)



# Define a function to keep only Chinese characters in a string
def keep_chinese(text):
    pattern = re.compile(r'[^\u4e00-\u9fff\s]')
    chinese_only = pattern.sub('', text)
    return chinese_only.strip()

# Segment the Chinese words using Jieba and keep only the Chinese characters in each word
df['words-chinese'] = df['review'].apply(lambda x: [keep_chinese(word) for word in jieba.cut(x, cut_all=False) if keep_chinese(word) != ''])


In [6]:
# Calculate the mean of TF-IDF values for each tokenized sentence
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['words-chinese'].apply(lambda x: ' '.join(x)))
df['tf-idf'] = np.mean(X_tfidf.toarray(), axis=1)

# Train a Word2Vec model and calculate the mean of word embeddings for each tokenized sentence
word2vec_model = Word2Vec(sentences=df['words-chinese'], vector_size=100, window=5, min_count=1, workers=4)
df['word2vec'] = df['words-chinese'].apply(lambda x: np.mean([word2vec_model.wv[word] for word in x], axis=0))

# Reshape the 'tf-idf' values array to have the same number of columns as the 'word2vec' values array
tf_idf_reshaped = df['tf-idf'].values.reshape(-1, 1)

# Combine the reshaped 'tf-idf' values and 'word2vec' values
X = np.hstack((tf_idf_reshaped, np.vstack(df['word2vec'].values)))
y = df['label'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the SVM classifier
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Predict on the testing set
y_pred = svm_model.predict(X_test)

# Calculate and report the F1 score, recall, and accuracy
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print('F1 Score:', f1)
print('Recall:', recall)
print('Accuracy:', accuracy)

  return _methods._mean(a, axis=axis, dtype=dtype,


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 100 and the array at index 6374 has size 1

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Concatenate all the words in df['words-chinese'] into a single string
chinese_words = ' '.join(word for review in df['words-chinese'] for word in review)

# Create the TfidfVectorizer object with the vocabulary set to the unique words in the concatenated string
vectorizer = TfidfVectorizer(vocabulary=set(chinese_words.split()))

# Fit the vectorizer on the concatenated string
vectorizer.fit([chinese_words])

# Compute the TF-IDF features for the df['review'] column
tfidf_features_chinese = vectorizer.transform(df['review'])

# Get the feature names from the vectorizer object
feature_names = vectorizer.get_feature_names()

# Create a pandas DataFrame from the sparse matrix of the TF-IDF features
df_new_chinese = pd.DataFrame.sparse.from_spmatrix(tfidf_features_chinese, columns=feature_names)



In [25]:
len(chinese_words)

1358587

In [24]:
df_new_chinese['nonzero_columns'] = df_new_chinese.apply(lambda x: (x != 0).sum(), axis=1)
df_new_chinese['nonzero_columns']

0       0
1       0
2       0
3       0
4       0
       ..
7761    0
7762    0
7763    0
7764    0
7765    0
Name: nonzero_columns, Length: 7766, dtype: int64

In [26]:
nonzero_rows = (df_new_chinese != 0).sum(axis=1)
count_nonzero_rows = (nonzero_rows != 0).sum()
print(count_nonzero_rows)

3


In [36]:
# Train Word2Vec and create the feature for Chinese sentences
from gensim.models import Word2Vec
import numpy as np

sentences = df['words-chinese']

model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

w2v_features = pd.DataFrame([
    np.mean([model.wv[word] for word in sentence if word in model.wv], axis=0)
    for sentence in sentences if np.any([model.wv[word] for word in sentence if word in model.wv])
])

In [37]:
print(w2v_features)

            0         1         2         3         4         5         6   \
0    -0.196257  0.277021  0.104843  0.082773 -0.114960 -0.236471  0.165146   
1    -0.055645  0.227108  0.184772 -0.663236 -0.845417  0.232540 -0.069390   
2    -0.032474  0.161153  0.191869  0.072029 -0.152428 -0.174706  0.220331   
3    -0.078297  0.171496  0.304925 -0.169283 -0.253415 -0.310697  0.176672   
4    -0.430415  0.490308  0.104770  0.000661 -0.397055 -0.230514  0.280223   
...        ...       ...       ...       ...       ...       ...       ...   
7759 -0.220421  0.337080  0.093458  0.112587 -0.195769 -0.394888  0.243077   
7760 -0.175233  0.161407  0.203813 -0.047241 -0.182284 -0.178380  0.211565   
7761 -0.133019  0.172838  0.315441 -0.106743 -0.273937 -0.104692  0.160264   
7762 -0.203847  0.337881  0.150153  0.015690 -0.034044 -0.382851  0.254389   
7763 -0.009343  0.031065  0.197352 -0.047649  0.266214 -0.216281  0.183524   

            7         8         9   ...        90        91    

In [39]:
import pandas as pd
from scipy.sparse import csr_matrix

# converting csr_matrix to dataframe
w2v_features_df = pd.DataFrame(w2v_features)

# concatenating the dataframes
combined_features = pd.concat([tfidf_features_chinese, w2v_features_df], axis=1)

TypeError: cannot concatenate object of type '<class 'scipy.sparse.csr.csr_matrix'>'; only Series and DataFrame objs are valid

In [49]:
# import pandas as pd
# from snownlp import SnowNLP
# from collections import Counter
# import jieba
# import re
# from sklearn.feature_extraction.text import TfidfVectorizer
# from gensim.models import Word2Vec
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# # Read in the data
# df = pd.read_csv('ChnSentiCorp_htl_all_translated.csv')
# df['review'] = df['review'].apply(str)
# df['translated_review'] = df['translated_review'].apply(str)

# # Clean the sentences
# def clean_sentence(sen):
#     clean_sen = re.sub(r'[^\w\s]', '', sen)
#     return clean_sen
# df['review'] = df['review'].apply(clean_sentence)

# # Define a function to keep only Chinese characters in a string
# def keep_chinese(text):
#     pattern = re.compile(r'[^\u4e00-\u9fff\s]')
#     chinese_only = pattern.sub('', text)
#     return chinese_only.strip()

# # Segment the Chinese words using Jieba and keep only the Chinese characters in each word
# df['chinese-words'] = df['review'].apply(lambda x: [keep_chinese(word) for word in jieba.cut(x, cut_all=False) if keep_chinese(word) != ''])

# # Create the TfidfVectorizer object with the vocabulary set to the unique words in the concatenated string
# chinese_words = ' '.join(word for review in df['chinese-words'] for word in review)
# vectorizer = TfidfVectorizer(vocabulary=set(chinese_words.split()))

# # Fit the vectorizer on the concatenated string and compute the TF-IDF features for the df['chinese-words'] column
# vectorizer.fit([chinese_words])
# tfidf_features_chinese = vectorizer.transform([' '.join(words) for words in df['chinese-words']])

# # Get the feature names from the vectorizer object and create a pandas DataFrame from the sparse matrix of the TF-IDF features
# feature_names = vectorizer.get_feature_names()
# df_tfidf_chinese = pd.DataFrame.sparse.from_spmatrix(tfidf_features_chinese, columns=feature_names)

# # Train Word2Vec and create the feature for Chinese sentences
# sentences = df['chinese-words']
# model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
# w2v_features = pd.DataFrame([
#     np.mean([model.wv[word] for word in sentence if word in model.wv], axis=0)
#     for sentence in sentences if np.any([model.wv[word] for word in sentence if word in model.wv])
# ])

# # Concatenate the dataframes
# combined_features = pd.concat([df_tfidf_chinese, w2v_features], axis=1)

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(w2v_features, df['label'], test_size=0.2, random_state=42)

# # Train a logistic regression model and evaluate the accuracy on the test set
# lr_model = LogisticRegression()
# lr_model.fit(X_train, y_train)
# y_pred = lr_model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")



ValueError: Found input variables with inconsistent numbers of samples: [7764, 7766]

In [47]:
import pandas as pd
df.isnull().sum()

label                0
review               0
translated_review    0
chinese-words        0
dtype: int64

In [71]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
import jieba

df = pd.read_csv('ChnSentiCorp_htl_all_translated.csv')
df['review'] = df['review'].apply(str)
df['translated_review'] = df['translated_review'].apply(str)

def clean_sentence(sen):
    clean_sen = re.sub(r'[^\w\s]', '', sen)
    return clean_sen

df['review'] = df['review'].apply(clean_sentence)

def keep_chinese(text):
    pattern = re.compile(r'[^\u4e00-\u9fff\s]')
    chinese_only = pattern.sub('', text)
    return chinese_only.strip()

df['chinese-words'] = df['review'].apply(
    lambda x: [keep_chinese(word) for word in jieba.cut(x, cut_all=False) if keep_chinese(word) != '']
)

sentences = df['chinese-words']
model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

valid_indices = []
for i, sentence in enumerate(sentences):
    if np.any([model.wv[word] for word in sentence if word in model.wv]):
        valid_indices.append(i)

df_filtered = df.iloc[valid_indices]

w2v_features = []
for sentence in sentences:
    if np.any([model.wv[word] for word in sentence if word in model.wv]):
        w2v_features.append(np.mean([model.wv[word] for word in sentence if word in model.wv], axis=0))

w2v_features = pd.DataFrame(w2v_features)

X_train, X_test, y_train, y_test = train_test_split(w2v_features, df_filtered['label'], test_size=0.2, random_state=42)

In [68]:
# Train a logistic regression model and evaluate the accuracy on the test set
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8235672891178365


In [69]:
from sklearn.svm import SVC

# Train an SVM model and evaluate the accuracy on the test set
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8139085640695428


In [60]:
from sklearn.naive_bayes import MultinomialNB

# Train a Naive Bayes model and evaluate the accuracy on the test set
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: Negative values in data passed to MultinomialNB (input X)

In [70]:
# Train a logistic regression model with L2 regularization and evaluate the accuracy on the test set
lr_model = LogisticRegression(penalty='l2', max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8235672891178365
