In [1]:
import numpy as np
import pandas as pd
import math
import re
from nltk.corpus import stopwords
import seaborn as sns
import unidecode
from sklearn.utils import resample
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns 
warnings.filterwarnings('ignore')

In [2]:
df_2022 = pd.read_csv('../data/tweet_data_2022.csv')
df_2023 = pd.read_csv('../data/tweet_data_2023.csv')

### Data Preprocessing Functions

In [3]:
round_number = 3
random_state = 42
categories = ['SentimentScore']

In [4]:
def standardize_sent(sent):
    if ((sent == 0) | (sent == 1) | (sent == 2)):
        return 'Negative'
    elif (sent == 3):
        return 'Neutral'
    else:
        return 'Positive'

In [5]:
def clean_text(text):
    if type(text) == np.float:
        return ""
    temp = text.lower() # to lower case
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp) # remove @s
    temp = re.sub("#[A-Za-z0-9_]+","", temp) # remove hashtags
    temp = re.sub(r'http\S+', '', temp) # remove links
    temp = re.sub(r"www.\S+", "", temp) # remove links
    temp = re.sub(r'\n|[^a-zA-Z]', ' ', temp) # remove punctuation
    temp = temp.replace("\n", " ").split()
    temp = [w for w in temp if not w in stopwords_] # remove stopwords
    temp = [w for w in temp if not w.isdigit()] # remove numbers
    temp = [unidecode.unidecode(w) for w in temp] # turn non-enlish letters to english letters
    temp = " ".join(word for word in temp)
    return temp

In [6]:
def partition_data(df, ratio, time):
    #partiton
    if time:
        df.sort_values(by=['date'], inplace=True)
        df.reset_index(drop=True, inplace=True)
    df_rows = df.shape[0]
    seed_num = math.floor(df_rows * ratio[0])
    seed = df[:seed_num]
    unlabeled_num = seed_num + (math.floor(df_rows * ratio[1]))
    unlabeled = df[seed_num:unlabeled_num]
    test = df[unlabeled_num:]
    return seed, unlabeled, test

In [7]:
def train_model(seed, model_index):
    cv = 5
    train, test = train_test_split(seed, random_state=random_state, test_size=0.2, shuffle=True)
    X_train, X_test, Y_train, Y_test = train[['text_cleaned']], test[['text_cleaned']], train[['SentimentScore']], test[['SentimentScore']]
    #Wrap in ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("tf", CountVectorizer(stop_words=stopwords_), 'text_cleaned'),
            ("tfidf", TfidfVectorizer(stop_words=stopwords_), 'text_cleaned')]
    )
    #Define the model
    model_lst = [
#                  SVC(kernel='rbf', probability=True, random_state=random_state), #SVC RBF
#                  SVC(kernel='poly', probability=True, degree=4, random_state=random_state), #SVC Poly
                 MultinomialNB(fit_prior=True, class_prior=None), #Multinomial Naive Bayes
                 RandomForestClassifier(random_state=random_state), #Random Forest
                 LogisticRegression(solver='sag', random_state=random_state), #Logistic Regression (Ridge)
                 LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=random_state), #Logistic Regression (Lasso)
            ]
    model = model_lst[model_index]
    
    #Build the pipeline
    pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('clf', OneVsRestClassifier(model, n_jobs=1)),
            ])
    #Train the model
    pipeline.fit(X_train, Y_train)
    # compute the testing accuracy
    prediction = pipeline.predict(pd.DataFrame(X_test))
    #Cross Validation
    val_score = round(np.mean(cross_val_score(pipeline, X_test, Y_test, cv=cv)), round_number)
    return pipeline, val_score

In [8]:
def calc_entropy(x):
    entropy = 0
    for i in x:
        entropy += i * math.log2(1/i)
    return entropy

In [9]:
def choose_unlabeled(pipeline, unlabeled):
    unlabeled_x = unlabeled[['text_cleaned']]
    unlabeled_y = unlabeled[['SentimentScore']]
    prob = pipeline.predict_proba(unlabeled_x)
    unlabeled['prob'] = list(prob)
    unlabeled['entropy'] = unlabeled['prob'].apply(calc_entropy)
    unlabeled.sort_values(by=['entropy'], ascending=False, inplace=True)

In [10]:
def active_learning(pipeline, seed, unlabeled, instances, model_index):
    # Sort the unlabeled data based on informativeness level
    choose_unlabeled(pipeline, unlabeled)
    # Update the unlabeled data and the info_data
    info_data, unlabeled = unlabeled.iloc[:instances], unlabeled.iloc[instances:]
    # Add selected data to the training set
    seed = pd.concat([seed, info_data[['date', 'text', 'SentimentScore', 'text_cleaned']]])
    pipeline, validation_score = train_model(seed, model_index)
    return pipeline, validation_score

## Data Preprocessing

In [11]:
original_stopwords = stopwords.words('english')
additional_stopwords = ['none']
original_stopwords.extend(additional_stopwords)
stopwords_ = set(original_stopwords)

#Selects only the tweets about China
df = df_2022[df_2022['country']=='China']
df = df[['date', 'text', 'id', 'Bucket', 'SentimentScore']]

#Shuffle the data
df = df.sample(frac=1, replace=False, random_state=1) 
df.reset_index(drop=True, inplace=True)

#Step 1: Remove tweets that do not have sentiment score
#Step 2: Average the sentiment score for each unique tweet
df = df.copy()[['date', 'text', 'id', 'SentimentScore']]
df.dropna(subset=['SentimentScore'], inplace=True)

df = pd.DataFrame(df.groupby(['date', 'text', 'id'])['SentimentScore'].mean())
df.reset_index(inplace=True)

#Remove ambiguous labels
range_lst = [0, 1, 2, 3, 4, 5]
df = df[df['SentimentScore'].apply(lambda x: True if x in range_lst else False)]
df['SentimentScore'] = df['SentimentScore'].apply(standardize_sent)

#Remove duplicates
df = df.drop_duplicates(subset=['id']).reset_index(drop=True)
df = df[['date', 'text', 'SentimentScore']]
df["text_cleaned"] = [clean_text(t) for t in df["text"]]

In [12]:
def trial(df, model_name, model_index, training_method, balance, sampling_size, sort_by_time, partition_ratio):
    output = {}
    output['model_name'] = model_name
    output['training_method'] = training_method
    output['balance'] = balance
    output['sampling_size'] = sampling_size
    output['sort_by_time'] = sort_by_time
    output['partition_ratio'] = partition_ratio
        
    # 1. Balance dataset 
    df_1, df_2, df_3 = df[df.SentimentScore=='Negative'], df[df.SentimentScore=='Neutral'], df[df.SentimentScore=='Positive']
    df_lst = [df_1, df_2, df_3]
    
    # 1.1 Balance the label distribution  (33% Negative vs. 33% Neutral vs. 33% Positive)
    if balance:
        sample_size = min(df_1.shape[0], df_2.shape[0], df_3.shape[0])
        if df_1.shape[0] > sample_size:
            df_1 = resample(df_1, replace=False, n_samples=sample_size, random_state=random_state)
        if df_2.shape[0] > sample_size:
            df_2 = resample(df_2, replace=False, n_samples=sample_size, random_state=random_state)
        if df_3.shape[0] > sample_size:
            df_3 = resample(df_3, replace=False, n_samples=sample_size, random_state=random_state)

    # 1.2 Keep the natural label distribution
    seed_1, unlabeled_1, test_1 = partition_data(df_1, partition_ratio, sort_by_time)
    seed_2, unlabeled_2, test_2 = partition_data(df_2, partition_ratio, sort_by_time)
    seed_3, unlabeled_3, test_3 = partition_data(df_3, partition_ratio, sort_by_time)
    seed, unlabeled, test = pd.concat([seed_1, seed_2, seed_3]), pd.concat([unlabeled_1, unlabeled_2, unlabeled_3]), pd.concat([test_1, test_2, test_3])
    output['seed_size'], output['unlabeled_size'], output['test_size'] = seed.shape[0], unlabeled.shape[0], test.shape[0]
    
    initial_seed = seed.copy()
    initial_unlabeled = unlabeled.copy()
    
    # 2. Train the model
    initial_pipeline, initial_val_score = train_model(initial_seed, model_index)
    
    # 3. Active Learning
    if sampling_size == 0:
        pipeline, val_score = initial_pipeline, initial_val_score
        
    # 3.1 Initial Model + Random Sampling
    elif training_method == 'random_sampling':
        if initial_unlabeled.shape[0] >= sampling_size:
            sample_unlabeled = initial_unlabeled.sample(n=sampling_size, replace=False, random_state=random_state)
        else:
            sample_unlabeled = initial_unlabeled.sample(n=sampling_size, replace=True, random_state=random_state)
        seed_and_sample_unlabeled_df = pd.concat([initial_seed, sample_unlabeled])
        pipeline, val_score = train_model(seed_and_sample_unlabeled_df, model_index)
        
    # 3.2 Initial Model + Active Learning
    else:
        pipeline, val_score = active_learning(initial_pipeline, initial_seed, initial_unlabeled, sampling_size, model_index)

    # 4. Report Model Accuracy
    X_test, Y_test = test[['text_cleaned']], test[['SentimentScore']]
    prediction = pipeline.predict(pd.DataFrame(X_test))
    accuracy = round(accuracy_score(Y_test, prediction), round_number)
    
    f1_micro = round(f1_score(np.array(Y_test), prediction, average='micro'), round_number)
    f1_macro = round(f1_score(np.array(Y_test), prediction, average='macro'), round_number)
    f1_weighted = round(f1_score(np.array(Y_test), prediction, average='weighted'), round_number)
    
    precision_micro = round(precision_score(np.array(Y_test), prediction, average='micro'), round_number)
    precision_macro = round(precision_score(np.array(Y_test), prediction, average='macro'), round_number)
    precision_weighted = round(precision_score(np.array(Y_test), prediction, average='weighted'), round_number)
        
    recall_micro = round(recall_score(np.array(Y_test), prediction, average='micro'), round_number)
    recall_macro = round(recall_score(np.array(Y_test), prediction, average='macro'), round_number)
    recall_weighted = round(recall_score(np.array(Y_test), prediction, average='weighted'), round_number)
    
    output['val_score'] = val_score
    output['accuracy'] = accuracy
    output['f1_micro'], output['f1_macro'], output['f1_weighted'] = f1_micro, f1_macro, f1_weighted
    output['precision_micro'], output['precision_macro'], output['precision_weighted'] = precision_micro, precision_macro, precision_weighted
    output['recall_micro'], output['recall_macro'], output['recall_weighted'] = recall_micro, recall_macro, recall_weighted
    return output

In [13]:
# model_name = ['SVC RBF', 'SVC Poly', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 
#               'Logistic Regression', 'Lasso', 'Ridge']
model_name = ['Multinomial Naive Bayes', 'Random Forest Classifier', 'Logistic Regression (Ridge)', 'Logistic Regression (Lasso)']
training_method = ['random_sampling', 'active_learning']
balanced = [True, False]
sampling_size = [0, 100, 200, 300, 400, 500, 600]
sort_by_time = [True, False]
partition_ratio = [[0.2, 0.4, 0.4], [0.5, 0.25, 0.25], [0.8, 0.1, 0.1]]


# model_name = ['Multinomial Naive Bayes', 'Random Forest Classifier', 'Logistic Regression (Ridge)', 'Logistic Regression (Lasso)']
# training_method = ['active_learning']
# balanced = [True]
# sampling_size = [10]
# sort_by_time = [False]
# partition_ratio = [[0.8, 0.1, 0.1]]

In [14]:
model_result_df = pd.DataFrame()
index = 1
for mn in range(len(model_name)):
    for tm in training_method:
        for b in balanced:
            for ss in sampling_size:
                for t in sort_by_time:
                    for r in partition_ratio:
                        print(index)
                        print(model_name[mn])
                        model_output = trial(df, model_name[mn], mn, tm, b, ss, t, r)
                        if index == 0:
                            model_result_df = pd.DataFrame(model_output, index=index)
                        else:
                            model_result_df = model_result_df.append(pd.DataFrame([model_output],index=[index]))
                        index += 1

1
Multinomial Naive Bayes
2
Multinomial Naive Bayes
3
Multinomial Naive Bayes
4
Multinomial Naive Bayes
5
Multinomial Naive Bayes
6
Multinomial Naive Bayes
7
Multinomial Naive Bayes
8
Multinomial Naive Bayes
9
Multinomial Naive Bayes
10
Multinomial Naive Bayes
11
Multinomial Naive Bayes
12
Multinomial Naive Bayes
13
Multinomial Naive Bayes
14
Multinomial Naive Bayes
15
Multinomial Naive Bayes
16
Multinomial Naive Bayes
17
Multinomial Naive Bayes
18
Multinomial Naive Bayes
19
Multinomial Naive Bayes
20
Multinomial Naive Bayes
21
Multinomial Naive Bayes
22
Multinomial Naive Bayes
23
Multinomial Naive Bayes
24
Multinomial Naive Bayes
25
Multinomial Naive Bayes
26
Multinomial Naive Bayes
27
Multinomial Naive Bayes
28
Multinomial Naive Bayes
29
Multinomial Naive Bayes
30
Multinomial Naive Bayes
31
Multinomial Naive Bayes
32
Multinomial Naive Bayes
33
Multinomial Naive Bayes
34
Multinomial Naive Bayes
35
Multinomial Naive Bayes
36
Multinomial Naive Bayes
37
Multinomial Naive Bayes
38
Multino

292
Logistic Regression (Ridge)
293
Logistic Regression (Ridge)
294
Logistic Regression (Ridge)
295
Logistic Regression (Ridge)
296
Logistic Regression (Ridge)
297
Logistic Regression (Ridge)
298
Logistic Regression (Ridge)
299
Logistic Regression (Ridge)
300
Logistic Regression (Ridge)
301
Logistic Regression (Ridge)
302
Logistic Regression (Ridge)
303
Logistic Regression (Ridge)
304
Logistic Regression (Ridge)
305
Logistic Regression (Ridge)
306
Logistic Regression (Ridge)
307
Logistic Regression (Ridge)
308
Logistic Regression (Ridge)
309
Logistic Regression (Ridge)
310
Logistic Regression (Ridge)
311
Logistic Regression (Ridge)
312
Logistic Regression (Ridge)
313
Logistic Regression (Ridge)
314
Logistic Regression (Ridge)
315
Logistic Regression (Ridge)
316
Logistic Regression (Ridge)
317
Logistic Regression (Ridge)
318
Logistic Regression (Ridge)
319
Logistic Regression (Ridge)
320
Logistic Regression (Ridge)
321
Logistic Regression (Ridge)
322
Logistic Regression (Ridge)
323
Logi

549
Logistic Regression (Lasso)
550
Logistic Regression (Lasso)
551
Logistic Regression (Lasso)
552
Logistic Regression (Lasso)
553
Logistic Regression (Lasso)
554
Logistic Regression (Lasso)
555
Logistic Regression (Lasso)
556
Logistic Regression (Lasso)
557
Logistic Regression (Lasso)
558
Logistic Regression (Lasso)
559
Logistic Regression (Lasso)
560
Logistic Regression (Lasso)
561
Logistic Regression (Lasso)
562
Logistic Regression (Lasso)
563
Logistic Regression (Lasso)
564
Logistic Regression (Lasso)
565
Logistic Regression (Lasso)
566
Logistic Regression (Lasso)
567
Logistic Regression (Lasso)
568
Logistic Regression (Lasso)
569
Logistic Regression (Lasso)
570
Logistic Regression (Lasso)
571
Logistic Regression (Lasso)
572
Logistic Regression (Lasso)
573
Logistic Regression (Lasso)
574
Logistic Regression (Lasso)
575
Logistic Regression (Lasso)
576
Logistic Regression (Lasso)


In [15]:
model_result_df.head()
print(model_result_df.shape)

(576, 20)


In [16]:
model_result_df.to_csv('sentiment_model_result.csv')  

In [17]:
model_result_df

Unnamed: 0,model_name,training_method,balance,sampling_size,sort_by_time,partition_ratio,seed_size,unlabeled_size,test_size,val_score,accuracy,f1_micro,f1_macro,f1_weighted,precision_micro,precision_macro,precision_weighted,recall_micro,recall_macro,recall_weighted
1,Multinomial Naive Bayes,random_sampling,True,0,True,"[0.2, 0.4, 0.4]",81,165,171,0.517,0.450,0.450,0.417,0.417,0.450,0.412,0.412,0.450,0.450,0.450
2,Multinomial Naive Bayes,random_sampling,True,0,True,"[0.5, 0.25, 0.25]",207,102,108,0.578,0.583,0.583,0.579,0.579,0.583,0.612,0.612,0.583,0.583,0.583
3,Multinomial Naive Bayes,random_sampling,True,0,True,"[0.8, 0.1, 0.1]",333,39,45,0.612,0.600,0.600,0.573,0.573,0.600,0.609,0.609,0.600,0.600,0.600
4,Multinomial Naive Bayes,random_sampling,True,0,False,"[0.2, 0.4, 0.4]",81,165,171,0.467,0.573,0.573,0.504,0.504,0.573,0.520,0.520,0.573,0.573,0.573
5,Multinomial Naive Bayes,random_sampling,True,0,False,"[0.5, 0.25, 0.25]",207,102,108,0.522,0.620,0.620,0.593,0.593,0.620,0.624,0.624,0.620,0.620,0.620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
572,Logistic Regression (Lasso),active_learning,False,500,True,"[0.5, 0.25, 0.25]",3682,1840,1843,0.962,0.960,0.960,0.410,0.945,0.960,0.654,0.944,0.960,0.383,0.960
573,Logistic Regression (Lasso),active_learning,False,500,True,"[0.8, 0.1, 0.1]",5891,735,739,0.962,0.958,0.958,0.393,0.941,0.958,0.542,0.933,0.958,0.372,0.958
574,Logistic Regression (Lasso),active_learning,False,500,False,"[0.2, 0.4, 0.4]",1471,2944,2950,0.942,0.958,0.958,0.353,0.940,0.958,0.445,0.928,0.958,0.348,0.958
575,Logistic Regression (Lasso),active_learning,False,500,False,"[0.5, 0.25, 0.25]",3682,1840,1843,0.952,0.960,0.960,0.396,0.944,0.960,0.559,0.938,0.960,0.374,0.960
