In [1]:
import numpy as np
import pandas as pd
import math
import re
from nltk.corpus import stopwords
import seaborn as sns
import unidecode
from sklearn.utils import resample
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns 
warnings.filterwarnings('ignore')

In [2]:
# Load the raw tweet tables; the paths are relative to the notebook's working directory.
# NOTE(review): only df_2022 is used by the cells visible in this notebook.
df_2022 = pd.read_csv('../data/tweet_data_2022.csv')
df_2023 = pd.read_csv('../data/tweet_data_2023.csv')

### Helper Functions: Preprocessing, Partitioning, Training, and Active Learning

In [3]:
# Shared settings, read as module-level globals by the functions below.
round_number = 3  # decimal places kept when rounding the reported scores
random_state = 42  # seed passed to train_test_split, resample, DataFrame.sample and the models
categories = ['Bucket']  # label column name; not referenced by the cells visible here

In [4]:
def standardize_bucket(bucket):
    """Collapse a raw bucket label into one of the two classes used for modelling.

    '1' / '1.0' map to '1'; '2' / '3' / '2.0' / '3.0' map to '2 or 3'.
    Any other value is returned unchanged.
    """
    if bucket in ('1.0', '1'):
        return '1'
    if bucket in ('2', '3', '2.0', '3.0'):
        return '2 or 3'
    return bucket

In [5]:
def clean_text(text):
    """Turn a raw tweet into a cleaned, space-joined string of lowercase words.

    Processing order: lowercase; drop apostrophes; drop @mentions, #hashtags and
    links; replace every character that is not an ASCII letter with a space;
    split on whitespace; drop stop words (module-level ``stopwords_``).

    Parameters
    ----------
    text : str or missing value
        The raw tweet. Missing values (float NaN, None, ...) are accepted.

    Returns
    -------
    str
        The cleaned text, or "" when ``text`` is not a string.
    """
    # Missing tweets show up as float NaN (or None). The previous check used
    # `np.float`, which was removed in NumPy 1.24 and raises AttributeError there.
    if not isinstance(text, str):
        return ""
    temp = text.lower()  # to lower case
    temp = re.sub("'", "", temp)  # drop apostrophes so contractions stay one token
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)  # remove @s
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)  # remove hashtags
    temp = re.sub(r'http\S+', '', temp)  # remove links
    temp = re.sub(r"www\.\S+", "", temp)  # remove links (dot escaped so only a literal "www." matches)
    temp = re.sub(r'\n|[^a-zA-Z]', ' ', temp)  # replace newlines, punctuation, digits and non-ASCII letters
    temp = temp.replace("\n", " ").split()
    temp = [w for w in temp if not w in stopwords_]  # remove stopwords
    # NOTE(review): after the substitution above every token consists of a-z only,
    # so the two steps below currently change nothing. If transliterating accented
    # letters is intended, unidecode would have to run before the non-letter
    # substitution -- confirm the intent before reordering.
    temp = [w for w in temp if not w.isdigit()]  # remove numbers
    temp = [unidecode.unidecode(w) for w in temp]  # turn non-English letters into English letters
    return " ".join(temp)

In [6]:
def partition_data(df, ratio, time):
    """Split a frame into consecutive seed / unlabeled / test slices.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. Needs a 'date' column when ``time`` is truthy.
    ratio : sequence of float
        ``ratio[0]`` is the seed fraction and ``ratio[1]`` the unlabeled fraction.
        The test slice is whatever remains, so ``ratio[2]`` is not read.
    time : bool
        If truthy, rows are ordered by 'date' (and re-indexed from 0) before
        slicing, so the seed holds the earliest rows and the test the latest.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(seed, unlabeled, test)``.
    """
    if time:
        # Sort into a new frame: the caller's DataFrame is left untouched
        # (the previous in-place sort/reset mutated the argument).
        df = df.sort_values(by=['date']).reset_index(drop=True)
    n_rows = df.shape[0]
    seed_end = math.floor(n_rows * ratio[0])
    unlabeled_end = seed_end + math.floor(n_rows * ratio[1])
    seed = df[:seed_end]
    unlabeled = df[seed_end:unlabeled_end]
    test = df[unlabeled_end:]
    return seed, unlabeled, test

In [7]:
def train_model(seed, model_index):
    """Fit a text-classification pipeline on ``seed`` and return it with a CV score.

    Parameters
    ----------
    seed : pandas.DataFrame
        Labeled rows with 'text_cleaned' (features) and 'Bucket' (label) columns.
    model_index : int
        Position of the classifier to use in ``model_lst`` below.

    Returns
    -------
    tuple
        ``(pipeline, val_score)``: the pipeline fitted on 80% of ``seed`` and the
        mean 5-fold cross-validation score rounded to ``round_number`` decimals.

    Notes
    -----
    Reads the module-level ``random_state``, ``round_number`` and ``stopwords_``.
    """
    cv = 5
    train, test = train_test_split(seed, random_state=random_state, test_size=0.2, shuffle=True)
    X_train, X_test, Y_train, Y_test = train[['text_cleaned']], test[['text_cleaned']], train[['Bucket']], test[['Bucket']]
    # The vectorizers document `stop_words` as a list; newer scikit-learn releases
    # reject a set, so pass a (deterministically ordered) list instead.
    stop_word_list = sorted(stopwords_)
    # Wrap in ColumnTransformer: raw counts and TF-IDF weights of the same column
    # are concatenated into one feature matrix.
    preprocessor = ColumnTransformer(
        transformers=[
            ("tf", CountVectorizer(stop_words=stop_word_list), 'text_cleaned'),
            ("tfidf", TfidfVectorizer(stop_words=stop_word_list), 'text_cleaned')]
    )
    # Define the model. The SVC variants are disabled (commented out), so the
    # indices below refer to the four remaining entries only.
    model_lst = [
#                  SVC(kernel='rbf', probability=True, random_state=random_state), #SVC RBF
#                  SVC(kernel='poly', probability=True, degree=4, random_state=random_state), #SVC Poly
                 BernoulliNB(fit_prior=True, class_prior=None), #Naive Bayes
                 RandomForestClassifier(random_state=random_state), #Random Forest
                 LogisticRegression(solver='sag', random_state=random_state), #Logistic Regression (Ridge)
                 LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=random_state), #Logistic Regression (Lasso)
            ]
    model = model_lst[model_index]

    # Build the pipeline
    pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('clf', OneVsRestClassifier(model, n_jobs=1)),
            ])
    # Train the model
    pipeline.fit(X_train, Y_train)
    # Cross validation. NOTE(review): this cross-validates on the held-out 20%
    # split only (cross_val_score refits the pipeline on folds of that split) --
    # confirm this is the intended validation set.
    val_score = round(np.mean(cross_val_score(pipeline, X_test, Y_test, cv=cv)), round_number)
    # The previous `return pipeline, val_scor` raised NameError on every call.
    return pipeline, val_score

In [8]:
def calc_entropy(x):
    """Return the Shannon entropy (in bits) of a probability vector.

    Parameters
    ----------
    x : iterable of float
        Probabilities, e.g. one row of a ``predict_proba`` result.

    Returns
    -------
    float
        Sum of ``p * log2(1 / p)`` over the non-zero entries (0 for empty input).
    """
    entropy = 0
    for p in x:
        # A zero probability contributes nothing (p * log2(1/p) -> 0 as p -> 0).
        # Evaluating the formula at 0 raised ZeroDivisionError for Python floats
        # and produced NaN for NumPy floats.
        if p == 0:
            continue
        entropy += p * math.log2(1 / p)
    return entropy

In [9]:
def choose_unlabeled(pipeline, unlabeled):
    """Rank the unlabeled pool in place, most uncertain predictions first.

    Adds two columns to ``unlabeled`` -- 'prob' (the class-probability row from
    ``pipeline.predict_proba``) and 'entropy' (``calc_entropy`` of that row) --
    and sorts the frame in place by descending entropy.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict_proba``
    unlabeled : pandas.DataFrame
        Must contain a 'text_cleaned' column. It is modified in place.

    Returns
    -------
    None
    """
    # The unused 'Bucket' lookup was dropped: labels are not needed for ranking.
    prob = pipeline.predict_proba(unlabeled[['text_cleaned']])
    unlabeled['prob'] = list(prob)
    unlabeled['entropy'] = unlabeled['prob'].apply(calc_entropy)
    unlabeled.sort_values(by=['entropy'], ascending=False, inplace=True)

In [10]:
def active_learning(pipeline, seed, unlabeled, instances, model_index):
    """Retrain after adding the most uncertain unlabeled rows to the seed set.

    ``unlabeled`` is ranked in place by ``choose_unlabeled`` (highest entropy
    first); its first ``instances`` rows are appended to ``seed`` and a new model
    of type ``model_index`` is trained on the enlarged set.

    Returns the ``(pipeline, validation_score)`` pair produced by ``train_model``.
    """
    # Rank the pool by how informative each row is for the current model.
    choose_unlabeled(pipeline, unlabeled)
    # Take the top-ranked rows, keeping only the columns the seed set carries.
    most_informative = unlabeled.iloc[:instances]
    selected_rows = most_informative[['date', 'text', 'Bucket', 'text_cleaned']]
    enlarged_seed = pd.concat([seed, selected_rows])
    # Retrain from scratch on the enlarged training set.
    retrained_pipeline, validation_score = train_model(enlarged_seed, model_index)
    return retrained_pipeline, validation_score

## Data Preprocessing

In [11]:
# Build the stop-word set read (as a global) by clean_text and the vectorizers
# in train_model: NLTK's English list plus the literal token 'none'.
original_stopwords = stopwords.words('english')
additional_stopwords = ['none']
original_stopwords.extend(additional_stopwords)
stopwords_ = set(original_stopwords)

# Keep only the 2022 tweets whose country is 'China', and only the columns used below
df = df_2022[df_2022['country']=='China']
df = df[['date', 'text', 'id', 'Bucket', 'SentimentScore']]

# Shuffle the rows (fixed seed) and renumber the index
df = df.sample(frac=1, replace=False, random_state=1) 
df.reset_index(drop=True, inplace=True)
# Collapse the raw bucket labels to '1' / '2 or 3' (other values are left as-is)
df['Bucket'] = df['Bucket'].apply(standardize_bucket)
# Remove tweets whose id appears with more than one distinct bucket label:
# count the distinct labels per id, join the count back, keep ids with exactly one
df_bucket_count = pd.DataFrame(df.groupby('id')['Bucket'].nunique())
df_bucket_count.reset_index(inplace=True)
df_bucket_count.columns = ['tweet_id', 'bucket_num']
df = df.merge(df_bucket_count, left_on='id', right_on='tweet_id')
df = df[df['bucket_num'] == 1]
# Keep only rows labelled '1' or '2 or 3'; this also drops tweets without a bucket (null)
df = df[(df['Bucket'] == '1') | (df['Bucket'] == '2 or 3')]
# Remove duplicates: keep one row per tweet id
df = df.drop_duplicates(subset=['id']).reset_index(drop=True)
# Only date, text and label are needed from here on; add the cleaned text column
df = df[['date', 'text', 'Bucket']]
df["text_cleaned"] = [clean_text(t) for t in df["text"]]

In [12]:
def trial(df, model_name, model_index, training_method, balance, sampling_size, sort_by_time, partition_ratio):
    """Run one experiment configuration and return its settings and test metrics.

    Steps: split ``df`` by label, optionally down-sample the larger class,
    partition each class into seed / unlabeled / test, train on the seed,
    optionally enlarge the seed with ``sampling_size`` rows from the unlabeled
    pool (random or highest-entropy), then score the final pipeline on the test
    partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Bucket' (values '1' / '2 or 3') and 'text_cleaned'; 'date' and
        'text' are also read by the partitioning and active-learning helpers.
    model_name : str
        Label stored in the result row.
    model_index : int
        Classifier index forwarded to ``train_model``.
    training_method : str
        'random_sampling' selects random rows; any other value selects active learning.
    balance : bool
        Down-sample the larger class to the size of the smaller one.
    sampling_size : int
        Number of unlabeled rows to add; 0 keeps the initial model.
    sort_by_time : bool
        Forwarded to ``partition_data``.
    partition_ratio : sequence of float
        Forwarded to ``partition_data``.

    Returns
    -------
    dict
        The settings above plus partition sizes, ``val_score`` and the test-set
        accuracy, F1, precision, recall and specificity.
    """
    # Settings go in first so they lead the result row.
    output = {
        'model_name': model_name,
        'training_method': training_method,  # random_sampling, active_learning
        'balance': balance,
        'sampling_size': sampling_size,
        'sort_by_time': sort_by_time,
        'partition_ratio': partition_ratio,
    }

    # 1. Split the data by label
    bucket_1 = df[df.Bucket == '1']
    bucket_2_3 = df[df.Bucket == '2 or 3']

    # 1.1 Optionally balance the labels (50% Bucket 1 vs. 50% Non-Bucket 1) by
    #     down-sampling the larger class; otherwise the natural distribution is kept
    if balance:
        target_rows = min(bucket_1.shape[0], bucket_2_3.shape[0])
        if bucket_1.shape[0] > target_rows:
            bucket_1 = resample(bucket_1, replace=False, n_samples=target_rows, random_state=random_state)
        if bucket_2_3.shape[0] > target_rows:
            bucket_2_3 = resample(bucket_2_3, replace=False, n_samples=target_rows, random_state=random_state)

    # 1.2 Partition each class on its own, then stack the matching pieces
    parts_1 = partition_data(bucket_1, partition_ratio, sort_by_time)
    parts_2_3 = partition_data(bucket_2_3, partition_ratio, sort_by_time)
    seed, unlabeled, test = [
        pd.concat([part_1, part_2_3]) for part_1, part_2_3 in zip(parts_1, parts_2_3)
    ]
    output['seed_size'] = seed.shape[0]
    output['unlabeled_size'] = unlabeled.shape[0]
    output['test_size'] = test.shape[0]

    initial_seed = seed.copy()
    initial_unlabeled = unlabeled.copy()

    # 2. Train the initial model on the seed set
    initial_pipeline, initial_val_score = train_model(initial_seed, model_index)

    # 3. Optionally enlarge the training set from the unlabeled pool
    if sampling_size == 0:
        # 3.0 Nothing to add: keep the initial model
        pipeline, val_score = initial_pipeline, initial_val_score
    elif training_method == 'random_sampling':
        # 3.1 Initial Model + Random Sampling
        #     (sample with replacement only when the pool is smaller than the request)
        with_replacement = initial_unlabeled.shape[0] < sampling_size
        sample_unlabeled = initial_unlabeled.sample(n=sampling_size, replace=with_replacement, random_state=random_state)
        seed_and_sample_unlabeled_df = pd.concat([initial_seed, sample_unlabeled])
        pipeline, val_score = train_model(seed_and_sample_unlabeled_df, model_index)
    else:
        # 3.2 Initial Model + Active Learning
        pipeline, val_score = active_learning(initial_pipeline, initial_seed, initial_unlabeled, sampling_size, model_index)

    # 4. Report model performance on the held-out test partition
    X_test, Y_test = test[['text_cleaned']], test[['Bucket']]
    prediction = pipeline.predict(pd.DataFrame(X_test))
    y_true = np.array(Y_test)
    output['val_score'] = val_score
    output['accuracy'] = round(accuracy_score(Y_test, prediction), round_number)
    output['f1_score'] = round(f1_score(y_true, prediction, pos_label='1'), round_number)
    output['precision'] = round(precision_score(y_true, prediction, pos_label='1', average='binary'), round_number)
    output['recall'] = round(recall_score(y_true, prediction, pos_label='1', average='binary'), round_number)
    # Specificity is the recall of the other class
    output['specificity'] = round(recall_score(y_true, prediction, pos_label='2 or 3', average='binary'), round_number)
    return output
    

In [13]:
# Experiment grid: the loop in the next cell calls trial() once for every
# combination of the lists below.
# Disabled alternative model list (includes the SVC variants):
# model_name = ['SVC RBF', 'SVC Poly', 'Bernoulli Naive Bayes', 'Random Forest Classifier', 
#               'Logistic Regression', 'Lasso', 'Ridge']
# NOTE: the position of each name is passed on as model_index, so the order
# must match model_lst inside train_model.
model_name = ['Bernoulli Naive Bayes', 'Random Forest Classifier', 'Logistic Regression (Ridge)', 'Logistic Regression (Lasso)']
training_method = ['random_sampling', 'active_learning']
balanced = [True, False]
sampling_size = [0, 50, 100, 300, 600]  # unlabeled rows added to the seed set (0 = initial model only)
sort_by_time = [True, False]
partition_ratio = [[0.5, 0.25, 0.25], [0.6, 0.2, 0.2], [0.8, 0.1, 0.1]]  # [seed, unlabeled, test] fractions


# Disabled alternative: a reduced grid with a single setting per dimension
# (one run per model).
# model_name = ['Bernoulli Naive Bayes', 'Random Forest Classifier', 'Logistic Regression (Ridge)', 'Logistic Regression (Lasso)']
# training_method = ['active_learning']
# balanced = [True]
# sampling_size = [10]
# sort_by_time = [False]
# partition_ratio = [[0.8, 0.1, 0.1]]

In [14]:
# Run trial() for every combination in the experiment grid and collect one
# result row per run. Rows are gathered in a list and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row is quadratic. (The old `index == 0` branch could never run
# because the counter starts at 1.)
trial_records = []
index = 1  # 1-based run counter; also used as the row label of the result frame
for mn, current_model_name in enumerate(model_name):
    for tm in training_method:
        for b in balanced:
            for ss in sampling_size:
                for t in sort_by_time:
                    for r in partition_ratio:
                        print(index)
                        print(current_model_name)
                        model_output = trial(df, current_model_name, mn, tm, b, ss, t, r)
                        trial_records.append(model_output)
                        index += 1

# Row labels run from 1 to the number of trials, as before.
model_result_df = pd.DataFrame(trial_records, index=range(1, len(trial_records) + 1))

1
Bernoulli Naive Bayes
2
Bernoulli Naive Bayes
3
Bernoulli Naive Bayes
4
Bernoulli Naive Bayes
5
Bernoulli Naive Bayes
6
Bernoulli Naive Bayes
7
Bernoulli Naive Bayes
8
Bernoulli Naive Bayes
9
Bernoulli Naive Bayes
10
Bernoulli Naive Bayes
11
Bernoulli Naive Bayes
12
Bernoulli Naive Bayes
13
Bernoulli Naive Bayes
14
Bernoulli Naive Bayes
15
Bernoulli Naive Bayes
16
Bernoulli Naive Bayes
17
Bernoulli Naive Bayes
18
Bernoulli Naive Bayes
19
Bernoulli Naive Bayes
20
Bernoulli Naive Bayes
21
Bernoulli Naive Bayes
22
Bernoulli Naive Bayes
23
Bernoulli Naive Bayes
24
Bernoulli Naive Bayes
25
Bernoulli Naive Bayes
26
Bernoulli Naive Bayes
27
Bernoulli Naive Bayes
28
Bernoulli Naive Bayes
29
Bernoulli Naive Bayes
30
Bernoulli Naive Bayes
31
Bernoulli Naive Bayes
32
Bernoulli Naive Bayes
33
Bernoulli Naive Bayes
34
Bernoulli Naive Bayes
35
Bernoulli Naive Bayes
36
Bernoulli Naive Bayes
37
Bernoulli Naive Bayes
38
Bernoulli Naive Bayes
39
Bernoulli Naive Bayes
40
Bernoulli Naive Bayes
41
Bernou

295
Logistic Regression (Ridge)
296
Logistic Regression (Ridge)
297
Logistic Regression (Ridge)
298
Logistic Regression (Ridge)
299
Logistic Regression (Ridge)
300
Logistic Regression (Ridge)
301
Logistic Regression (Ridge)
302
Logistic Regression (Ridge)
303
Logistic Regression (Ridge)
304
Logistic Regression (Ridge)
305
Logistic Regression (Ridge)
306
Logistic Regression (Ridge)
307
Logistic Regression (Ridge)
308
Logistic Regression (Ridge)
309
Logistic Regression (Ridge)
310
Logistic Regression (Ridge)
311
Logistic Regression (Ridge)
312
Logistic Regression (Ridge)
313
Logistic Regression (Ridge)
314
Logistic Regression (Ridge)
315
Logistic Regression (Ridge)
316
Logistic Regression (Ridge)
317
Logistic Regression (Ridge)
318
Logistic Regression (Ridge)
319
Logistic Regression (Ridge)
320
Logistic Regression (Ridge)
321
Logistic Regression (Ridge)
322
Logistic Regression (Ridge)
323
Logistic Regression (Ridge)
324
Logistic Regression (Ridge)
325
Logistic Regression (Ridge)
326
Logi

In [15]:
# Preview the first rows of the results grid.
model_result_df.head()

Unnamed: 0,model_name,training_method,balance,sampling_size,sort_by_time,partition_ratio,seed_size,unlabeled_size,test_size,val_score,accuracy,f1_score,precision,recall,specificity
1,Bernoulli Naive Bayes,random_sampling,True,0,True,"[0.5, 0.25, 0.25]",5073,2536,2538,0.796,0.835,0.905,0.836,0.987,0.24
2,Bernoulli Naive Bayes,random_sampling,True,0,True,"[0.6, 0.2, 0.2]",6088,2029,2030,0.81,0.83,0.902,0.835,0.98,0.24
3,Bernoulli Naive Bayes,random_sampling,True,0,True,"[0.8, 0.1, 0.1]",8117,1014,1016,0.807,0.833,0.903,0.839,0.978,0.266
4,Bernoulli Naive Bayes,random_sampling,True,0,False,"[0.5, 0.25, 0.25]",5073,2536,2538,0.796,0.837,0.904,0.851,0.964,0.337
5,Bernoulli Naive Bayes,random_sampling,True,0,False,"[0.6, 0.2, 0.2]",6088,2029,2030,0.814,0.84,0.905,0.859,0.956,0.387


In [16]:
# Save the results grid as CSV in the notebook's working directory (the row labels are written too).
model_result_df.to_csv('bucket_model_result.csv')  

In [17]:
# Sanity check: (rows, columns) of the results grid -- one row per trial() call.
model_result_df.shape

(480, 15)

In [18]:
# Show the results grid (pandas abbreviates the middle rows in the rendered output).
model_result_df

Unnamed: 0,model_name,training_method,balance,sampling_size,sort_by_time,partition_ratio,seed_size,unlabeled_size,test_size,val_score,accuracy,f1_score,precision,recall,specificity
1,Bernoulli Naive Bayes,random_sampling,True,0,True,"[0.5, 0.25, 0.25]",5073,2536,2538,0.796,0.835,0.905,0.836,0.987,0.240
2,Bernoulli Naive Bayes,random_sampling,True,0,True,"[0.6, 0.2, 0.2]",6088,2029,2030,0.810,0.830,0.902,0.835,0.980,0.240
3,Bernoulli Naive Bayes,random_sampling,True,0,True,"[0.8, 0.1, 0.1]",8117,1014,1016,0.807,0.833,0.903,0.839,0.978,0.266
4,Bernoulli Naive Bayes,random_sampling,True,0,False,"[0.5, 0.25, 0.25]",5073,2536,2538,0.796,0.837,0.904,0.851,0.964,0.337
5,Bernoulli Naive Bayes,random_sampling,True,0,False,"[0.6, 0.2, 0.2]",6088,2029,2030,0.814,0.840,0.905,0.859,0.956,0.387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,Logistic Regression (Lasso),active_learning,False,600,True,"[0.6, 0.2, 0.2]",6088,2029,2030,0.784,0.837,0.904,0.848,0.969,0.320
477,Logistic Regression (Lasso),active_learning,False,600,True,"[0.8, 0.1, 0.1]",8117,1014,1016,0.820,0.844,0.909,0.850,0.978,0.324
478,Logistic Regression (Lasso),active_learning,False,600,False,"[0.5, 0.25, 0.25]",5073,2536,2538,0.786,0.831,0.896,0.876,0.918,0.490
479,Logistic Regression (Lasso),active_learning,False,600,False,"[0.6, 0.2, 0.2]",6088,2029,2030,0.814,0.839,0.902,0.881,0.923,0.511
