In [1]:
import re
import math
import unidecode
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statistics import mode
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.utils import resample
from sklearn.metrics import plot_confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
warnings.filterwarnings('ignore')

In [2]:
# Load the per-year tweet exports; paths are relative to the notebook's working directory.
df_2022 = pd.read_csv('../data/tweet_data_2022.csv')
# NOTE(review): df_2023 is not referenced by any other visible cell — confirm it is still needed.
df_2023 = pd.read_csv('../data/tweet_data_2023.csv')

### Data Preprocessing Functions

In [3]:
# Number of decimal places each evaluation metric is rounded to in trial().
round_number = 3
# Seed passed to train_test_split and to the class-balancing resample calls.
random_state = 42
# NOTE(review): `categories` is not read by any visible cell — confirm before removing.
categories = ['Bucket']

In [4]:
def standardize_bucket(bucket):
    """Collapse a raw bucket label into one of the two classes used downstream.

    '1' and '1.0' map to '1'; '2', '3', '2.0' and '3.0' map to '2 or 3'.
    Every other value (including non-string values) is returned unchanged.
    """
    bucket_one_labels = ('1.0', '1')
    bucket_two_or_three_labels = ('2', '3', '2.0', '3.0')

    if bucket in bucket_one_labels:
        return '1'
    if bucket in bucket_two_or_three_labels:
        return '2 or 3'
    return bucket

In [5]:
def clean_text(text):
    """Normalise a raw tweet into a space-separated string of lowercase tokens.

    Steps: lowercase, drop apostrophes, strip @mentions, #hashtags and links,
    replace every non-letter with a space, then drop tokens found in the
    module-level `stopwords_` set.

    Parameters
    ----------
    text : str or any
        Raw tweet text. Anything that is not a `str` (for example the float
        NaN pandas uses for a missing cell, or None) yields an empty string.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # `np.float` was only an alias of the builtin float and no longer exists in
    # current NumPy, so test for "not a string" directly instead.
    if not isinstance(text, str):
        return ""
    temp = text.lower() # to lower case
    temp = re.sub("'", "", temp) # drop apostrophes so contractions stay one token
    temp = re.sub("@[A-Za-z0-9_]+", "", temp) # remove @s
    temp = re.sub("#[A-Za-z0-9_]+", "", temp) # remove hashtags
    temp = re.sub(r'http\S+', '', temp) # remove links
    temp = re.sub(r"www\.\S+", "", temp) # remove links (dot escaped so only a literal "www." matches)
    temp = re.sub(r'\n|[^a-zA-Z]', ' ', temp) # replace newlines, punctuation, digits and non-ASCII with spaces
    tokens = temp.split()
    tokens = [w for w in tokens if w not in stopwords_] # remove stopwords
    # NOTE(review): after the substitution above every token consists of a-z only,
    # so the two steps below have nothing left to act on. Transliteration would
    # have to run before the non-letter substitution to take effect — confirm intent.
    tokens = [w for w in tokens if not w.isdigit()] # remove numbers
    tokens = [unidecode.unidecode(w) for w in tokens] # turn non-English letters into English letters
    return " ".join(tokens)

In [6]:
def partition_data(df, ratio, time):
    """Split `df` into consecutive seed / unlabeled / test slices.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to partition. The caller's frame is never modified.
    ratio : sequence of float
        `ratio[0]` is the seed fraction and `ratio[1]` the unlabeled fraction.
        Any further entries are not read: the test slice is whatever remains.
    time : bool
        When truthy, rows are ordered by the 'date' column (and reindexed from
        0) before slicing; otherwise the incoming row order is used as is.

    Returns
    -------
    tuple of pandas.DataFrame
        `(seed, unlabeled, test)`.
    """
    if time:
        # Sort a copy instead of mutating the argument in place: callers pass
        # filtered slices of a larger frame, which must stay untouched.
        df = df.sort_values(by=['date']).reset_index(drop=True)
    n_rows = df.shape[0]
    seed_end = math.floor(n_rows * ratio[0])
    unlabeled_end = seed_end + math.floor(n_rows * ratio[1])
    seed = df[:seed_end]
    unlabeled = df[seed_end:unlabeled_end]
    test = df[unlabeled_end:]
    return seed, unlabeled, test

In [7]:
def train_model(seed):
    """Fit a five-member committee on `seed` and score its majority vote.

    `seed` is split 80/20 (shuffled, seeded with the module-level
    `random_state`). Each committee member is a pipeline of bag-of-words plus
    TF-IDF features over 'text_cleaned' feeding a one-vs-rest wrapped
    classifier trained on 'Bucket'.

    Parameters
    ----------
    seed : pandas.DataFrame
        Labelled rows; the 'text_cleaned' and 'Bucket' columns are read.

    Returns
    -------
    tuple
        `(pipelines, accuracy)`: the list of fitted pipelines, and the accuracy
        of their majority vote on the held-out 20% of `seed`.
    """
    train, test = train_test_split(seed, random_state=random_state, test_size=0.2, shuffle=True)
    X_train, X_test = train[['text_cleaned']], test[['text_cleaned']]
    Y_train, Y_test = train[['Bucket']], test[['Bucket']]

    # scikit-learn documents `stop_words` as 'english', a list or None, so hand
    # the vectorizers a list rather than the module-level set.
    stop_word_list = sorted(stopwords_)

    def make_preprocessor():
        # A Pipeline fits its steps in place, so every committee member gets
        # its own transformer instead of all of them refitting one shared object.
        return ColumnTransformer(
            transformers=[
                ("tf", CountVectorizer(stop_words=stop_word_list), 'text_cleaned'),
                ("tfidf", TfidfVectorizer(stop_words=stop_word_list), 'text_cleaned')]
        )

    # Committee members, all with library-default hyperparameters
    model_lst = [
                SVC(),
                KNeighborsClassifier(),
                DecisionTreeClassifier(),
                RandomForestClassifier(),
                AdaBoostClassifier(),
            ]

    pipelines, member_preds = [], []
    for model in model_lst:
        pipeline = Pipeline([
                    ('preprocessor', make_preprocessor()),
                    ('clf', OneVsRestClassifier(model, n_jobs=1)),
                ])
        pipeline.fit(X_train, Y_train)
        pipelines.append(pipeline)
        # Keep each member's predictions on the held-out split for the vote below
        member_preds.append(pipeline.predict(X_test))

    # Rows = held-out samples, columns = committee members
    all_preds = np.array(member_preds).transpose()
    # Majority vote per sample
    final_preds = [mode(i) for i in all_preds]
    accuracy = accuracy_score(Y_test, final_preds)
    return pipelines, accuracy

In [8]:
def calc_entropy(lst):
    """Return the base-2 Shannon entropy of the label distribution in `lst`.

    Each distinct label contributes p * log2(1 / p), where p is the share of
    entries equal to that label. An empty input yields 0.
    """
    votes = np.array(lst)
    n_votes = len(lst)
    # Share of the votes received by each distinct label
    shares = (sum(votes == label) / n_votes for label in set(lst))
    return sum(share * math.log2(1 / share) for share in shares)

In [9]:
def choose_unlabeled(pipelines, unlabeled):
    """Rank `unlabeled` in place by how much the committee disagrees on each row.

    Every pipeline predicts a label for each row's 'text_cleaned'; the entropy
    of those votes is the disagreement score. Two columns are added to
    `unlabeled` ('all_preds' and 'entropy') and the frame is then sorted in
    place with the highest-entropy rows first.

    Parameters
    ----------
    pipelines : list
        Fitted objects exposing `predict`.
    unlabeled : pandas.DataFrame
        Candidate rows; only 'text_cleaned' is read, so no label column is
        required here. Modified in place.

    Returns
    -------
    None
    """
    unlabeled_x = unlabeled[['text_cleaned']]
    # Rows = candidate samples, columns = committee members
    all_preds = np.array([pl.predict(unlabeled_x) for pl in pipelines]).transpose()
    unlabeled['all_preds'] = list(all_preds)
    unlabeled['entropy'] = unlabeled['all_preds'].apply(calc_entropy)
    # In-place sort is part of the contract: the caller reads the reordered frame
    unlabeled.sort_values(by=['entropy'], ascending=False, inplace=True)

In [10]:
def active_learning(pipelines, seed, unlabeled, instances):
    """Retrain the committee after adding the most contested unlabeled rows.

    `unlabeled` is ranked in place by choose_unlabeled, its first `instances`
    rows are appended to `seed` (with their existing 'Bucket' labels), and a new
    committee is trained on the enlarged set.

    Returns the `(pipelines, accuracy)` pair produced by train_model.
    """
    # Reorders `unlabeled` in place, highest vote entropy first
    choose_unlabeled(pipelines, unlabeled)

    training_columns = ['date', 'text', 'Bucket', 'text_cleaned']
    most_informative = unlabeled.iloc[:instances][training_columns]

    enlarged_seed = pd.concat([seed, most_informative])
    return train_model(enlarged_seed)

## Data Preprocessing

In [11]:
# Build the stop-word set read by clean_text and the vectorizers:
# NLTK's English list plus the literal token 'none'.
original_stopwords = stopwords.words('english')
additional_stopwords = ['none']
original_stopwords.extend(additional_stopwords)
stopwords_ = set(original_stopwords)

# Select only the 2022 tweets whose country is 'China' and keep the columns used below
df = df_2022[df_2022['country']=='China']
df = df[['date', 'text', 'id', 'Bucket', 'SentimentScore']]

# Shuffle the rows (fixed seed) and renumber the index from 0
df = df.sample(frac=1, replace=False, random_state=1) 
df.reset_index(drop=True, inplace=True)
# Standardize the bucket labels to '1' / '2 or 3'
df['Bucket'] = df['Bucket'].apply(standardize_bucket)
# Count the distinct bucket labels per tweet id and keep only ids with exactly one,
# i.e. drop tweets that appear under both buckets
df_bucket_count = pd.DataFrame(df.groupby('id')['Bucket'].nunique())
df_bucket_count.reset_index(inplace=True)
df_bucket_count.columns = ['tweet_id', 'bucket_num']
df = df.merge(df_bucket_count, left_on='id', right_on='tweet_id')
df = df[df['bucket_num'] == 1]
# Keep only rows labelled '1' or '2 or 3'; this drops missing and any other labels
df = df[(df['Bucket'] == '1') | (df['Bucket'] == '2 or 3')]
# Keep one row per tweet id
df = df.drop_duplicates(subset=['id']).reset_index(drop=True)
# Narrow to the modelling columns and add the cleaned text
df = df[['date', 'text', 'Bucket']]
df["text_cleaned"] = [clean_text(t) for t in df["text"]]

In [15]:
def trial(df, model_names, training_method, balance, sampling_size, sort_by_time, partition_ratio, n_runs=5):
    """Run one experiment configuration several times and average the test metrics.

    Each run splits both classes into seed / unlabeled / test partitions,
    trains the committee on the seed, optionally enlarges the training set with
    `sampling_size` rows from the unlabeled pool (random or entropy-ranked),
    and scores the committee's majority vote on the test partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with 'date', 'text', 'Bucket' and 'text_cleaned'; 'Bucket' holds
        '1' or '2 or 3'.
    model_names : str
        Free-text description of the committee, copied into the output.
    training_method : str
        'random_sampling' draws the extra rows at random; any other value uses
        active_learning. Ignored when `sampling_size` is 0.
    balance : bool
        When truthy, the larger class is downsampled to the size of the smaller.
    sampling_size : int
        Number of unlabeled rows added to the seed; 0 evaluates the seed-only model.
    sort_by_time : bool
        Forwarded to partition_data (order rows by 'date' before slicing).
    partition_ratio : sequence of float
        Forwarded to partition_data.
    n_runs : int, optional
        Number of repetitions averaged together (default 5). Must be at least 1.

    Returns
    -------
    dict
        The configuration values, the partition sizes, and the mean 'accuracy',
        'f1_score', 'precision', 'recall' and 'specificity' over the runs.
        Precision, recall and F1 treat '1' as the positive label; specificity is
        the recall of '2 or 3'.
    """
    output = {}
    output['model_names'] = model_names
    output['training_method'] = training_method #random_sampling, active_learning
    output['balance'] = balance
    output['sampling_size'] = sampling_size
    output['sort_by_time'] = sort_by_time
    output['partition_ratio'] = partition_ratio
    accuracy_lst, f1_lst, precision_lst, recall_lst, specificity_lst = [], [], [], [], []

    for i in range(n_runs):
        # 1. Split the data by class
        df_1, df_2_3 = df[df.Bucket=='1'], df[df.Bucket=='2 or 3']

        # 1.1 Optionally balance the label distribution (50% Bucket 1 vs. 50% Non-Bucket 1);
        #     otherwise the natural label distribution is kept
        if balance:
            sample_size = min(df_1.shape[0], df_2_3.shape[0])
            if df_1.shape[0] > sample_size:
                df_1 = resample(df_1, replace=False, n_samples=sample_size, random_state=random_state)
            if df_2_3.shape[0] > sample_size:
                df_2_3 = resample(df_2_3, replace=False, n_samples=sample_size, random_state=random_state)

        # 1.2 Partition each class separately so all three partitions share the class mix
        seed_1, unlabeled_1, test_1 = partition_data(df_1, partition_ratio, sort_by_time)
        seed_2_3, unlabeled_2_3, test_2_3 = partition_data(df_2_3, partition_ratio, sort_by_time)
        seed, unlabeled, test = pd.concat([seed_1, seed_2_3]), pd.concat([unlabeled_1, unlabeled_2_3]), pd.concat([test_1, test_2_3])
        output['seed_size'], output['unlabeled_size'], output['test_size'] = seed.shape[0], unlabeled.shape[0], test.shape[0]

        # Work on copies: active_learning adds columns to and reorders the unlabeled pool
        initial_seed = seed.copy()
        initial_unlabeled = unlabeled.copy()

        # 2. Train the committee on the seed; its internal validation accuracy is not reported
        initial_pipelines, _ = train_model(initial_seed)

        # 3. Optionally enlarge the training set
        if sampling_size == 0:
            pipelines = initial_pipelines

        # 3.1 Initial Model + Random Sampling
        elif training_method == 'random_sampling':
            # Sample with replacement only when the pool is smaller than the request;
            # seeding with the run number gives each run a different draw
            pool_too_small = initial_unlabeled.shape[0] < sampling_size
            sample_unlabeled = initial_unlabeled.sample(n=sampling_size, replace=pool_too_small, random_state=i)
            seed_and_sample_unlabeled_df = pd.concat([initial_seed, sample_unlabeled])
            pipelines, _ = train_model(seed_and_sample_unlabeled_df)

        # 3.2 Initial Model + Active Learning
        else:
            pipelines, _ = active_learning(initial_pipelines, initial_seed, initial_unlabeled, sampling_size)

        # 4. Score the committee's majority vote on the test partition
        X_test, Y_test = test[['text_cleaned']], test[['Bucket']]

        # Rows = test samples, columns = committee members
        all_preds = np.array([pl.predict(X_test) for pl in pipelines]).transpose()
        prediction = [mode(votes) for votes in all_preds]

        y_true = np.array(Y_test)
        accuracy = round(accuracy_score(Y_test, prediction), round_number)
        f1 = round(f1_score(y_true, prediction, pos_label='1'), round_number)
        precision = round(precision_score(y_true, prediction, pos_label='1', average='binary'), round_number)
        recall = round(recall_score(y_true, prediction, pos_label='1', average='binary'), round_number)
        specificity = round(recall_score(y_true, prediction, pos_label='2 or 3', average='binary'), round_number)

        accuracy_lst.append(accuracy)
        f1_lst.append(f1)
        precision_lst.append(precision)
        recall_lst.append(recall)
        specificity_lst.append(specificity)

    output['accuracy'] = np.mean(accuracy_lst)
    output['f1_score'] = np.mean(f1_lst)
    output['precision'] = np.mean(precision_lst)
    output['recall'] = np.mean(recall_lst)
    output['specificity'] = np.mean(specificity_lst)
    return output

In [16]:
# Experiment grid: the driver cell below calls trial() once per combination of these values.
training_method = ['random_sampling', 'active_learning']
balanced = [True, False]
# Number of rows moved from the unlabeled pool into the training set.
# With 0, trial() evaluates the seed-only model and ignores training_method.
sampling_size = [0, 200, 400, 600]
sort_by_time = [True, False]
# Fractions per partition; partition_data reads the first two (seed, unlabeled)
# and assigns whatever remains to the test slice.
partition_ratio = [[0.1, 0.45, 0.45], [0.5, 0.25, 0.25], [0.9, 0.05, 0.05]]

In [17]:
# Run every configuration in the experiment grid and collect one result row per configuration.
model_name = "SVC, KNN, Decision Tree, Random Forest, AdaBoost"
result_records = []
index = 1
for tm in training_method:
    for b in balanced:
        for ss in sampling_size:
            for t in sort_by_time:
                for r in partition_ratio:
                    print(index)  # progress: 1-based number of the configuration being run
                    result_records.append(trial(df, model_name, tm, b, ss, t, r))
                    index += 1

# Build the frame once from the collected dicts (DataFrame.append no longer exists
# in current pandas, and growing a frame row by row is quadratic). Rows are numbered from 1.
model_result_df = pd.DataFrame(result_records, index=range(1, len(result_records) + 1))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96


In [18]:
# Display the collected results; the rendered output elides the middle rows.
model_result_df

Unnamed: 0,model_names,training_method,balance,sampling_size,sort_by_time,partition_ratio,seed_size,unlabeled_size,test_size,accuracy,f1_score,precision,recall,specificity
1,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.1, 0.45, 0.45]",412,1854,1858,0.5702,0.5470,0.5778,0.5196,0.6204
2,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.5, 0.25, 0.25]",2062,1030,1032,0.7346,0.7440,0.7190,0.7712,0.6980
3,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.9, 0.05, 0.05]",3710,206,208,0.7644,0.7846,0.7232,0.8578,0.6712
4,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,False,"[0.1, 0.45, 0.45]",412,1854,1858,0.6940,0.6462,0.7654,0.5596,0.8286
5,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,False,"[0.5, 0.25, 0.25]",2062,1030,1032,0.7576,0.7434,0.7886,0.7030,0.8116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",active_learning,False,600,True,"[0.5, 0.25, 0.25]",5073,2536,2538,0.8314,0.9036,0.8284,0.9942,0.1930
93,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",active_learning,False,600,True,"[0.9, 0.05, 0.05]",9131,507,509,0.8230,0.9000,0.8190,0.9980,0.1440
94,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",active_learning,False,600,False,"[0.1, 0.45, 0.45]",1014,4565,4568,0.8160,0.8946,0.8220,0.9814,0.1670
95,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",active_learning,False,600,False,"[0.5, 0.25, 0.25]",5073,2536,2538,0.8446,0.9098,0.8470,0.9824,0.3050


In [19]:
# Write the results table to a CSV file in the current working directory.
model_result_df.to_csv('bucket_committee_model_result.csv')  