In [1]:
import re
import math
import unidecode
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statistics import mode
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.utils import resample
from sklearn.metrics import plot_confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
warnings.filterwarnings('ignore')

In [2]:
df_2022 = pd.read_csv('../data/tweet_data_2022.csv')
df_2023 = pd.read_csv('../data/tweet_data_2023.csv')

### Data Preprocessing Functions

In [3]:
round_number = 3
random_state = 42
categories = ['SentimentScore']

In [4]:
def standardize_sent(sent):
    if ((sent == 0) | (sent == 1) | (sent == 2)):
        return 'Negative'
    elif (sent == 3):
        return 'Neutral'
    else:
        return 'Positive'

In [5]:
def clean_text(text):
    if type(text) == np.float:
        return ""
    temp = text.lower() # to lower case
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp) # remove @s
    temp = re.sub("#[A-Za-z0-9_]+","", temp) # remove hashtags
    temp = re.sub(r'http\S+', '', temp) # remove links
    temp = re.sub(r"www.\S+", "", temp) # remove links
    temp = re.sub(r'\n|[^a-zA-Z]', ' ', temp) # remove punctuation
    temp = temp.replace("\n", " ").split()
    temp = [w for w in temp if not w in stopwords_] # remove stopwords
    temp = [w for w in temp if not w.isdigit()] # remove numbers
    temp = [unidecode.unidecode(w) for w in temp] # turn non-enlish letters to english letters
    temp = " ".join(word for word in temp)
    return temp

In [6]:
def partition_data(df, ratio, time):
    #partiton
    if time:
        df.sort_values(by=['date'], inplace=True)
        df.reset_index(drop=True, inplace=True)
    df_rows = df.shape[0]
    seed_num = math.floor(df_rows * ratio[0])
    seed = df[:seed_num]
    unlabeled_num = seed_num + (math.floor(df_rows * ratio[1]))
    unlabeled = df[seed_num:unlabeled_num]
    test = df[unlabeled_num:]
    return seed, unlabeled, test

In [7]:
def train_model(seed):
    cv = 5
    train, test = train_test_split(seed, random_state=random_state, test_size=0.2, shuffle=True)
    X_train, X_test, Y_train, Y_test = train[['text_cleaned']], test[['text_cleaned']], train[['SentimentScore']], test[['SentimentScore']]
    #Wrap in ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("tf", CountVectorizer(stop_words=stopwords_), 'text_cleaned'),
            ("tfidf", TfidfVectorizer(stop_words=stopwords_), 'text_cleaned')]
    )
    #Define the model
    model_lst = [
                SVC(),
                KNeighborsClassifier(),
                DecisionTreeClassifier(),
                RandomForestClassifier(),
                AdaBoostClassifier(),
#                  SVC(kernel='rbf', probability=True, random_state=random_state), #SVC RBF
#                  SVC(kernel='poly', probability=True, degree=4, random_state=random_state), #SVC Poly
#                  BernoulliNB(fit_prior=True, class_prior=None), #Naive Bayes
#                  RandomForestClassifier(random_state=random_state), #Random Forest
#                  LogisticRegression(solver='sag', random_state=random_state), #Logistic Regression (Ridge)
#                  LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=random_state), #Logistic Regression (Lasso)
            ]
#     model = model_lst[model_index]
    
    pl_preds = []
    for model in model_lst:
        #Build the pipeline
        pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('clf', OneVsRestClassifier(model, n_jobs=1)),
                ])
        #Train the model
        pipeline.fit(X_train, Y_train)
        # compute the testing accuracy
        prediction = pipeline.predict(pd.DataFrame(X_test))
        pl_preds.append([pipeline, prediction])
        
    #Saves all the model pipelines
    pipelines = [x[0] for x in pl_preds]
    #Saves all the model predictions
    all_preds = np.array([x[1] for x in pl_preds]).transpose()
    #Find the mode in all preds
    final_preds = [mode(i) for i in all_preds]
    accuracy = accuracy_score(Y_test,final_preds)
    return pipelines, accuracy

In [8]:
def calc_entropy(lst):
    unique_num = list(set(lst))
    entropy = 0
    for i in range(len(unique_num)):
        label = unique_num[i]
        prob = sum(np.array(lst) == label)/len(lst)
        entropy += prob * math.log2(1/prob)
    return entropy

In [9]:
def choose_unlabeled(pipelines, unlabeled):
    unlabeled_x = unlabeled[['text_cleaned']]
    unlabeled_y = unlabeled[['SentimentScore']]
    all_preds = np.array([pl.predict(unlabeled_x) for pl in pipelines]).transpose()
    unlabeled['all_preds'] = list(all_preds)
    unlabeled['entropy'] = unlabeled['all_preds'].apply(calc_entropy)
    unlabeled.sort_values(by=['entropy'], ascending=False, inplace=True)

In [10]:
def active_learning(pipelines, seed, unlabeled, instances):
    # Sort the unlabeled data based on informativeness level
    choose_unlabeled(pipelines, unlabeled)
    # Update the unlabeled data and the info_data
    info_data, unlabeled = unlabeled.iloc[:instances], unlabeled.iloc[instances:]
    # Add selected data to the training set
    seed = pd.concat([seed, info_data[['date', 'text', 'SentimentScore', 'text_cleaned']]])
    pipelines, accuracy = train_model(seed)
    return pipelines, accuracy

## Data Preprocessing

In [11]:
original_stopwords = stopwords.words('english')
additional_stopwords = ['none']
original_stopwords.extend(additional_stopwords)
stopwords_ = set(original_stopwords)

#Selects only the tweets about China
df = df_2022[df_2022['country']=='China']
df = df[['date', 'text', 'id', 'Bucket', 'SentimentScore']]

#Shuffle the data
df = df.sample(frac=1, replace=False, random_state=1) 
df.reset_index(drop=True, inplace=True)

#Step 1: Remove tweets that do not have sentiment score
#Step 2: Average the sentiment score for each unique tweet
df = df.copy()[['date', 'text', 'id', 'SentimentScore']]
df.dropna(subset=['SentimentScore'], inplace=True)

df = pd.DataFrame(df.groupby(['date', 'text', 'id'])['SentimentScore'].mean())
df.reset_index(inplace=True)

#Remove ambiguous labels
range_lst = [0, 1, 2, 3, 4, 5]
df = df[df['SentimentScore'].apply(lambda x: True if x in range_lst else False)]
df['SentimentScore'] = df['SentimentScore'].apply(standardize_sent)

#Remove duplicates
df = df.drop_duplicates(subset=['id']).reset_index(drop=True)
df = df[['date', 'text', 'SentimentScore']]
df["text_cleaned"] = [clean_text(t) for t in df["text"]]

In [12]:
def trial(df, model_names, training_method, balance, sampling_size, sort_by_time, partition_ratio):
    output = {}
    output['model_names'] = model_names
    output['training_method'] = training_method #random_sampling, active_learning
    output['balance'] = balance
    output['sampling_size'] = sampling_size
    output['sort_by_time'] = sort_by_time
    output['partition_ratio'] = partition_ratio
        
    # 1. Balance dataset 
    df_1, df_2, df_3 = df[df.SentimentScore=='Negative'], df[df.SentimentScore=='Neutral'], df[df.SentimentScore=='Positive']

    # 1.1 Balance the label distribution  (33% Negative vs. 33% Neutral vs. 33% Positive)
    if balance:
        sample_size = min(df_1.shape[0], df_2.shape[0], df_3.shape[0])
        if df_1.shape[0] > sample_size:
            df_1 = resample(df_1, replace=False, n_samples=sample_size, random_state=random_state)
        if df_2.shape[0] > sample_size:
            df_2 = resample(df_2, replace=False, n_samples=sample_size, random_state=random_state)
        if df_3.shape[0] > sample_size:
            df_3 = resample(df_3, replace=False, n_samples=sample_size, random_state=random_state)

    # 1.2 Keep the natural label distribution
    seed_1, unlabeled_1, test_1 = partition_data(df_1, partition_ratio, sort_by_time)
    seed_2, unlabeled_2, test_2 = partition_data(df_2, partition_ratio, sort_by_time)
    seed_3, unlabeled_3, test_3 = partition_data(df_3, partition_ratio, sort_by_time)
    seed, unlabeled, test = pd.concat([seed_1, seed_2, seed_3]), pd.concat([unlabeled_1, unlabeled_2, unlabeled_3]), pd.concat([test_1, test_2, test_3])
    output['seed_size'], output['unlabeled_size'], output['test_size'] = seed.shape[0], unlabeled.shape[0], test.shape[0]
    
    initial_seed = seed.copy()
    initial_unlabeled = unlabeled.copy()
    
    # 2. Train the model
    initial_pipelines, initial_accuracy = train_model(initial_seed)
    
    # 3. Active Learning
    if sampling_size == 0:
        pipelines, accuracy = initial_pipelines, initial_accuracy
        
    # 3.1 Initial Model + Random Sampling
    elif training_method == 'random_sampling':
        if initial_unlabeled.shape[0] >= sampling_size:
            sample_unlabeled = initial_unlabeled.sample(n=sampling_size, replace=False, random_state=random_state)
        else:
            sample_unlabeled = initial_unlabeled.sample(n=sampling_size, replace=True, random_state=random_state)
        seed_and_sample_unlabeled_df = pd.concat([initial_seed, sample_unlabeled])
        pipelines, accuracy = train_model(seed_and_sample_unlabeled_df)
        
    # 3.2 Initial Model + Active Learning
    else:
        pipelines, accuracy = active_learning(initial_pipelines, initial_seed, initial_unlabeled, sampling_size)

    # 4. Report Model Accuracy
    X_test, Y_test = test[['text_cleaned']], test[['SentimentScore']]

    pl_preds = []
    for pl in pipelines:
        # compute the testing accuracy
        prediction = pl.predict(pd.DataFrame(X_test))
        pl_preds.append([pl, prediction])
        
    #Saves all the model predictions
    all_preds = np.array([x[1] for x in pl_preds]).transpose()
    #Find the mode in all preds
    prediction = [mode(i) for i in all_preds]
    accuracy = round(accuracy_score(Y_test, prediction), round_number)
    f1_micro = round(f1_score(np.array(Y_test), prediction, average='micro'), round_number)
    f1_macro = round(f1_score(np.array(Y_test), prediction, average='macro'), round_number)
    f1_weighted = round(f1_score(np.array(Y_test), prediction, average='weighted'), round_number)
    
    precision_micro = round(precision_score(np.array(Y_test), prediction, average='micro'), round_number)
    precision_macro = round(precision_score(np.array(Y_test), prediction, average='macro'), round_number)
    precision_weighted = round(precision_score(np.array(Y_test), prediction, average='weighted'), round_number)
        
    recall_micro = round(recall_score(np.array(Y_test), prediction, average='micro'), round_number)
    recall_macro = round(recall_score(np.array(Y_test), prediction, average='macro'), round_number)
    recall_weighted = round(recall_score(np.array(Y_test), prediction, average='weighted'), round_number)
    
    output['accuracy'] = accuracy
    output['f1_micro'], output['f1_macro'], output['f1_weighted'] = f1_micro, f1_macro, f1_weighted
    output['precision_micro'], output['precision_macro'], output['precision_weighted'] = precision_micro, precision_macro, precision_weighted
    output['recall_micro'], output['recall_macro'], output['recall_weighted'] = recall_micro, recall_macro, recall_weighted
    return output

In [13]:
training_method = ['random_sampling', 'active_learning']
balanced = [True, False]
sampling_size = [0, 50, 100, 300, 600]
sort_by_time = [True, False]
partition_ratio = [[0.5, 0.25, 0.25], [0.6, 0.2, 0.2], [0.8, 0.1, 0.1]]



# training_method = ['active_learning']
# balanced = [True]
# sampling_size = [10]
# sort_by_time = [False]
# partition_ratio = [[0.8, 0.1, 0.1]]

In [14]:
model_result_df = pd.DataFrame()
index = 1
model_name = "SVC, KNN, Decision Tree, Random Forest, AdaBoost"
for tm in training_method:
    for b in balanced:
        for ss in sampling_size:
            for t in sort_by_time:
                for r in partition_ratio:
                    print(index)
                    model_output = trial(df, model_name, tm, b, ss, t, r)
                    if index == 0:
                        model_result_df = pd.DataFrame(model_output, index=index)
                    else:
                        model_result_df = model_result_df.append(pd.DataFrame([model_output],index=[index]))
                    index += 1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120


In [15]:
model_result_df.head()

Unnamed: 0,model_names,training_method,balance,sampling_size,sort_by_time,partition_ratio,seed_size,unlabeled_size,test_size,accuracy,f1_micro,f1_macro,f1_weighted,precision_micro,precision_macro,precision_weighted,recall_micro,recall_macro,recall_weighted
1,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.5, 0.25, 0.25]",207,102,108,0.519,0.519,0.499,0.499,0.519,0.503,0.503,0.519,0.519,0.519
2,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.6, 0.2, 0.2]",249,81,87,0.598,0.598,0.581,0.581,0.598,0.586,0.586,0.598,0.598,0.598
3,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.8, 0.1, 0.1]",333,39,45,0.533,0.533,0.482,0.482,0.533,0.572,0.572,0.533,0.533,0.533
4,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,False,"[0.5, 0.25, 0.25]",207,102,108,0.528,0.528,0.499,0.499,0.528,0.489,0.489,0.528,0.528,0.528
5,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,False,"[0.6, 0.2, 0.2]",249,81,87,0.54,0.54,0.525,0.525,0.54,0.522,0.522,0.54,0.54,0.54


In [16]:
model_result_df.to_csv('sentiment_committee_model_result.csv')  

In [17]:
model_result_df.shape

(120, 19)

In [18]:
model_result_df

Unnamed: 0,model_names,training_method,balance,sampling_size,sort_by_time,partition_ratio,seed_size,unlabeled_size,test_size,accuracy,f1_micro,f1_macro,f1_weighted,precision_micro,precision_macro,precision_weighted,recall_micro,recall_macro,recall_weighted
1,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.5, 0.25, 0.25]",207,102,108,0.519,0.519,0.499,0.499,0.519,0.503,0.503,0.519,0.519,0.519
2,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.6, 0.2, 0.2]",249,81,87,0.598,0.598,0.581,0.581,0.598,0.586,0.586,0.598,0.598,0.598
3,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,True,"[0.8, 0.1, 0.1]",333,39,45,0.533,0.533,0.482,0.482,0.533,0.572,0.572,0.533,0.533,0.533
4,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,False,"[0.5, 0.25, 0.25]",207,102,108,0.528,0.528,0.499,0.499,0.528,0.489,0.489,0.528,0.528,0.528
5,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",random_sampling,True,0,False,"[0.6, 0.2, 0.2]",249,81,87,0.540,0.540,0.525,0.525,0.540,0.522,0.522,0.540,0.540,0.540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",active_learning,False,600,True,"[0.6, 0.2, 0.2]",4418,1471,1476,0.959,0.959,0.346,0.939,0.959,0.653,0.941,0.959,0.343,0.959
117,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",active_learning,False,600,True,"[0.8, 0.1, 0.1]",5891,735,739,0.958,0.958,0.363,0.939,0.958,0.653,0.940,0.958,0.353,0.958
118,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",active_learning,False,600,False,"[0.5, 0.25, 0.25]",3682,1840,1843,0.959,0.959,0.342,0.939,0.959,0.653,0.941,0.959,0.341,0.959
119,"SVC, KNN, Decision Tree, Random Forest, AdaBoost",active_learning,False,600,False,"[0.6, 0.2, 0.2]",4418,1471,1476,0.959,0.959,0.346,0.939,0.959,0.653,0.941,0.959,0.343,0.959
