In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime
import emoji
import re
import string
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import unidecode
from sklearn.metrics import plot_confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, Lasso
from sklearn.utils import resample
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

### Load Data

In [2]:
df = pd.read_csv("SentimentLabeled_10112022.csv")

In [3]:
#Select tweets about China only
df = df[df['country']=='China']

In [4]:
df = df[['text', 'id', 'Bucket', 'SentimentScore']]

### Bucket Data Preprocessing

In [5]:
#Step 1: Remove tweets that do not have sentiment score
#Step 2: Average the sentiment score for each unique tweet
sent_df = df.copy()[['text', 'id', 'SentimentScore']]
sent_df.dropna(subset=['SentimentScore'], inplace=True)

In [6]:
sent_df = pd.DataFrame(sent_df.groupby(['text', 'id'])['SentimentScore'].mean())
sent_df.reset_index(inplace=True)
sent_df = sent_df[['text', 'SentimentScore']]
sent_df.head(10)

Unnamed: 0,text,SentimentScore
0,"""#NorthKorea is propped up by regimes like Chi...",2.0
1,"""... If China had learned the lessons of Tiena...",1.0
2,"""...Jim Banks, an Indiana Republican, slammed ...",1.5
3,"""Accelerating progress in vaccinating people, ...",2.0
4,"""Although the U.S. is currently ahead of China...",2.0
5,"""American Needs to Invest in Future Tech *Now*...",2.0
6,"""America’s foremost nat-sec threat is China......",1.0
7,"""An important new report from Sen. Marco Rubio...",2.0
8,"""Anyone hoping that China is finally turning a...",1.0
9,"""Between the sanctions, diplomacy &amp; having...",4.0


In [7]:
def extract_symbols(df, col):
    # extract @s
    at_users = []
    for row in range(len(df)):
        matches = re.findall(r'@\w*\b', col[row])
        if len(matches) == 0:
            users = 'NaN'
        else:
            users = [user[1:] for user in matches]
        at_users += [users]
    
    # extract #s
    hashtags = []
    for row in range(len(df)):
        matches = re.findall(r'#\w*\b', col[row])
        if len(matches) == 0:
            tags = 'NaN'
        tags = [tag[1:] for tag in matches ]
        hashtags += [tags]
    
    # extract links
    web_links = []
    for row in range(len(df)):
        matches = re.findall(r'http://\S+|https://\S+', col[row])
        if len(matches) == 0:
            links = 'NaN'
        links = [link for link in matches ]
        web_links += [links]
        
    # extract emojis
    emoji_list = []
    for row in range(len(df)):
        matches = []
        temp = list(col[row])
        for ch in temp:
            if (emoji.is_emoji(ch)):
                matches += [ch]
        if len(matches) == 0:
             emoji_list += ["NaN"]
        else:
            emoji_list += [matches]
    
    return at_users, hashtags, web_links, emoji_list

In [8]:
#Perform label extraction on sentiments
text = sent_df['text'].astype('string').fillna('NA')
user, hashtags, web_links, emoji_list = extract_symbols(sent_df, text)
# store in new columns
sent_df["com_at_users"] = user
sent_df["com_hashtags"] = hashtags
sent_df["com_web_links"] = web_links
sent_df["com_emoji_list"] = emoji_list

In [9]:
original_stopwords = stopwords.words('english')
additional_stopwords = ['none']
original_stopwords.extend(additional_stopwords)
stopwords = set(original_stopwords)

def clean_text(text):
    if type(text) == np.float:
        return ""
    temp = text.lower() # to lower case
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp) # remove @s
    temp = re.sub("#[A-Za-z0-9_]+","", temp) # remove hashtags
    temp = re.sub(r'http\S+', '', temp) # remove links
    temp = re.sub(r"www.\S+", "", temp) # remove links
    temp = re.sub(r'\n|[^a-zA-Z]', ' ', temp) # remove punctuation
    temp = temp.replace("\n", " ").split()
    temp = [w for w in temp if not w in stopwords] # remove stopwords
    temp = [w for w in temp if not w.isdigit()] # remove numbers
    temp = [unidecode.unidecode(w) for w in temp] # turn non-enlish letters to english letters
    temp = " ".join(word for word in temp)
    return temp

# lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [10]:
sent_df["text_cleaned"] = [clean_text(t) for t in sent_df["text"]]
sent_df['lemmatized_text'] = sent_df['text'].apply(lambda text: lemmatize_words(text))
#tokenization
sent_df['tokenized_text'] = sent_df['lemmatized_text'].apply(word_tokenize) 
# stemming
ps = PorterStemmer()
sent_df['stemmed_text'] = sent_df['text_cleaned'].apply(lambda x: " ".join([ps.stem(word) for word in x.split()]))

### Sentiment Analysis Prediction

In [11]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
sent_df = sent_df[['text_cleaned', 'SentimentScore']]
sent_df = sent_df[sent_df['SentimentScore']<5]
sent_df.shape

(9022, 2)

#### Train Test Split dataset

In [12]:
categories = ['SentimentScore']
train, test = train_test_split(sent_df, random_state=42, test_size=0.2, shuffle=True)
X_train = train[['text_cleaned']]
X_test = test[['text_cleaned']]
Y_train = train[['SentimentScore']]
Y_test = test[['SentimentScore']]
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(7217, 1)
(1805, 1)
(7217, 1)
(1805, 1)


In [13]:
# wrap in ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("tf", CountVectorizer(stop_words=stop_words), 'text_cleaned'),
        ("tfidf", TfidfVectorizer(stop_words=stop_words), 'text_cleaned')]
)

In [14]:
random_state_value = 2
model_lst = [RandomForestRegressor(max_depth=10, random_state=random_state_value), #Random Forest
             LinearRegression(), #Linear Regression
             DecisionTreeRegressor(random_state=random_state_value), #Decision Tree
             Ridge(alpha=1.0, random_state=random_state_value), #Ridge
             Lasso(alpha=1.0, random_state=random_state_value) #Lasso
            ]

model_name_lst = ['Random Forest Regressor', 'Linear Regression', 'Decision Tree Regressor', 'Ridge', 'Lasso']

In [15]:
def model_result(model, model_name):
    pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('regressor', model),
            ])

    pipeline.fit(X_train, Y_train)
    prediction = pipeline.predict(pd.DataFrame(X_test))
    mse = mean_squared_error(Y_test, prediction)
    print('The model name is ' + model_name + '.')
    print('MSE is {}'.format(round(mse, 4)))

### Random Forest 

In [16]:
rd, rd_name = model_lst[0], model_name_lst[0]
model_result(rd, rd_name)

The model name is Random Forest Regressor.
MSE is 0.3329


### Linear Regression

In [17]:
lr, lr_name = model_lst[1], model_name_lst[1]
model_result(lr, lr_name)

The model name is Linear Regression.
MSE is 1.1612


### Decision Tree

In [18]:
dt, dt_name = model_lst[2], model_name_lst[2]
model_result(dt, dt_name)

The model name is Decision Tree Regressor.
MSE is 0.5005


### Ridge

In [19]:
ridge, ridge_name = model_lst[3], model_name_lst[3]
model_result(ridge, ridge_name)

The model name is Ridge.
MSE is 0.4073


### Lasso

In [20]:
lasso, lasso_name = model_lst[4], model_name_lst[4]
model_result(lasso, lasso_name)

The model name is Lasso.
MSE is 0.391


### Grid Search

In [21]:
RFR_pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('random_forest', RandomForestRegressor(random_state=2)),
            ])

param_grid = {
    "random_forest__n_estimators": [10, 100, 200],
    "random_forest__max_depth": [1000, None],
    "random_forest__min_samples_split": [1, 2],
    "random_forest__min_samples_leaf": [1, 2],
    "random_forest__min_weight_fraction_leaf": [0, 1],
    "random_forest__warm_start": [True, False],
}

search = GridSearchCV(RFR_pipeline, param_grid, n_jobs=5)
search.fit(X_train, Y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.265):
{'random_forest__max_depth': 1000, 'random_forest__min_samples_leaf': 1, 'random_forest__min_samples_split': 2, 'random_forest__min_weight_fraction_leaf': 0, 'random_forest__n_estimators': 200, 'random_forest__warm_start': True}


In [22]:
RFR_pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('random_forest', RandomForestRegressor(max_depth=1000, 
                                                        min_samples_leaf=1, 
                                                        min_samples_split=2, 
                                                        min_weight_fraction_leaf=0, 
                                                        n_estimators=200, 
                                                        warm_start=True,
                                                        random_state=2)),
            ])

RFR_pipeline.fit(X_train, Y_train)
prediction = RFR_pipeline.predict(pd.DataFrame(X_test))

### Best Model Statistics

In [23]:
mse = round(mean_squared_error(Y_test, prediction), 3)
print('MSE is {}'.format(mse))
r2 = round(r2_score(Y_test, prediction), 3)
print('R2 is {}'.format(r2))

MSE is 0.291
R2 is 0.253
