In [1]:
import pandas as pd
biden = pd.read_csv('Joe_Biden.csv')
trump = pd.read_csv('Donald_Trump.csv')

In [2]:

# hand annotation indicies
# Biden
positive_biden = [24, 27, 28, 35, 47, 48, 51, 57, 63, 72, 86, 98, 173, 198, 219, 268, 343, 358, 369, 420]

negative_biden = [374,156,106,85,171,172,174,186,192,222,229,235,110,6,277,313,356,368,49]

neutral_biden = [150,445,161,183,227,250,29,159,240,260,283,305,325,332,341,357,370,380,398,402]

# Trump
positive_trump = [447,408, 395, 389, 377, 374, 372, 320, 321, 323, 324, 326, 344, 346, 348, 349, 355, 106, 338, 419]

negative_trump = [448, 396, 394, 1, 2, 3, 6, 7, 9, 10, 11, 12, 17, 18, 100, 332, 120, 29, 30, 36, 48]

neutral_trump = [443, 432, 418, 398, 373, 365, 364, 4, 5, 8, 14, 16, 19, 253, 300, 310, 325, 328, 341, 352]


In [3]:
# create new col for sentiment

biden['sentiment'] = ""

trump['sentiment'] = ""

In [4]:
# populate rows in new col with corresponding sentiment
# biden
for i in biden.index:
    for j in positive_biden:
        if i == j:
            biden.at[i,'sentiment'] = 'pos'
            
for i in biden.index:
    for j in neutral_biden:
        if i == j:
            biden.at[i,'sentiment'] = 'neutral'
            
for i in biden.index:
    for j in negative_biden:
        if i == j:
            biden.at[i,'sentiment'] = 'neg'

            
# Trump           
for i in trump.index:
    for j in positive_trump:
        if i == j:
            trump.at[i,'sentiment'] = 'pos'
            
for i in trump.index:
    for j in neutral_trump:
        if i == j:
            trump.at[i,'sentiment'] = 'neutral'
            
for i in trump.index:
    for j in negative_trump:
        if i == j:
            trump.at[i,'sentiment'] = 'neg'

In [5]:
# Data
import numpy as np
import pandas as pd

# NLP
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Modeling
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report, confusion_matrix
from sklearn.utils import shuffle
from sklearn import svm

# plot
import matplotlib.pyplot as plt

ps = PorterStemmer()
stopwords_english = set(stopwords.words('english'))

In [6]:
# select text and news company names
biden_sentiment = biden[['text', 'sentiment']]

trump_sentiment = trump[['text', 'sentiment']]

In [7]:
# define a function to create corpus for each sentiment

def create_corpus(biden_sentiment, trump_sentiment, sentiment_name):
    
    df1 = biden_sentiment.loc[biden_sentiment['sentiment'] == sentiment_name]
    df2 = trump_sentiment.loc[trump_sentiment['sentiment'] == sentiment_name]
    #df3 = Trump_news.loc[Trump_news['media'] == media_name]
    frames = [df1, df2]
    df = pd.concat(frames, ignore_index = True)
    
    return df

In [8]:
# create small corpus for each sentiment
POS = create_corpus(biden_sentiment, trump_sentiment, sentiment_name = 'pos')
NEG = create_corpus(biden_sentiment, trump_sentiment, sentiment_name = 'neg')
NEUTRAL = create_corpus(biden_sentiment, trump_sentiment, sentiment_name = 'neutral')

In [9]:
corpus_All_sentiment = pd.concat([POS, NEG, NEUTRAL], axis = 0, ignore_index = True)
corpus_All_sentiment.head()

Unnamed: 0,text,sentiment
0,Hide highlightingFull TextTranslateUndo Transl...,pos
1,Hide highlightingFull TextTranslateUndo Transl...,pos
2,Hide highlightingAbstractTranslateUndo Transla...,pos
3,Hide highlightingFull TextTranslateUndo Transl...,pos
4,Hide highlightingFull TextTranslateUndo Transl...,pos


# 2. Data Preprocessing

In [10]:
def Data_Preprocessing(corpus):
    # convert string to list i.e. ['hide', 'highlightingfull', '[[missing']
    corpus['text'] = corpus['text'].str.split()

    # lower case each item in the list, and remove non-alphabetic characters i.e. ['hide', 'highlightingfull', 'missing']
    corpus['text'] = corpus['text'].apply(lambda x: [re.sub(r'[^a-zA-Z]', "",y.lower()) for y in x])

    # join the item in the list back to a string and replace keywords containing the target names
#     keywords = ['new york times', 'the new york times', 'international new york times'
#                 "the washington post", "WP Company LLC", "washpostcom",
#                 'wall street journal', 'thomaswsjcom', 'Dow Jones Company Inc.']
    corpus['text'] = corpus['text'].apply(lambda x: [' '.join(x)])

    # stem each word in the text
    corpus['text'] = corpus['text'].apply(lambda x: str(x[0]))
    corpus['text'] = corpus['text'].str.split()
    corpus['text'] = corpus['text'].apply(lambda x: [ps.stem(y) for y in x])

    # join the item in the list back to a string
    corpus['text'] = corpus['text'].apply(lambda x: [' '.join(x)])

    # convert list to a string
    corpus['text'] = corpus['text'].apply(lambda x: str(x[0]))

    print(type(corpus.iloc[0]['text']))
    
    return corpus

In [11]:
processed_sentiment_corpus = Data_Preprocessing(corpus_All_sentiment)
processed_sentiment_corpus.head(2)

<class 'str'>


Unnamed: 0,text,sentiment
0,hide highlightingful texttranslateundo transla...,pos
1,hide highlightingful texttranslateundo transla...,pos


# 3. Modeling

## 3.1 Split training and test sets

In [12]:
# separate features and targets
X = processed_sentiment_corpus.iloc[:, 0]
y = processed_sentiment_corpus.iloc[:, 1]

# split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)
X_train, y_train = shuffle(X_train, y_train)

X_train.head(2)

90    hide highlightingful texttranslateundo transla...
48    full texttranslateundo translat fromtotranslat...
Name: text, dtype: object

In [13]:
le = LabelEncoder()
# get label name mapping
le.fit(y_train)
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

# encode the target 
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

{'neg': 0, 'neutral': 1, 'pos': 2}


## 3.2 Getting document term matrices

### 3.2.1 Create matrix of token counts using unigram, bigram and trigram tokens

In [14]:
# define a function to get unigram, bigram, and trigram matrix of token counts

def get_DTM(Ngram_range, x_train, x_test):
    vectorizer = CountVectorizer(stop_words='english', min_df = int(3), max_df = 0.5, 
                                 ngram_range = Ngram_range, binary=True) 
    vectorizer.fit(x_train)
    trans_x_train = vectorizer.transform(x_train)
    trans_x_test = vectorizer.transform(x_test)
    
    return trans_x_train, trans_x_test

In [15]:
# unigram token counts matrix
binary1_train, binary1_test = get_DTM(Ngram_range = (1, 1), x_train = X_train, x_test = X_test)

# bigram token counts matrix
binary2_train, binary2_test = get_DTM(Ngram_range = (1, 2), x_train = X_train, x_test = X_test)

# trigram token counts matrix
binary3_train, binary3_test = get_DTM(Ngram_range = (1, 3), x_train = X_train, x_test = X_test)

In [16]:
print("The unique terms in binary1_train is:", binary1_train.toarray().shape[1])
print("The unique terms in binary2_train is:", binary2_train.toarray().shape[1])
print("The unique terms in binary3_train is:", binary3_train.toarray().shape[1])

The unique terms in binary1_train is: 2579
The unique terms in binary2_train is: 4988
The unique terms in binary3_train is: 5735


### 3.2.2 Create DTM using unigram, bigram and trigram term frequency

In [17]:
# define a function to get unigram, bigram, and trigram term frequency matrix

def get_TF_DTM(Ngram_range, x_train, x_test):
    vectorizer = CountVectorizer(stop_words='english', min_df = int(3), max_df = 0.5, ngram_range = Ngram_range) 
    vectorizer.fit(x_train)
    trans_x_train = vectorizer.transform(x_train)
    trans_x_test = vectorizer.transform(x_test)
    
    return trans_x_train, trans_x_test

In [18]:
# unigram tf matrix
tf1_train, tf1_test = get_TF_DTM(Ngram_range = (1, 1), x_train = X_train, x_test = X_test)

# bigram tf matrix
tf2_train, tf2_test = get_TF_DTM(Ngram_range = (1, 2), x_train = X_train, x_test = X_test)

# trigram tf matrix
tf3_train, tf3_test = get_TF_DTM(Ngram_range = (1, 3), x_train = X_train, x_test = X_test)

In [19]:
print("The unique terms in tf1_train is:", tf1_train.toarray().shape[1])
print("The unique terms in tf2_train is:", tf2_train.toarray().shape[1])
print("The unique terms in tf3_train is:", tf3_train.toarray().shape[1])

The unique terms in tf1_train is: 2579
The unique terms in tf2_train is: 4988
The unique terms in tf3_train is: 5735


### 3.2.3 Create DTM using unigram, bigram and trigram TF-IDF

In [20]:
# define a function to get unigram, bigram, and trigram TF-IDF matrix

def get_TF_IDF_DTM(Ngram_range, x_train, x_test):
    vectorizer = TfidfVectorizer(stop_words='english', min_df = int(3), max_df = 0.5, 
                                 ngram_range = Ngram_range) 
    vectorizer.fit(x_train)
    trans_x_train = vectorizer.transform(x_train)
    trans_x_test = vectorizer.transform(x_test)
    
    return trans_x_train, trans_x_test

In [21]:

# unigram tf-idf matrix
tfidf1_train, tfidf1_test = get_TF_IDF_DTM(Ngram_range = (1, 1), x_train = X_train, x_test = X_test)

# bigram tf-idf matrix
tfidf2_train, tfidf2_test = get_TF_IDF_DTM(Ngram_range = (1, 2), x_train = X_train, x_test = X_test)

# trigram tf-idf matrix
tfidf3_train, tfidf3_test = get_TF_IDF_DTM(Ngram_range = (1, 3), x_train = X_train, x_test = X_test)

In [22]:
print("The unique terms in tfidf1_train is:", tfidf1_train.toarray().shape[1])
print("The unique terms in tfidf2_train is:", tfidf2_train.toarray().shape[1])
print("The unique terms in tfidf3_train is:", tfidf3_train.toarray().shape[1])

The unique terms in tfidf1_train is: 2579
The unique terms in tfidf2_train is: 4988
The unique terms in tfidf3_train is: 5735


## 3.3 Modeling

### 3.3.1 XGBoost Classifier

In [23]:
# define model training
def train_model(clf, dtm, test):
    # train data
    clf.fit(dtm, y_train)
    
    # Predicting on the test set
    preds = clf.predict(test)
    
    # print evaluation matrix
    print("Accuracy:", '{:1.4f}'.format(accuracy_score(y_test, preds)))
    print("")
    print(classification_report(y_test, preds))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, preds))
    
    return '{:1.4f}'.format(accuracy_score(y_test, preds))

In [24]:
# Use Naive Bayes
#clf = XGBClassifier() #MultinomialNB()
#clf = svm.SVC(gamma = 'scale', C = 1.0)
param = {'max_depth': 3, 'eta': 0.3, 'objective':'multi:softmax', 'num_class': 3}
xgb_clf = XGBClassifier(param)
svm_clf = svm.SVC(gamma = 'scale', C = 1.0)
# reference: https://medium.com/@gabrielziegler3/multiclass-multilabel-classification-with-xgboost-66195e4d9f2d
# reference: https://xgboost.readthedocs.io/en/latest/parameter.html

# Model Configurations
binary1 = ("unigram, binary", binary1_train, binary1_test)
binary2 = ("bigram, binary",  binary2_train, binary2_test)
binary3 = ("trigram, binary", binary3_train, binary3_test)
tf1 = ("unigram, TF", tf1_train, tf1_test)
tf2 = ("bigram, TF",  tf2_train, tf2_test)
tf3 = ("trigram, TF", tf3_train, tf3_test)
tfidf1 = ("unigram, TF-IDF", tfidf1_train, tfidf1_test)
tfidf2 = ("bigram, TF-IDF",  tfidf2_train, tfidf2_test)
tfidf3 = ("trigram, TF-IDF", tfidf3_train, tfidf3_test)
DTMs = [binary1, binary2, binary3,
        tf1, tf2, tf3,
        tfidf1, tfidf2, tfidf3]

df = pd.DataFrame({"config": [],
                   "accuracy": []})
best_config = ["Best Configuration", "none", 0, "none", "none"]
for data in DTMs:
    print(data[0])
    print("")
    score = train_model(clf = xgb_clf, dtm = data[1], test = data[2])
    print("======================================================")
    print("")
    if float(score) > float(best_config[2]):
        best_config = ["Best Configuration:", data[0], score, data[1], data[2]]
    df = df.append({"config": data[0],
               "accuracy": float(score)},
               ignore_index=True)

unigram, binary



XGBoostError: Invalid Parameter format for max_depth expect int but value='{'max_depth': 3, 'eta': 0.3, 'objective': 'multi:softmax', 'num_class': 3}'

### 3.3.2 SVM Classifier

In [25]:
# svm classifier
df_svm = pd.DataFrame({"config": [],
                   "accuracy": []})
best_config_svm = ["Best Configuration", "none", 0, "none", "none"]
for data in DTMs:
    print(data[0])
    print("")
    score = train_model(clf = svm_clf, dtm = data[1], test = data[2])
    print("======================================================")
    print("")
    if float(score) > float(best_config_svm[2]):
        best_config_svm = ["Best Configuration:", data[0], score, data[1], data[2]]
    df_svm = df_svm.append({"config": data[0],
               "accuracy": float(score)},
               ignore_index=True)

unigram, binary

Accuracy: 0.5417

              precision    recall  f1-score   support

           0       0.70      0.88      0.78         8
           1       0.50      0.25      0.33         8
           2       0.40      0.50      0.44         8

    accuracy                           0.54        24
   macro avg       0.53      0.54      0.52        24
weighted avg       0.53      0.54      0.52        24

Confusion Matrix:
[[7 0 1]
 [1 2 5]
 [2 2 4]]

bigram, binary

Accuracy: 0.5833

              precision    recall  f1-score   support

           0       0.67      0.75      0.71         8
           1       0.50      0.50      0.50         8
           2       0.57      0.50      0.53         8

    accuracy                           0.58        24
   macro avg       0.58      0.58      0.58        24
weighted avg       0.58      0.58      0.58        24

Confusion Matrix:
[[6 2 0]
 [1 4 3]
 [2 2 4]]

trigram, binary

Accuracy: 0.5833

              precision    recall  f1-sc

In [26]:
# results of svm classifier
df_svm

Unnamed: 0,config,accuracy
0,"unigram, binary",0.5417
1,"bigram, binary",0.5833
2,"trigram, binary",0.5833
3,"unigram, TF",0.4583
4,"bigram, TF",0.5417
5,"trigram, TF",0.5417
6,"unigram, TF-IDF",0.625
7,"bigram, TF-IDF",0.7083
8,"trigram, TF-IDF",0.6667


In [27]:
# best model
print(best_config_svm)

['Best Configuration:', 'bigram, TF-IDF', '0.7083', <96x4988 sparse matrix of type '<class 'numpy.float64'>'
	with 38749 stored elements in Compressed Sparse Row format>, <24x4988 sparse matrix of type '<class 'numpy.float64'>'
	with 9099 stored elements in Compressed Sparse Row format>]
