In [2]:
# Data
import numpy as np
import pandas as pd

# NLP
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
from gensim import utils
import gensim.parsing.preprocessing as gsp
from wordcloud import WordCloud

# Modeling
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report, confusion_matrix
from sklearn.utils import shuffle
from sklearn import svm

# plot
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ps = PorterStemmer()
stopwords_english = set(stopwords.words('english'))

In [3]:
trump = pd.read_csv('/Users/sa/Desktop/NLP_The-2020-Presidential-Race-master/Data/All_Candidates/Donald_Trump.csv')

In [4]:
trump['text'][355]



In [5]:
trump['title'][355]


"Trump's Second-Term Agenda Still on the Drawing Board; The president rarely discusses what he would do if he wins another term, but his advisers have begun to sketch out ideas"

In [6]:
# hand annotation indicies
positive = [320, 321, 323, 324, 326, 344, 346, 348, 349, 355]

negative = [1, 2, 3, 6, 7, 9, 10, 11, 12, 17, 18, 100, 332]

neutral = [0, 4, 5, 8, 13, 14, 15, 16, 19, 300, 310, 325, 328, 341, 352]

In [7]:
# create new col for sentiment

trump['sentiment'] = ""

In [8]:
# populate rows in new col with corresponding sentiment

for i in trump.index:
    for j in positive:
        if i == j:
            trump.at[i,'sentiment'] = 'pos'
            
for i in trump.index:
    for j in neutral:
        if i == j:
            trump.at[i,'sentiment'] = 'neutral'
            
for i in trump.index:
    for j in negative:
        if i == j:
            trump.at[i,'sentiment'] = 'neg'

In [9]:
# select text and news company names
trump_sentiment = trump[['text', 'sentiment']]

In [10]:
# define a function to create corpus for each sentiment

def create_corpus(trump_sentiment, sentiment_name):
    
    df1 = trump_sentiment.loc[trump_sentiment['sentiment'] == sentiment_name]
    #df2 = Sanders_news.loc[Sanders_news['media'] == media_name]
    #df3 = Trump_news.loc[Trump_news['media'] == media_name]
    #frames = [df1, df2, df3]
    #df = pd.concat(frames, ignore_index = True)
    
    return df1

In [11]:
# create small corpus for each sentiment
POS = create_corpus(trump_sentiment, sentiment_name = 'pos')
NEG = create_corpus(trump_sentiment, sentiment_name = 'neg')
NEUTRAL = create_corpus(trump_sentiment, sentiment_name = 'neutral')

In [12]:
corpus_All_sentiment = pd.concat([POS, NEG, NEUTRAL], axis = 0, ignore_index = True)
corpus_All_sentiment.head()

Unnamed: 0,text,sentiment
0,Hide highlightingFull TextTranslateUndo Transl...,pos
1,Hide highlightingFull TextTranslateUndo Transl...,pos
2,Hide highlightingFull TextTranslateUndo Transl...,pos
3,Hide highlightingFull TextTranslateUndo Transl...,pos
4,Hide highlightingFull TextTranslateUndo Transl...,pos


# 2. Data Preprocessing

In [13]:
def Data_Preprocessing(corpus):
    # convert string to list i.e. ['hide', 'highlightingfull', '[[missing']
    corpus['text'] = corpus['text'].str.split()

    # lower case each item in the list, and remove non-alphabetic characters i.e. ['hide', 'highlightingfull', 'missing']
    corpus['text'] = corpus['text'].apply(lambda x: [re.sub(r'[^a-zA-Z]', "",y.lower()) for y in x])

    # join the item in the list back to a string and replace keywords containing the target names
#     keywords = ['new york times', 'the new york times', 'international new york times'
#                 "the washington post", "WP Company LLC", "washpostcom",
#                 'wall street journal', 'thomaswsjcom', 'Dow Jones Company Inc.']
    corpus['text'] = corpus['text'].apply(lambda x: [' '.join(x)])

    # stem each word in the text
    corpus['text'] = corpus['text'].apply(lambda x: str(x[0]))
    corpus['text'] = corpus['text'].str.split()
    corpus['text'] = corpus['text'].apply(lambda x: [ps.stem(y) for y in x])

    # join the item in the list back to a string
    corpus['text'] = corpus['text'].apply(lambda x: [' '.join(x)])

    # convert list to a string
    corpus['text'] = corpus['text'].apply(lambda x: str(x[0]))

    print(type(corpus.iloc[0]['text']))
    
    return corpus

In [14]:
processed_sentiment_corpus = Data_Preprocessing(corpus_All_sentiment)
processed_sentiment_corpus.head(2)

<class 'str'>


Unnamed: 0,text,sentiment
0,hide highlightingful texttranslateundo transla...,pos
1,hide highlightingful texttranslateundo transla...,pos


# 3. Modeling

## 3.1 Split training and test sets

In [15]:
# separate features and targets
X = processed_sentiment_corpus.iloc[:, 0]
y = processed_sentiment_corpus.iloc[:, 1]

# split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)
X_train, y_train = shuffle(X_train, y_train)

X_train.head(2)

12    hide highlightingful texttranslateundo transla...
11    hide highlightingful texttranslateundo transla...
Name: text, dtype: object

In [16]:
le = LabelEncoder()
# get label name mapping
le.fit(y_train)
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

# encode the target 
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

{'neg': 0, 'neutral': 1, 'pos': 2}


## 3.2 Getting document term matrices

### 3.2.1 Create matrix of token counts using unigram, bigram and trigram tokens

In [17]:
# define a function to get unigram, bigram, and trigram matrix of token counts

def get_DTM(Ngram_range, x_train, x_test):
    vectorizer = CountVectorizer(stop_words='english', min_df = int(3), max_df = 0.5, 
                                 ngram_range = Ngram_range, binary=True) 
    vectorizer.fit(x_train)
    trans_x_train = vectorizer.transform(x_train)
    trans_x_test = vectorizer.transform(x_test)
    
    return trans_x_train, trans_x_test

In [18]:
# unigram token counts matrix
binary1_train, binary1_test = get_DTM(Ngram_range = (1, 1), x_train = X_train, x_test = X_test)

# bigram token counts matrix
binary2_train, binary2_test = get_DTM(Ngram_range = (1, 2), x_train = X_train, x_test = X_test)

# trigram token counts matrix
binary3_train, binary3_test = get_DTM(Ngram_range = (1, 3), x_train = X_train, x_test = X_test)

In [19]:
print("The unique terms in binary1_train is:", binary1_train.toarray().shape[1])
print("The unique terms in binary2_train is:", binary2_train.toarray().shape[1])
print("The unique terms in binary3_train is:", binary3_train.toarray().shape[1])

The unique terms in binary1_train is: 1134
The unique terms in binary2_train is: 1623
The unique terms in binary3_train is: 1798


### 3.2.2 Create DTM using unigram, bigram and trigram term frequency

In [20]:
# define a function to get unigram, bigram, and trigram term frequency matrix

def get_TF_DTM(Ngram_range, x_train, x_test):
    vectorizer = CountVectorizer(stop_words='english', min_df = int(3), max_df = 0.5, ngram_range = Ngram_range) 
    vectorizer.fit(x_train)
    trans_x_train = vectorizer.transform(x_train)
    trans_x_test = vectorizer.transform(x_test)
    
    return trans_x_train, trans_x_test

In [21]:
# unigram tf matrix
tf1_train, tf1_test = get_TF_DTM(Ngram_range = (1, 1), x_train = X_train, x_test = X_test)

# bigram tf matrix
tf2_train, tf2_test = get_TF_DTM(Ngram_range = (1, 2), x_train = X_train, x_test = X_test)

# trigram tf matrix
tf3_train, tf3_test = get_TF_DTM(Ngram_range = (1, 3), x_train = X_train, x_test = X_test)

In [22]:
print("The unique terms in tf1_train is:", tf1_train.toarray().shape[1])
print("The unique terms in tf2_train is:", tf2_train.toarray().shape[1])
print("The unique terms in tf3_train is:", tf3_train.toarray().shape[1])

The unique terms in tf1_train is: 1134
The unique terms in tf2_train is: 1623
The unique terms in tf3_train is: 1798


### 3.2.3 Create DTM using unigram, bigram and trigram TF-IDF

In [23]:
# define a function to get unigram, bigram, and trigram TF-IDF matrix

def get_TF_IDF_DTM(Ngram_range, x_train, x_test):
    vectorizer = TfidfVectorizer(stop_words='english', min_df = int(3), max_df = 0.5, 
                                 ngram_range = Ngram_range) 
    vectorizer.fit(x_train)
    trans_x_train = vectorizer.transform(x_train)
    trans_x_test = vectorizer.transform(x_test)
    
    return trans_x_train, trans_x_test

In [24]:

# unigram tf-idf matrix
tfidf1_train, tfidf1_test = get_TF_IDF_DTM(Ngram_range = (1, 1), x_train = X_train, x_test = X_test)

# bigram tf-idf matrix
tfidf2_train, tfidf2_test = get_TF_IDF_DTM(Ngram_range = (1, 2), x_train = X_train, x_test = X_test)

# trigram tf-idf matrix
tfidf3_train, tfidf3_test = get_TF_IDF_DTM(Ngram_range = (1, 3), x_train = X_train, x_test = X_test)

In [25]:
print("The unique terms in tfidf1_train is:", tfidf1_train.toarray().shape[1])
print("The unique terms in tfidf2_train is:", tfidf2_train.toarray().shape[1])
print("The unique terms in tfidf3_train is:", tfidf3_train.toarray().shape[1])

The unique terms in tfidf1_train is: 1134
The unique terms in tfidf2_train is: 1623
The unique terms in tfidf3_train is: 1798


## 3.3 Modeling

### 3.3.1 XGBoost Classifier

In [26]:
# define model training
def train_model(clf, dtm, test):
    # train data
    clf.fit(dtm, y_train)
    
    # Predicting on the test set
    preds = clf.predict(test)
    
    # print evaluation matrix
    print("Accuracy:", '{:1.4f}'.format(accuracy_score(y_test, preds)))
    print("")
    print(classification_report(y_test, preds))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, preds))
    
    return '{:1.4f}'.format(accuracy_score(y_test, preds))

In [27]:
# Use Naive Bayes
#clf = XGBClassifier() #MultinomialNB()
#clf = svm.SVC(gamma = 'scale', C = 1.0)
#param = {'max_depth': 3, 'eta': 0.3, 'objective':'multi:softmax', 'num_class': 3}
# param = {'max_depth': 3, 'learning_rate ': 0.3, 'objective':'multi:softmax'}
#xgb_clf = XGBClassifier(param)
#xgb_clf = XGBClassifier(max_depth=3, learning_rate=0.3, objective='multi:softmax')
xgb_clf = XGBClassifier(max_depth=3, learning_rate=0.3, objective='multi:softmax', num_class=3)
#svm_clf = svm.SVC(gamma = 'scale', C = 1.0)
# reference: https://medium.com/@gabrielziegler3/multiclass-multilabel-classification-with-xgboost-66195e4d9f2d
# reference: https://xgboost.readthedocs.io/en/latest/parameter.html

# Model Configurations
binary1 = ("unigram, binary", binary1_train, binary1_test)
binary2 = ("bigram, binary",  binary2_train, binary2_test)
binary3 = ("trigram, binary", binary3_train, binary3_test)
tf1 = ("unigram, TF", tf1_train, tf1_test)
tf2 = ("bigram, TF",  tf2_train, tf2_test)
tf3 = ("trigram, TF", tf3_train, tf3_test)
tfidf1 = ("unigram, TF-IDF", tfidf1_train, tfidf1_test)
tfidf2 = ("bigram, TF-IDF",  tfidf2_train, tfidf2_test)
tfidf3 = ("trigram, TF-IDF", tfidf3_train, tfidf3_test)
DTMs = [binary1, binary2, binary3,
        tf1, tf2, tf3,
        tfidf1, tfidf2, tfidf3]

df = pd.DataFrame({"config": [],
                   "accuracy": []})
best_config = ["Best Configuration", "none", 0, "none", "none"]
for data in DTMs:
    print(data[0])
    print("")
    score = train_model(clf = xgb_clf, dtm = data[1], test = data[2])
    print("======================================================")
    print("")
    if float(score) > float(best_config[2]):
        best_config = ["Best Configuration:", data[0], score, data[1], data[2]]
    df = df.append({"config": data[0],
               "accuracy": float(score)},
               ignore_index=True)

unigram, binary

Accuracy: 0.6250

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.50      0.33      0.40         3
           2       0.50      0.50      0.50         2

    accuracy                           0.62         8
   macro avg       0.58      0.61      0.59         8
weighted avg       0.59      0.62      0.60         8

Confusion Matrix:
[[3 0 0]
 [1 1 1]
 [0 1 1]]

bigram, binary

Accuracy: 0.7500

              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      0.33      0.50         3
           2       0.67      1.00      0.80         2

    accuracy                           0.75         8
   macro avg       0.81      0.78      0.72         8
weighted avg       0.82      0.75      0.71         8

Confusion Matrix:
[[3 0 0]
 [1 1 1]
 [0 0 2]]

trigram, binary

Accuracy: 0.7500

              precision    recall  f1-sc

  _warn_prf(average, modifier, msg_start, len(result))


### 3.3.2 SVM Classifier

In [29]:
svm_clf = svm.SVC(gamma = 'scale', C = 1.0)
# svm classifier
df_svm = pd.DataFrame({"config": [],
                   "accuracy": []})
best_config_svm = ["Best Configuration", "none", 0, "none", "none"]
for data in DTMs:
    print(data[0])
    print("")
    score = train_model(clf = svm_clf, dtm = data[1], test = data[2])
    print("======================================================")
    print("")
    if float(score) > float(best_config_svm[2]):
        best_config_svm = ["Best Configuration:", data[0], score, data[1], data[2]]
    df_svm = df_svm.append({"config": data[0],
               "accuracy": float(score)},
               ignore_index=True)

unigram, binary

Accuracy: 0.3750

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.38      1.00      0.55         3
           2       0.00      0.00      0.00         2

    accuracy                           0.38         8
   macro avg       0.12      0.33      0.18         8
weighted avg       0.14      0.38      0.20         8

Confusion Matrix:
[[0 3 0]
 [0 3 0]
 [0 2 0]]

bigram, binary

Accuracy: 0.3750

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.38      1.00      0.55         3
           2       0.00      0.00      0.00         2

    accuracy                           0.38         8
   macro avg       0.12      0.33      0.18         8
weighted avg       0.14      0.38      0.20         8

Confusion Matrix:
[[0 3 0]
 [0 3 0]
 [0 2 0]]

trigram, binary

Accuracy: 0.3750

              precision    recall  f1-sc

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# results of svm classifier
df_svm

Unnamed: 0,config,accuracy
0,"unigram, binary",0.375
1,"bigram, binary",0.375
2,"trigram, binary",0.375
3,"unigram, TF",0.375
4,"bigram, TF",0.375
5,"trigram, TF",0.375
6,"unigram, TF-IDF",0.375
7,"bigram, TF-IDF",0.375
8,"trigram, TF-IDF",0.375


In [31]:
# best model
print(best_config_svm)

['Best Configuration:', 'unigram, binary', '0.3750', <30x1134 sparse matrix of type '<class 'numpy.int64'>'
	with 6802 stored elements in Compressed Sparse Row format>, <8x1134 sparse matrix of type '<class 'numpy.int64'>'
	with 1462 stored elements in Compressed Sparse Row format>]
