In [1]:
# Data
import numpy as np
import pandas as pd

# NLP
import nltk
from nltk.util import bigrams 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# plot
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt

## 1 Import Corpus

In [2]:
# load news articles: one CSV per candidate, all read from the same folder
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is where this folder exists.
Path = "C:/Users/Winnie/Documents/2020 Spring/NLP/Final Project/Data/All_Candidates/"
Biden, Sanders, Trump = (
    pd.read_csv(Path + file_name)
    for file_name in ('Joe_Biden.csv', 'Bernie_Sanders.csv', 'Donald_Trump.csv')
)
Trump.head(2)

Unnamed: 0,title,text,media,word_count,candidate_name
0,Primary Battles on the Right? They Seem Less S...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,1880.0,Donald Trump
1,Collins Will Not Support Removal of the Presid...,Full TextTranslateUndo Translation FromToTrans...,New York Times,1066.0,Donald Trump


In [3]:
# keep only the second column (position 1) of each candidate's frame;
# for Trump this is the `text` column, i.e. the article body
# (presumably the same for the other two files -- TODO confirm)
Biden_News, Sanders_News, Trump_News = (
    frame.iloc[:, 1] for frame in (Biden, Sanders, Trump)
)
Trump_News.head(2)

0    Hide highlightingFull TextTranslateUndo Transl...
1    Full TextTranslateUndo Translation FromToTrans...
Name: text, dtype: object

## 2 TF-IDF Analysis 

### 2.1 Define functions to generate results at a large scale 

In [4]:
def TF_IDF_Data_Preprocessing(corpus):
    """Stem every whitespace-separated token of every article.

    Parameters
    ----------
    corpus : pandas.Series
        One article per element.

    Returns
    -------
    pandas.Series
        Same index as ``corpus``; each element is a single plain string made
        of the Porter-stemmed tokens of the article joined by one space.
        Missing articles (NaN/None) become the empty string so that row
        alignment with the input is preserved.

    Notes
    -----
    The previous implementation wrapped each joined article in a one-element
    list and then cast it with ``astype(str)``, so every document was the
    ``repr`` of a list (``"['...']"``) -- brackets, quotes and repr escape
    sequences included. The documents are now returned as plain strings.
    """
    # build the stemmer once instead of once per token
    stemmer = PorterStemmer()

    def _stem_article(article):
        # split on any whitespace, stem each token, glue back with single spaces
        return ' '.join(stemmer.stem(token) for token in article.split())

    # fillna/astype guard against missing or non-string entries, which would
    # otherwise fail when the article is split into tokens
    return corpus.fillna('').astype(str).map(_stem_article)

In [5]:
def TF_IDF_Analyzer(corpus, min_df_value, max_df_value, n_gram_value, col_1_name, col_2_name, top_n):
    """Rank the n-grams of a corpus by their mean TF-IDF score.

    Parameters
    ----------
    corpus : iterable of str
        One document per element.
    min_df_value : number
        Passed to ``CountVectorizer(min_df=...)`` after coercion with
        ``int()``, so it is always treated as an absolute document count.
    max_df_value : int or float
        Passed unchanged to ``CountVectorizer(max_df=...)``.
    n_gram_value : int
        Only n-grams of exactly this length are counted
        (``ngram_range=(n, n)``).
    col_1_name : str
        Name of the output column holding the terms.
    col_2_name : str
        Name of the output column holding the mean TF-IDF scores.
    top_n : int
        Number of highest-scoring rows to keep.

    Returns
    -------
    pandas.DataFrame
        Two columns (``col_1_name``, ``col_2_name``), sorted by score in
        descending order, index reset to 0..k-1, truncated to ``top_n`` rows.
    """
    ## convert a collection of text documents to a matrix of n-gram token counts
    # NOTE(review): the built-in English stop-word list is matched against the
    # tokens exactly as they appear in `corpus`; if the corpus was stemmed
    # beforehand, stemmed forms of stop words are not filtered out.
    vectorizer = CountVectorizer(stop_words='english',
                                 min_df=int(min_df_value),
                                 max_df=max_df_value,
                                 ngram_range=(n_gram_value, n_gram_value))

    # generate the term counts for the corpus (documents x terms)
    word_count_matrix = vectorizer.fit_transform(corpus)

    ## compute the TF-IDF score for the corpus
    # transform the count matrix to a normalized tf-idf representation
    TF_IDF_vectors = TfidfTransformer().fit_transform(word_count_matrix)

    # calculate the mean tf-idf score of each term across all documents
    TF_IDF_scores = np.asarray(TF_IDF_vectors.mean(axis=0)).ravel().tolist()

    # `get_feature_names()` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older versions that lack it
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = list(vectorizer.get_feature_names())

    # format the results in a dataframe and keep only the top n terms
    results_df = pd.DataFrame({col_1_name: terms,
                               col_2_name: TF_IDF_scores})
    results_sort = results_df.sort_values(by=col_2_name, ascending=False).reset_index(drop=True)[:top_n]

    return results_sort

# reference: https://iyzico.engineering/how-to-calculate-tf-idf-term-frequency-inverse-document-frequency-from-the-beatles-biography-in-c4c3cd968296    

### 2.2 Joe Biden TF-IDF Analysis Results 

In [6]:
# stem the Biden articles, then rank their unigrams by mean TF-IDF score
Biden_News_Processed = TF_IDF_Data_Preprocessing(Biden_News)
Biden_TF_IDF_Results = TF_IDF_Analyzer(
    corpus=Biden_News_Processed,
    min_df_value=3,      # drop terms found in fewer than 3 articles
    max_df_value=0.5,    # drop terms found in more than half of the articles
    n_gram_value=1,      # single words only
    col_1_name='Biden Terms',
    col_2_name='Biden TF-IDF Score',
    top_n=20,
)
Biden_TF_IDF_Results

Unnamed: 0,Biden Terms,Biden TF-IDF Score
0,ms,0.040405
1,bloomberg,0.037313
2,impeach,0.03557
3,buttigieg,0.031897
4,south,0.027681
5,carolina,0.027412
6,black,0.025652
7,trial,0.024963
8,super,0.024686
9,ad,0.022953


### 2.3 Donald Trump TF-IDF Analysis Results

In [7]:
# stem the Trump articles, then rank their unigrams by mean TF-IDF score
Trump_News_Processed = TF_IDF_Data_Preprocessing(Trump_News)
Trump_TF_IDF_Results = TF_IDF_Analyzer(
    corpus=Trump_News_Processed,
    min_df_value=3,      # drop terms found in fewer than 3 articles
    max_df_value=0.5,    # drop terms found in more than half of the articles
    n_gram_value=1,      # single words only
    col_1_name='Trump Terms',
    col_2_name='Trump TF-IDF Score',
    top_n=20,
)
Trump_TF_IDF_Results

Unnamed: 0,Trump Terms,Trump TF-IDF Score
0,biden,0.040791
1,bloomberg,0.027297
2,investig,0.026325
3,ukrain,0.025381
4,ms,0.024838
5,case,0.023303
6,voter,0.023002
7,iowa,0.02293
8,wit,0.022819
9,bolton,0.022006


### 2.4 Bernie Sanders TF-IDF Analysis Results 

In [8]:
# stem the Sanders articles, then rank their unigrams by mean TF-IDF score
Sanders_News_Processed = TF_IDF_Data_Preprocessing(Sanders_News)
Sanders_TF_IDF_Results = TF_IDF_Analyzer(
    corpus=Sanders_News_Processed,
    min_df_value=3,      # drop terms found in fewer than 3 articles
    max_df_value=0.5,    # drop terms found in more than half of the articles
    n_gram_value=1,      # single words only
    col_1_name='Sanders Terms',
    col_2_name='Sanders TF-IDF Score',
    top_n=20,
)
Sanders_TF_IDF_Results

Unnamed: 0,Sanders Terms,Sanders TF-IDF Score
0,bloomberg,0.046533
1,ms,0.043925
2,percent,0.033493
3,million,0.026562
4,super,0.025339
5,nevada,0.024678
6,health,0.024547
7,deleg,0.02439
8,carolina,0.024179
9,klobuchar,0.023548


## 3 Show TF-IDF Analysis Results 

In [9]:
# place the three per-candidate rankings side by side (column-wise concat)
dflist = [
    Biden_TF_IDF_Results,
    Sanders_TF_IDF_Results,
    Trump_TF_IDF_Results,
]
Results_All = pd.concat(dflist, axis=1, sort=False)
# shift the row labels so that ranks start at 1 instead of 0
Results_All.index += 1
Results_All

Unnamed: 0,Biden Terms,Biden TF-IDF Score,Sanders Terms,Sanders TF-IDF Score,Trump Terms,Trump TF-IDF Score
1,ms,0.040405,bloomberg,0.046533,biden,0.040791
2,bloomberg,0.037313,ms,0.043925,bloomberg,0.027297
3,impeach,0.03557,percent,0.033493,investig,0.026325
4,buttigieg,0.031897,million,0.026562,ukrain,0.025381
5,south,0.027681,super,0.025339,ms,0.024838
6,carolina,0.027412,nevada,0.024678,case,0.023303
7,black,0.025652,health,0.024547,voter,0.023002
8,trial,0.024963,deleg,0.02439,iowa,0.02293
9,super,0.024686,carolina,0.024179,wit,0.022819
10,ad,0.022953,klobuchar,0.023548,bolton,0.022006


## 4 Unique Words for each Candidate

In [10]:
# identify the top 60 TF-IDF terms for each candidate
# (the variables are named `*_top_100` but hold 60 rows, per top_n below;
#  the names are kept because later cells refer to them)
Biden_TF_IDF_Results_top_100 = TF_IDF_Analyzer(
    corpus=Biden_News_Processed,
    min_df_value=3,
    max_df_value=0.5,
    n_gram_value=1,
    col_1_name='Biden Terms',
    col_2_name='Biden TF-IDF Score',
    top_n=60,
)

Trump_TF_IDF_Results_top_100 = TF_IDF_Analyzer(
    corpus=Trump_News_Processed,
    min_df_value=3,
    max_df_value=0.5,
    n_gram_value=1,
    col_1_name='Trump Terms',
    col_2_name='Trump TF-IDF Score',
    top_n=60,
)

Sanders_TF_IDF_Results_top_100 = TF_IDF_Analyzer(
    corpus=Sanders_News_Processed,
    min_df_value=3,
    max_df_value=0.5,
    n_gram_value=1,
    col_1_name='Sanders Terms',
    col_2_name='Sanders TF-IDF Score',
    top_n=60,
)

In [11]:
# helper used to find the terms that are unique to one candidate
def generate_unique_terms(df1, df2, df3):
    """Return the first-column values of ``df1`` found in neither ``df2`` nor ``df3``.

    Only the first column (position 0) of each frame is read. The result is a
    ``set``, so duplicates and ordering are not preserved.
    """
    own_terms, other_terms_a, other_terms_b = (
        set(frame.iloc[:, 0]) for frame in (df1, df2, df3)
    )
    return own_terms - other_terms_a - other_terms_b
    

In [12]:
# unique terms for Biden: Biden's top terms that appear in neither the
# Trump nor the Sanders top-term list
unique_terms_in_Biden = generate_unique_terms(
    df1=Biden_TF_IDF_Results_top_100,
    df2=Trump_TF_IDF_Results_top_100,
    df3=Sanders_TF_IDF_Results_top_100,
)
unique_terms_in_Biden

{'buttigieg',
 'close',
 'event',
 'facebook',
 'mayor',
 'nomin',
 'party',
 'secur',
 'south',
 'trial',
 'woman',
 'won'}

In [13]:
# unique terms for Trump: Trump's top terms that appear in neither the
# Biden nor the Sanders top-term list
unique_terms_in_Trump = generate_unique_terms(
    df1=Trump_TF_IDF_Results_top_100,
    df2=Biden_TF_IDF_Results_top_100,
    df3=Sanders_TF_IDF_Results_top_100,
)
unique_terms_in_Trump

{'administr',
 'advis',
 'argu',
 'argument',
 'articl',
 'attorney',
 'biden',
 'bolton',
 'candid',
 'case',
 'congress',
 'defens',
 'democrats',
 'giuliani',
 'gop',
 'intellig',
 'iowa',
 'iran',
 'john',
 'justic',
 'lawyer',
 'manag',
 'mcconnel',
 'meet',
 'offic',
 'parna',
 'parti',
 'pelosi',
 'plan',
 'race',
 'remov',
 'romney',
 'russia',
 'sander',
 'schiff',
 'sen',
 'speech',
 'stone',
 'team',
 'thursday',
 'tuesday',
 'ukraine',
 'veri',
 'voter',
 'win'}

In [14]:
# unique terms for Sanders: Sanders' top terms that appear in neither the
# Trump nor the Biden top-term list
unique_terms_in_Sanders = generate_unique_terms(
    df1=Sanders_TF_IDF_Results_top_100,
    df2=Trump_TF_IDF_Results_top_100,
    df3=Biden_TF_IDF_Results_top_100,
)
unique_terms_in_Sanders

{'000',
 'accord',
 'ballot',
 'california',
 'caucu',
 'chang',
 'defeat',
 'earli',
 'face',
 'field',
 'hous',
 'leader',
 'past',
 'primary',
 'spend',
 'told',
 'turnout'}