# Part 3: NLP using spaCy

According to a team of researchers from the Baruch Ivcher School of Psychology, a <u>high noun-to-verb ratio is a subtle linguistic trait that can reduce anger in response to policies</u>. Source: <a href='https://journals.sagepub.com/doi/abs/10.1177/0956797618772823?journalCode=pssa&'>here</a>.

In this part, we use the NLP library spaCy to extract part-of-speech (POS) tags from the speeches and calculate each speech's noun-to-verb ratio.

In [1]:
# import libraries
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load spaCy's small English pipeline once and keep a handle on it, so the
# loaded model is reusable as `nlp` instead of being discarded.
# Setup note: the original cell suggested `pip install spacy==3.3.1`; the
# `en_core_web_sm` model must also be installed for this load to succeed.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Read the speeches table from part2.csv (presumably the output of the
# previous part -- confirm); the file's first column is used as the index.
cleaned_speeches = pd.read_csv(
    "part2.csv",
    index_col=0,
)
cleaned_speeches.head()

In [None]:
# Run entry 0 of the `speech` column (Obama's 2010 speech, per the original
# note) through spaCy and collect the coarse POS label of every token.

nlp = spacy.load('en_core_web_sm')
doc = nlp(cleaned_speeches["speech"][0])

pos_obama2010 = [token.pos_ for token in doc]

In [None]:
from collections import Counter

In [None]:
# Tally how often each POS tag occurs in the Obama 2010 tag list.
counter_obama2010 = Counter(pos_obama2010)
counter_obama2010

In [None]:
# Number of tokens tagged 'NOUN' in the Obama 2010 speech (0 if the tag is absent).
print(counter_obama2010.get("NOUN", 0))

In [None]:
# Count NOUN and VERB tokens in every speech.
# `nlp.pipe` streams the texts through the pipeline in batches, which spaCy
# recommends over calling `nlp(text)` once per row. Docs are yielded in input
# order, so both lists stay aligned with the rows of `cleaned_speeches`.
noun_list = []
verb_list = []

for doc in nlp.pipe(cleaned_speeches["speech"]):
    counter = Counter(token.pos_ for token in doc)
    # A Counter returns 0 for a tag that never occurs, so no KeyError here.
    noun_list.append(counter["NOUN"])
    verb_list.append(counter["VERB"])

# Quick sanity check of the per-speech noun totals.
print(noun_list)


In [None]:
# Attach the per-speech tallies as two new columns, then preview the frame.
cleaned_speeches = cleaned_speeches.assign(
    NOUN_count=noun_list,
    VERB_count=verb_list,
)
cleaned_speeches.head()

In [None]:
# Noun-to-verb ratio for each speech: NOUN_count divided by VERB_count.

cleaned_speeches = cleaned_speeches.assign(
    noun_to_verb=cleaned_speeches["NOUN_count"] / cleaned_speeches["VERB_count"]
)
cleaned_speeches

In [None]:
# Compare the two politicians' noun-to-verb ratio
fig_dims = (10, 10)
fig, ax = plt.subplots(figsize=fig_dims)
# Draw on the axes created above explicitly rather than relying on pyplot's
# implicit "current axes", then label the figure so it reads on its own.
sns.boxplot(data=cleaned_speeches, x="name", y="noun_to_verb", ax=ax)
ax.set(
    title="Noun-to-verb ratio by politician",
    xlabel="Politician",
    ylabel="Noun-to-verb ratio",
);