# Word Norms and Natural Language Processing

In [None]:
import pandas as pd
import seaborn as sns
sns.set()
#!pip install plotly
import plotly.express as px
#!pip install jupyterlab "ipywidgets>=7.5"
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler

## Wittgenstein's beetle

>Suppose everyone had a box with something in it: we call it a “beetle”. No one can look into anyone else’s box, and everyone says he knows what a beetle is only by looking at his beetle- Here it would be quite possible for everyone to have something different in his box. One might even imagine such a thing constantly changing- But suppose the word “beetle” had a use in these people’s language? If so it would not be used as the name of a thing. The thing in the box has no place in the language game at all, not even as a something: for the box might even be empty. ––No, one can 'divide through' by the thing in the box; it cancels out, whatever it is.

––<cite>Ludwig Wittgenstein, <i>Philosophical Investigations</i> §293</cite>

## What is a word norm?

A word norm captures the average impact of a word across a linguistic community. That is, it is a public record of the function, behaviour or reception of a word by its users. A word norm is public: individuals may vary in their reception of a word, but these variations should cancel out unless they're the product of systematic word features. If the latter, then they should form part of the word norm data. Word norms have been collected for the following features:

* Affective (i.e. emotional) impact [(Bradley & Lang 1999)](https://pdodds.w3.uvm.edu/teaching/courses/2009-08UVM-300/docs/others/everything/bradley1999a.pdf), [(Warriner et al. 2013)](https://pubmed.ncbi.nlm.nih.gov/23404613/)
* Concreteness [(Brysbaert et al. 2013)](https://link.springer.com/article/10.3758/s13428-013-0403-5)
* Sensorimotor embodiment [(Lynott et al. 2019)](https://pubmed.ncbi.nlm.nih.gov/31832879/)
* Frequency [(Shapiro and Gordon 1971)](https://www.sciencedirect.com/science/article/abs/pii/S0022537171800993)
* Imageability (ease of visualisation) [(Scott et al. 2019)](https://link.springer.com/article/10.3758/s13428-018-1099-3)
* Semantic size (how 'important' a word is felt to be) [(Scott et al. 2019)](https://link.springer.com/article/10.3758/s13428-018-1099-3)
* Agency (extent to which a word is associated with purpose) (Carney and Robertson, unpublished)

### Word Norms for two common words

<img src="two_words.png" width="800" height="400">



  

## Our word norms

We will look at two sets of word norms today:
* The valence-arousal-dominance (VAD) word norms in Warriner et al. (2013)
* The sensorimotor word norms published in Lynott et al. (2019)

In [None]:
# Read both norm tables, using the first spreadsheet column as the index.
vad = pd.read_excel('vad.xlsx', index_col=0)           # VAD norms
sm = pd.read_excel('sensorimotor.xlsx', index_col=0)   # sensorimotor norms

# Keep only the eleven sensorimotor dimensions used in this notebook.
sm_columns = [
    'auditory', 'gustatory', 'haptic', 'interoceptive', 'olfactory',
    'visual', 'foot_leg', 'hand_arm', 'head', 'mouth', 'torso',
]
sm = sm[sm_columns]

# Min-max scale every sensorimotor column, then rebuild a DataFrame that
# carries the same row index and column labels as `sm`.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(sm)
scaled_df = pd.DataFrame(scaled_data, index=sm.index, columns=sm.columns)

# Join on the index; merge's default inner join keeps only the index labels
# that occur in both tables.
all_norms = vad.merge(scaled_df, left_index=True, right_index=True)

## The VAD model

### [VAD plots for English and Spanish](https://texturejc.github.io/VAD_VAG_plots/VAD_plot.html)


In [None]:


# Plot a reproducible random subset of the VAD norms: the fixed seed yields the
# same points on every Restart & Run All, and the sample size is capped so that
# sampling cannot fail if the table holds fewer than 500 rows.
vad_s = vad.sample(n=min(500, len(vad)), random_state=42)

# The index is passed as hover data so each point can be identified on hover.
fig = px.scatter_3d(vad_s, x='valence', y='arousal', z='dominance',
                    hover_data=[vad_s.index])
fig.update_traces(
    marker=dict(size=2, line=dict(width=2, color='DarkSlateGrey')),
    selector=dict(mode='markers'),
)
fig.show()



In [None]:
# Tokenise the sentence, then lower-case and lemmatise every token.
text = "Spiders, crime, and earthquakes haunt my nightmares"
tokens = word_tokenize(text)
lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens]

# Keep the lemmas that occur in the VAD index (in sentence order), together
# with the VAD entry looked up for each of them.
words = [lemma for lemma in lemmas if lemma in vad.index]
emo = [vad.loc[lemma] for lemma in words]

In [None]:
# Tabulate the collected VAD entries, labelling the rows with their lemmas.
emo_df = pd.DataFrame(data=emo, index=words)
emo_df

In [None]:
word = 'spider'

# Normalise case once and reuse it for the membership test and the lookup.
key = word.lower()

if key in vad.index:
    lex = vad.loc[key]

    # Long-form table for plotting: a label column and a value column.
    df = pd.DataFrame({'label': lex.index, 'value': lex.values})

    # line_close=True makes plotly join the last point back to the first, so
    # the first row does not need to be appended by hand (doing so only
    # duplicated a point on the trace).
    fig = px.line_polar(df, r='value', theta='label', line_close=True)
    fig.update_traces(fill='toself')
    fig.update_layout(title="VAD norms for " + word.upper())
    fig.show()
else:
    print("Try another word")

## The sensorimotor norms

In [None]:
# Tokenise the sentence, then lower-case and lemmatise every token.
text = "Apples on vacation reduce cell oxidization and improve spirituality"
tokens = word_tokenize(text)
lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens]

# Keep the lemmas that occur in the sensorimotor index (in sentence order),
# together with the entry looked up for each of them.
words = [lemma for lemma in lemmas if lemma in sm.index]
sm_ = [sm.loc[lemma] for lemma in words]

# Tabulate the collected entries, labelling the rows with their lemmas.
sm_df = pd.DataFrame(data=sm_, index=words)
sm_df

In [None]:
word = 'chemical'

# Normalise case once and reuse it for the membership test and the lookup.
key = word.lower()

if key in sm.index:
    lex = sm.loc[key]

    # Long-form table for plotting: a label column and a value column.
    df = pd.DataFrame({'label': lex.index, 'value': lex.values})

    # line_close=True makes plotly join the last point back to the first, so
    # the first row does not need to be appended by hand (doing so only
    # duplicated a point on the trace).
    fig = px.line_polar(df, r='value', theta='label', line_close=True)
    fig.update_traces(fill='toself')
    fig.update_layout(title="Sensorimotor norms for " + word.upper())
    fig.show()
else:
    print("Try another word")

## Combining VAD and sensorimotor norms

In [None]:
from IPython.display import display, HTML

# Inject CSS that sets the height of `.container` elements to 150%
# (presumably to give the 3D figure more vertical room -- confirm).
display(HTML("<style>.container { height:150% !important; }</style>"))

# Reproducible random subset of the merged norms: the fixed seed yields the
# same points on every Restart & Run All, and the sample size is capped so that
# sampling cannot fail if the merged table holds fewer than 100 rows.
all_ = all_norms.sample(n=min(100, len(all_norms)), random_state=42)

# The index is passed as hover data so each point can be identified on hover.
fig = px.scatter_3d(all_, x='valence', y='gustatory', z='visual',
                    hover_data=[all_.index])
fig.update_traces(
    marker=dict(size=5, line=dict(width=2, color='DarkSlateGrey')),
    selector=dict(mode='markers'),
)
fig.show()

In [None]:
# Regression of valence on the gustatory rating across the merged norms;
# scatter=False hides the individual points.
sns.regplot(
    data=all_norms,
    x='gustatory',
    y='valence',
    scatter=False,
)

## [Olfaction, valence, and arousal](https://texturejc.github.io/olfaction/Olfactory_valence.html)

## Five years of UK news headline data

[Data source: Kaggle.com](https://www.kaggle.com/datasets/lazuri22/over-5-years-of-uk-news-headlines)

In [None]:

# Load the headline-level table and the per-day table from local pickle files.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
news, news_day = (
    pd.read_pickle(path)
    for path in ('small_news_data.pkl', 'processed_news_grouped_by_day.pkl')
)

In [None]:


# Raw source identifiers mapped to display names for the outlets.
source_labels = {
    'inews': 'iNews',
    'daily_mail': 'Daily Mail',
    'express': 'Express',
    'guardian': 'Guardian',
    'evening_standard': 'Evening Standard',
    'independent': 'Independent',
}

# Apply the same relabelling to both tables.
for frame in (news, news_day):
    frame['source'] = frame['source'].replace(source_labels)


# How does the presence of a single word in a headline impact on the emotional tone of that headline?

In [None]:
word = 'Brexit'

# Case-insensitive substring test, recorded as 'yes'/'no' for each headline.
needle = word.lower()
headlines = ['yes' if needle in headline.lower() else 'no'
             for headline in news['headline']]

# The flags are stored on `news` in a column named after the word itself.
news[word] = headlines

# Only the headlines that mention the word are plotted.
news_ = news[news[word] == 'yes']

# Unconnected point estimates of headline valence per source.
# NOTE(review): newer seaborn releases may warn that `join` is deprecated --
# confirm against the installed version before changing it.
sns.pointplot(data=news_, x='source', y='valence', join=False)
plt.xticks(rotation=45)
