# Analysing UK news headlines using word norms

![image news](papers.png)

In [None]:
import pandas as pd
import seaborn as sns
sns.set()
#!pip install plotly
import plotly.express as px
import matplotlib.pyplot as plt

## Read in our word norm data

In [None]:
# VAD norms: the first CSV column becomes the index; keep three of the
# columns and give them short, lowercase names.
vad = pd.read_csv('vad.csv', index_col=0)[
    ["V.Mean.Sum", "A.Mean.Sum", "D.Mean.Sum"]
].rename(columns={
    "V.Mean.Sum": "valence",
    "A.Mean.Sum": "arousal",
    "D.Mean.Sum": "dominance",
})

# Sensorimotor norms: the first spreadsheet column becomes the index; keep
# the six sense columns followed by the five body-part columns.
sm = pd.read_excel('sensorimotor.xlsx', index_col=0).loc[:, [
    'auditory', 'gustatory', 'haptic', 'interoceptive', 'olfactory', 'visual',
    'foot_leg', 'hand_arm', 'head', 'mouth', 'torso',
]]

### What do we want to know about our data? Some basic `pandas`

* What are the column names?
* What do the first few rows look like?
* How do we select specific rows by their index?
* How can we select a smaller dataframe from a larger one?
* How can we summarise our data?
* What's the largest or smallest value in a column or row?
* How can we save our processed data?

### Plot some of the data to see what it looks like

In [None]:
# Plot a random subset of at most 500 rows. The cap is needed because
# DataFrame.sample raises ValueError when asked for more rows than the
# table holds (sampling is without replacement by default).
vad_s = vad.sample(min(500, len(vad)))

# The index is passed as hover data so each point can be identified.
fig = px.scatter_3d(vad_s, x='valence', y='arousal', z='dominance', hover_data=[vad_s.index])
fig.update_traces(
    marker=dict(size=5, color='red', line=dict(width=2, color='DarkSlateGrey')),
    selector=dict(mode='markers'),
)

fig.show()

### Can we write a function that selects word norm data for words in a list?

In [None]:
def vad_data(word_list):
    """Return the rows of the global ``vad`` table for the words in *word_list*.

    Words are lower-cased before lookup and words missing from ``vad.index``
    are silently skipped. Rows come back in the order the words were given,
    and a word listed twice appears twice.

    Args:
        word_list: Iterable of strings to look up.

    Returns:
        A DataFrame indexed by the (lower-cased) matched words, with the same
        columns as ``vad``. If nothing matches, the frame is empty but still
        has those columns, so code that refers to them by name keeps working.
    """
    lowered = [word.lower() for word in word_list]
    found = [word for word in lowered if word in vad.index]
    # One label-based selection replaces the row-by-row rebuild; it keeps
    # the column layout even when `found` is empty. The index name is
    # cleared so the result is indexed by plain, unnamed words.
    return vad.loc[found].rename_axis(None)

def sm_data(word_list):
    """Return the rows of the global ``sm`` table for the words in *word_list*.

    Words are lower-cased before lookup and words missing from ``sm.index``
    are silently skipped. Rows come back in the order the words were given,
    and a word listed twice appears twice.

    Args:
        word_list: Iterable of strings to look up.

    Returns:
        A DataFrame indexed by the (lower-cased) matched words, with the same
        columns as ``sm``. If nothing matches, the frame is empty but still
        has those columns, so code that refers to them by name keeps working.
    """
    lowered = [word.lower() for word in word_list]
    found = [word for word in lowered if word in sm.index]
    # One label-based selection replaces the row-by-row rebuild; it keeps
    # the column layout even when `found` is empty. The index name is
    # cleared so the result is indexed by plain, unnamed words.
    return sm.loc[found].rename_axis(None)


In [None]:
# Fifty everyday words (places, furniture, fruit, actions and feelings) to
# look up in the norm tables. Splitting on whitespace yields a plain list
# of strings in the order written here.
words = """
    river bed scared table valley walk ride ocean reef sad
    run couch climb chair peach hill crawl jump banana mountain
    pillow orange blanket sofa cherry swim lake excited shelf forest
    desert island grape bored brave dance happy mango apple joyful
    lamp angry fly pear kiwi tired calm plum drive desk
""".split()


In [None]:
# Look up the VAD norms for the hand-picked words and plot them in 3D,
# passing the index as hover data so each point can be identified.
vocab = vad_data(word_list=words)

fig = px.scatter_3d(
    data_frame=vocab,
    x='valence',
    y='arousal',
    z='dominance',
    hover_data=[vocab.index],
)
fig.update_traces(
    marker={
        'size': 5,
        'color': 'pink',
        'line': {'width': 2, 'color': 'DarkSlateGrey'},
    },
    selector={'mode': 'markers'},
)

fig.show()

### Can we compare VAD in other languages with VAD in English?

In [None]:
from IPython.display import IFrame

# Embed the externally hosted plot in a 1000x600 frame. The IFrame object is
# the cell's final expression, which is what makes the notebook render it.
IFrame("https://texturejc.github.io/VAD_VAG_plots/VAD_plot.html", 1000, 600)


## Read in our news data

In [None]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
headlines = pd.read_pickle("small_news_data.pkl")

# Add a `date` column parsed from `just_date`; with errors='coerce', values
# that cannot be parsed become NaT instead of raising.
days = pd.read_pickle("processed_news_grouped_by_day.pkl").assign(
    date=lambda frame: pd.to_datetime(frame["just_date"], errors='coerce')
)

## What would we predict about the VAD profile of headlines? Let's take a look

In [None]:
# Plot a random subset of at most 500 headlines. The cap is needed because
# DataFrame.sample raises ValueError when asked for more rows than the
# frame holds (sampling is without replacement by default).
# Note: this rebinds `vocab`, which earlier held word norms.
vocab = headlines.sample(min(500, len(headlines)))

# Points are coloured by `section`; the headline text is shown on hover.
fig = px.scatter_3d(vocab, x='valence', y='arousal', z='dominance', hover_data=[vocab['headline']], color=vocab['section'])
fig.update_traces(
    marker=dict(size=5, line=dict(width=2, color='DarkSlateGrey')),
    selector=dict(mode='markers'),
)

fig.show()

### Now let's look at how VAD changes over time

#### Days

In [None]:
# Valence against date, drawn as one coloured line per value of `source`.
fig = px.line(
    data_frame=days,
    x="date",
    y="valence",
    color='source',
)
fig.show()

#### Months and years

In [None]:
# Column to average; change this name to plot a different column.
variable = 'arousal'

# Derive calendar month and year columns from the parsed dates.
days["month"] = days["date"].dt.month
days["year"] = days["date"].dt.year

# Mean of the chosen column per (source, month) and per (source, year).
# Grouping on month alone pools the same calendar month across all years.
days_m = days.groupby(['source', 'month'], as_index=False)[variable].agg('mean')
days_y = days.groupby(['source', 'year'], as_index=False)[variable].agg('mean')

# Only the monthly means are plotted here; days_y is computed but not drawn
# in this cell.
fig = px.line(days_m, x="month", y=variable, color='source')
fig.show()

### Can we be selective about the topics we want to look at?

In [None]:
# Topic to search for in the headlines.
keyword = 'rugby'

# Match the keyword as literal text and ignore case, so a capitalised
# "Rugby" is found too and a keyword containing regex metacharacters
# (e.g. "." or "+") is not misread as a pattern. na=False treats missing
# headlines as non-matching.
keyword_df = headlines[
    headlines['headline'].str.contains(keyword, case=False, regex=False, na=False)
]


In [None]:
# Bare expression as the cell's last statement, so the notebook displays
# the filtered headlines.
keyword_df