# Word Meaning
Do the meanings of words used in political discourse change over time, and can we quantify that change?

In [None]:
import pandas as pd
import os
import sys
import json
import re
import random
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from collections import Counter, defaultdict
from nltk.stem import PorterStemmer
from fuzzysearch import find_near_matches
import swifter 
from tqdm import tqdm
tqdm.pandas()
import datetime

from wordcloud import WordCloud

# NLTK's English stop words, extended with the salutation words
# 'ladies' and 'gentlemen' so they are dropped from the token counts too.
stop_words = set(stopwords.words('english')) | {'ladies', 'gentlemen'}

In [None]:
# Add the repository's shared utilities to the import path.
git_dir = os.path.abspath('../')
sys.path.append(os.path.join(git_dir, 'lib', 'utils'))

data_output_path = os.path.join(git_dir, 'data', 'hansard-parsed')

# Directory the plotting cells save figures into. `fig_dir` is read by
# plot_coccurence but was never defined, which raised a NameError on the first
# plot. The location is a default chosen here; change it to suit the repo.
fig_dir = os.path.join(git_dir, 'figures')
os.makedirs(fig_dir, exist_ok=True)

# Pickle path
df_pickle_path = os.path.join(data_output_path, 'hansard-senate-emotions-ekman.pkl')

# shared-scratch/language-of-leadership/data/hansard-parsed/hansard-senate-emotions-ekman.pkl
# Read in the parsed dataframe.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle(df_pickle_path)

# Make sure the date column holds datetimes so the year can be derived from it.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# All other types don't have enough structure.
# Calendar year of each row's date; used as the grouping key for the time series.
df['year'] = pd.to_datetime(df['date']).dt.year

In [None]:
# Baseline language data
# flat_corpus = ' '.join(df.sentences.sum())
# token_corpus = word_tokenize(flat_corpus)
# token_corpus_filtered = [w for w in token_corpus if not w in stop_words]
# corpus_freqs = Counter(token_corpus_filtered)
# most_common = corpus_freqs.most_common(300)
# corpus_norm = len(token_corpus_filtered)

In [None]:
# Words of interest: topic key -> search phrases looked up in the sentences.
woi = {
    'ind': [
        'aboriginal',
        'first nations',
        'aborigine',
        'indigenous',
        'torres strait island',
    ],
    'econ': [
        'economy',
        'economic',
    ],
    'aus': [
        'australia',
        'australian',
        'aussie',
        'australians',
    ],
    'gov': [
        'government',
    ],
    'ref': [
        'immigrant',
        'asylum seeker',
        'boat people',
        'illegal arrivals',
        'boat arrivals',
        'boat',
    ],
    'env': [
        'fire',
        'bushfire',
        'climate change',
        'climate action',
        'extinction',
        'global warming',
        'greenhouse',
        'emissions',
        'environment',
        'coral reef',
        'reef',
        'climate',
        'degradation',
        'sustainability',
    ],
}
# Generic words excluded from every topic's co-occurrence ranking.
_GENERAL_TERMS = [
    'australia',
    'australian',
    'aussie',
    'australians',
    'government',
    'nation',
    'year',
    'country',
]

# First Nations search vocabulary, with the multi-word place name broken into
# single tokens.
_INDIGENOUS_TERMS = [
    'aboriginal',
    'first nations',
    'aborigine',
    'indigenous',
    'torres',
    'strait',
    'island',
]

# Environment search vocabulary, broken into single tokens in the same way.
_ENVIRONMENT_TERMS = [
    'fire',
    'bushfire',
    'climate',
    'change',
    'action',
    'extinction',
    'global',
    'warming',
    'greenhouse',
    'emissions',
    'environment',
    'coral',
    'reef',
    'degradation',
    'sustainability',
]

# Topic key -> words to leave out of that topic's co-occurrence results.
# Every entry is a fresh list so that mutating one topic's filter never
# affects another.
filters = {
    'ind': _INDIGENOUS_TERMS + _GENERAL_TERMS,
    'econ': ['economy', 'economic'] + _GENERAL_TERMS,
    'gov': list(_GENERAL_TERMS),
    'gen': list(_GENERAL_TERMS),
    # NOTE(review): 'ref' has always carried the First Nations terms rather
    # than the refugee search terms; kept as-is, confirm this is intended.
    'ref': _INDIGENOUS_TERMS + _GENERAL_TERMS,
    # Previously missing, so the environment cell failed with a KeyError.
    'env': _ENVIRONMENT_TERMS + _GENERAL_TERMS,
}

In [None]:
def find_colocation_neighbours(idxs_in, size, dist=1):
    """Extend ``idxs_in`` in place with the indices of nearby sentences.

    For every index already in ``idxs_in``, all indices within ``dist``
    positions on either side are added, clipped to the valid range
    ``0 .. size - 1``. The list is left sorted and free of duplicates, so a
    sentence that is both a match and a neighbour of another match is only
    listed once.

    Args:
        idxs_in: List of matched sentence indices; modified in place.
        size: Number of sentences in the sequence being indexed.
        dist: How many positions either side of a match count as neighbours.

    Returns:
        None. The result is written back into ``idxs_in``.
    """
    expanded = set()
    for idx in idxs_in:
        # Clip the window so it never reaches below 0 or past the last index.
        first = max(idx - dist, 0)
        last = min(idx + dist, size - 1)
        expanded.update(range(first, last + 1))
    # Slice-assign so the caller's list object receives the result.
    idxs_in[:] = sorted(expanded)

def fuzzy_colocation(sequences, search_key, stemmer):
    """Count stemmed tokens in the sentences that fuzzily match a search phrase.

    A sentence is selected when at least one phrase in ``search_key`` has a
    near match in it (Levenshtein distance of at most 2). The selected indices
    are handed to ``find_colocation_neighbours``, which extends the selection
    in place. The selected sentences are then joined, tokenised, stripped of
    stop words and stemmed.

    Args:
        sequences: Indexable collection of sentence strings.
        search_key: Iterable of phrases to search for.
        stemmer: Object with a ``stem(word)`` method.

    Returns:
        A ``Counter`` mapping each stemmed token to its number of occurrences.
    """
    matched = [
        position
        for position, sentence in enumerate(sequences)
        # A generator lets any() stop at the first phrase that matches instead
        # of running the fuzzy search for every phrase.
        if any(
            find_near_matches(phrase, sentence, max_l_dist=2)
            for phrase in search_key
        )
    ]
    find_colocation_neighbours(matched, len(sequences))
    text = ' '.join(sequences[position] for position in matched)
    return Counter(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

In [None]:
def get_cooccurence(
    search_key,
    key,
    filters
):
    """Build per-year co-occurrence counts for the words found near a topic.

    Runs ``fuzzy_colocation`` over every row of ``df['sentences']``, stores the
    resulting counters in the column ``df[key]`` (a side effect on the global
    dataframe), and sums them per year.

    Args:
        search_key: Phrases to search for. This argument is now used as given;
            it was previously overwritten with ``woi[key]``.
        key: Name of the dataframe column that receives the per-row counters.
        filters: Words to exclude from the result; they are stemmed before
            being compared with the stemmed tokens.

    Returns:
        A tuple ``(cooccur_number, most_common, timeseries)``:
        ``cooccur_number`` is the per-year sum of counters indexed by year,
        ``most_common`` lists up to 20 words with the highest total count, and
        ``timeseries`` maps each unfiltered word to its list of yearly counts
        in the order of ``cooccur_number``.
    """
    # One stemmer is enough; there is no need to build a new one per row.
    stemmer = PorterStemmer()
    df[key] = df['sentences']\
        .progress_map(lambda x: fuzzy_colocation(x, search_key, stemmer))

    cooccur_number = df.groupby('year')[key].sum()
    total_freqs = cooccur_number.sum()

    # Set membership keeps the per-word filter check cheap.
    stemmed_filter = {stemmer.stem(x) for x in filters}
    yearly_counters = cooccur_number.to_list()

    timeseries = defaultdict(list)
    for word in total_freqs:
        if word in stemmed_filter:
            continue
        timeseries[word] = [counter.get(word, 0) for counter in yearly_counters]

    totals = {word: sum(series) for word, series in timeseries.items()}
    most_common = [word for word, _ in Counter(totals).most_common(20)]

    return cooccur_number, most_common, timeseries

In [None]:
def plot_coccurence(
    key,
    cooccur_number,
    most_common,
    timeseries,
    group,
    context
):
    """Plot smoothed yearly counts for one slice of the most common words.

    The figure is saved as ``{fig_dir}/{start}-{stop}-{key}-hansard.png`` and
    then shown. ``fig_dir`` is read from module scope and must be defined
    before this function is called.

    Args:
        key: Topic key, used in the output file name.
        cooccur_number: Per-year series whose index supplies the x axis.
        most_common: Ranked list of words.
        timeseries: Mapping from word to its list of yearly counts.
        group: ``(start, stop)`` slice bounds into ``most_common``.
        context: Topic name inserted into the plot title.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from scipy.ndimage import gaussian_filter1d

    words = most_common[group[0]:group[1]]
    for word in words:
        # Smooth as floats: with integer input the filter returns an integer
        # array, which truncates the smoothed values.
        counts = [float(value) for value in timeseries[word]]
        plt.plot(cooccur_number.index, gaussian_filter1d(counts, sigma=2))
    if words:
        # Draw the legend once, after all lines have been added.
        plt.legend(words)
    plt.title(f'Language Addressing {context} Issues')
    plt.savefig(f'{fig_dir}/{group[0]}-{group[1]}-{key}-hansard.png')
    plt.show()

In [None]:
def generate_and_plot(
    search_key,
    key,
    filters,
    context
):
    """Compute the co-occurrence series for a topic and plot the top 20 words.

    Calls ``get_cooccurence`` once, then ``plot_coccurence`` four times, one
    figure for each window of five ranks: 0-5, 5-10, 10-15 and 15-20.

    Args:
        search_key: Phrases to search for, forwarded to ``get_cooccurence``.
        key: Topic key, forwarded to both helpers.
        filters: Words to exclude, forwarded to ``get_cooccurence``.
        context: Topic name used in the plot titles.
    """
    cooccur_number, most_common, timeseries = get_cooccurence(search_key, key, filters)
    rank_windows = ((0, 5), (5, 10), (10, 15), (15, 20))
    for window in rank_windows:
        plot_coccurence(key, cooccur_number, most_common, timeseries, window, context)

In [None]:
# Search with the First Nations terms so the query agrees with the 'ind'
# filter and the 'First Nations' plot title (the 'aus' terms were used before).
key = 'ind'
generate_and_plot(woi[key], key, filters[key], 'First Nations')

In [None]:
# Economy-related language over time.
key = 'econ'
generate_and_plot(
    search_key=woi[key],
    key=key,
    filters=filters[key],
    context='Economics',
)

In [None]:
# Refugee-related language over time.
key = 'ref'
generate_and_plot(
    search_key=woi[key],
    key=key,
    filters=filters[key],
    context='Refugee',
)

In [None]:
# Environment-related language over time.
# Fall back to the general filter when no topic-specific 'env' filter exists,
# rather than failing with a KeyError.
key = 'env'
generate_and_plot(woi[key], key, filters.get(key, filters['gen']), 'Environment')