### import libraries & functions

In [None]:
# import data analysis & viz libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import nlp libraries
import string
import collections
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams

# set nlp variables
# NLTK's English stopword list; clean_tokens() filters lowercased tokens against it
english_stops = stopwords.words('english')
# WordNet lemmatizer instance shared by lemmatize()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
# create nlp functions

def clean_tokens(tokens, stops=None):
    """Lowercase tokens and drop punctuation, stopwords and short strings.

    A token is kept only if it is longer than two characters, is not made up
    entirely of punctuation characters, and its lowercased form is not a
    stopword.

    Args:
        tokens: iterable of string tokens.
        stops: optional collection of lowercase stopwords. When omitted, the
            module-level ``english_stops`` list is used.

    Returns:
        A list of the surviving tokens, lowercased, in their original order.
    """
    # Sets give constant-time membership tests for every token.
    stop_set = set(english_stops if stops is None else stops)
    punctuation = set(string.punctuation)

    cleaned = []
    for token in tokens:
        if len(token) <= 2:
            continue
        # Check character by character: a plain `token in string.punctuation`
        # is a substring test and lets tokens such as "..." or "---" through.
        if all(char in punctuation for char in token):
            continue
        lowered = token.lower()
        if lowered in stop_set:
            continue
        cleaned.append(lowered)
    return cleaned

def lemmatize(tokens):
    """Return a list with each token passed through the shared WordNet lemmatizer."""
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def count_ngrams(tokens,n):
    """Count the n-grams of ``tokens``.

    Returns a list of ``(ngram_tuple, count)`` pairs ordered from most to
    least common.
    """
    return collections.Counter(ngrams(tokens, n)).most_common()

def ngram_to_dict(ngram_freq):
    """Turn ``(ngram_tuple, count)`` pairs into a ``{"w1 w2 ...": count}`` dict.

    The words of each n-gram are joined with single spaces to form the key.
    If two entries produce the same key, the later one wins.
    """
    return {' '.join(entry[0]): entry[1] for entry in ngram_freq}

### load & clean data

In [None]:
# load data from pickle
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust
# `cat` is the category prefix used in every input/output filename in this notebook
cat = 'physics'
articles = pd.read_pickle('{}_through_2018-11-18'.format(cat))

In [None]:
# copy the frame & drop rows that share the same 'id'
articles = articles.copy()
articles.drop_duplicates('id',inplace=True)

# add year column (parsed from 'year-month', stored as a string via strftime('%Y'))
articles['year'] = pd.to_datetime(articles['year-month'],format='%Y-%m').dt.strftime('%Y')

# put a space at the end of each abstract
# (the per-year 'sum' aggregation concatenates abstracts, so this keeps them separated)
articles['abstract+'] = articles['abstract'].apply(lambda abs: abs+' ')

articles.head()

In [None]:
# group abstracts by year: 'count' is the number of abstracts per year,
# 'sum' is the aggregation of the space-terminated abstract strings for that year
articles_year = articles.groupby('year')['abstract+'].agg(['count','sum'])
articles_year.head()

In [None]:
# per-year lookup tables: year -> {n-gram string: count}
word_dict = {}
bigram_dict = {}
trigram_dict = {}
ngram_dict = {}

for year in articles_year.index:
    # tokenize the aggregated abstract text for this year
    tokens = nltk.word_tokenize(articles_year.loc[year,'sum'])
    # raw token count, taken before clean_tokens() filters anything out
    wc = len(tokens)
    articles_year.loc[year,"word count"] = wc
    clean = clean_tokens(tokens)
    lem = lemmatize(clean)
    
    # count word and ngram frequency
    word_freq = count_ngrams(lem, 1)
    bigram_freq = count_ngrams(lem, 2)
    trigram_freq = count_ngrams(lem, 3)
    # combined list of 1-, 2- and 3-gram counts
    ngram_freq = word_freq + bigram_freq + trigram_freq
    
    # change to dict
    word_freq = ngram_to_dict(word_freq)
    bigram_freq = ngram_to_dict(bigram_freq)
    trigram_freq = ngram_to_dict(trigram_freq)
    ngram_freq = ngram_to_dict(ngram_freq)
    
    # add year metadata
    word_dict[year] = word_freq
    bigram_dict[year] = bigram_freq
    trigram_dict[year] = trigram_freq
    ngram_dict[year] = ngram_freq

In [None]:
# pickle articles & articles_year (articles_year now includes the "word count" column)
articles.to_pickle('{}_through_2018-12-08'.format(cat))
articles_year.to_pickle('{}_only_year_through_2018-12-08'.format(cat))

In [None]:
# turn into dfs
def dict_to_df(dictionary, year):
    """Build a DataFrame from ``dictionary`` and sort its rows by one column.

    Args:
        dictionary: mapping passed straight to ``pd.DataFrame``.
        year: label of the column to sort by, largest values first.

    Returns:
        The sorted DataFrame.
    """
    return pd.DataFrame(dictionary).sort_values(year, ascending=False)

# one frame per n-gram size (rows = n-grams, columns = years), sorted by the 2018 counts
word_df = dict_to_df(word_dict,'2018')
bigram_df = dict_to_df(bigram_dict,'2018')
trigram_df = dict_to_df(trigram_dict,'2018')

word_df.head()

In [None]:
# store as pickle files (these are the files read back in the "import cleaned data" section)
word_df.to_pickle('{}_words_df'.format(cat))
bigram_df.to_pickle('{}_bigrams_df'.format(cat))
trigram_df.to_pickle('{}_trigrams_df'.format(cat))

In [None]:
# store as csv files (same three frames, written as CSV)
word_df.to_csv('{}_words_df.csv'.format(cat))
bigram_df.to_csv('{}_bigrams_df.csv'.format(cat))
trigram_df.to_csv('{}_trigrams_df.csv'.format(cat))

### import cleaned data

In [None]:
# import from pickle files
# NOTE: `cat` must already be defined (it is set in the data-loading cell);
# unpickling can execute arbitrary code, so only load files you trust
articles_year = pd.read_pickle('{}_only_year_through_2018-12-08'.format(cat))

word_df = pd.read_pickle('{}_words_df'.format(cat))
bigram_df = pd.read_pickle('{}_bigrams_df'.format(cat))
trigram_df = pd.read_pickle('{}_trigrams_df'.format(cat))

In [None]:
# preview the reloaded word counts
word_df.head()

In [None]:
# show the reloaded per-year frame
articles_year

### frequency per 1000 words

In [None]:
def drop_pre_1998(df):
    """Drop the year columns '1993' through '1998' from ``df`` in place.

    Despite the name, the 1998 column is removed as well, so the first
    remaining year is 1999. Year columns that are not present are skipped
    instead of raising ``KeyError``, which makes the function safe to call on
    data that starts later than 1993 or to call twice on the same frame.

    Args:
        df: DataFrame whose year columns are labelled with strings.

    Returns:
        The same DataFrame object, after the columns have been removed.
    """
    early_years = [str(year) for year in range(1993, 1999)]
    df.drop(columns=early_years, errors='ignore', inplace=True)
    return df

In [None]:
def calculate_freq(df,n):
    """Convert raw counts to a frequency per ``n`` words.

    Missing counts are treated as 0. Each column of ``df`` is divided by
    that column's "word count" entry in the module-level ``articles_year``
    frame, scaled by ``n``. The input frame is left untouched.

    Returns:
        A new DataFrame with the original index moved into a regular column.
    """
    scaled = df.fillna(0)

    for year in scaled.columns:
        words_per_unit = articles_year.loc[year,'word count'] / n
        scaled[year] = scaled[year] / words_per_unit

    return scaled.reset_index()

In [None]:
# frequency per 1000 words for each n-gram size, with the 1993-1998 columns removed
word_freq_per_1000 = drop_pre_1998(calculate_freq(word_df, 1000))
bigram_freq_per_1000 = drop_pre_1998(calculate_freq(bigram_df, 1000))
trigram_freq_per_1000 = drop_pre_1998(calculate_freq(trigram_df, 1000))

word_freq_per_1000.head()

### gains

In [None]:
def top_freq(freq_df, n=100):
    """Return the ``n`` rows with the highest average frequency.

    The 'index' column becomes the row index, an 'avg freq' column holding
    the row-wise mean of the remaining columns is added, and the rows are
    sorted by it in descending order. The input frame is not modified.

    Args:
        freq_df: DataFrame with an 'index' column and numeric frequency columns.
        n: number of rows to keep (non-negative); defaults to 100.

    Returns:
        A DataFrame with at most ``n`` rows, indexed by the former 'index' column.
    """
    ranked = freq_df.set_index('index')

    # calculate average frequency
    ranked['avg freq'] = ranked.mean(axis=1)

    # keep only the top n
    ranked = ranked.sort_values('avg freq', ascending=False)
    return ranked.iloc[:n]

def calculate_gains(freq_df):
    """Add a '% gain' column comparing the '2018' column with '1999'.

    The gain is ``(2018 - 1999) / 1999``. The result is a new frame sorted by
    '% gain' and then 'avg freq', both descending; the input is not modified.
    """
    first, last = freq_df['1999'], freq_df['2018']
    with_gain = freq_df.assign(**{'% gain': (last - first) / first})

    return with_gain.sort_values(['% gain', 'avg freq'], ascending=[False, False])

In [None]:
def calculate_five_yr_gains(freq_df):
    """Add a '% gain' column comparing the '2014-2018' column with '1999-2003'.

    The gain is ``(2014-2018 - 1999-2003) / 1999-2003``. The result is a new
    frame sorted by '% gain' and then 'avg freq', both descending; the input
    is not modified.
    """
    first, last = freq_df['1999-2003'], freq_df['2014-2018']
    with_gain = freq_df.assign(**{'% gain': (last - first) / first})

    return with_gain.sort_values(['% gain', 'avg freq'], ascending=[False, False])

In [None]:
# five-year gains among the highest-average-frequency n-grams of each size
# NOTE: calculate_five_year is defined in a later cell (in the "word analysis" section);
# that cell has to be run before this one
word_gains = calculate_five_yr_gains(top_freq(calculate_five_year(word_freq_per_1000)))
bigram_gains = calculate_five_yr_gains(top_freq(calculate_five_year(bigram_freq_per_1000)))
trigram_gains = calculate_five_yr_gains(top_freq(calculate_five_year(trigram_freq_per_1000)))

In [None]:
# the gains frames are sorted by '% gain' descending: head = top of that ordering
word_gains.head(20)

In [None]:
# tail = bottom of the '% gain' ordering
word_gains.tail(20)

In [None]:
# top of the bigram '% gain' ordering
bigram_gains.head(20)

In [None]:
# bottom of the bigram '% gain' ordering
bigram_gains.tail(20)

In [None]:
# top of the trigram '% gain' ordering
trigram_gains.head(50)

In [None]:
# bottom of the trigram '% gain' ordering
trigram_gains.tail(20)

### word analysis

In [None]:
def select_best(best_of,df):
    """Select the rows of ``df`` whose 'index' value is listed in ``best_of``.

    Args:
        best_of: list-like collection of n-gram strings to keep
            (for example a pandas Index or a list).
        df: DataFrame with an 'index' column holding the n-gram strings.

    Returns:
        A new DataFrame containing only the matching rows, in their original
        order, with the 'index' column moved into the row index.
    """
    # A single vectorised membership test; the filter used to be computed twice.
    matches = df['index'].isin(best_of)
    return df[matches].set_index('index')

def plot_best(best_of_df):
    """Plot each row of ``best_of_df`` as one line on a new 15x5 figure.

    The frame is transposed so that its columns run along the x axis and its
    row labels become the legend entries; the left and bottom spines are removed.
    """
    transposed = best_of_df.transpose()
    plt.figure(figsize=(15, 5))
    plt.plot(transposed.index, transposed)
    plt.legend(transposed.columns)
    sns.despine(left=True, bottom=True)

# def plot_heatmap(best_of_df):
#     ax = sns.heatmap(best_of_df)

def calculate_five_year(freq_df):
    """Average yearly frequencies into four five-year periods.

    Expects ``freq_df`` to have an 'index' column plus one column per year
    from '1999' to '2018'. Returns a new frame with the 'index' column and the
    row-wise means for '1999-2003', '2004-2008', '2009-2013' and '2014-2018'.
    """
    five_year = pd.DataFrame()
    five_year['index'] = freq_df['index']

    # period starts: 1999, 2004, 2009, 2014
    for first_year in range(1999, 2019, 5):
        last_year = first_year + 4
        year_columns = [str(year) for year in range(first_year, last_year + 1)]
        label = '{}-{}'.format(first_year, last_year)
        five_year[label] = freq_df.loc[:, year_columns].mean(axis=1)

    return five_year

#### words

In [None]:
# gains cleaned
# first six / last six rows of word_gains (which is sorted by '% gain', descending)
words_up = word_gains.head(6).index
words_down = word_gains.tail(6).index

# line plots of the five-year averages for both groups
plot_best(select_best(words_up,calculate_five_year(word_freq_per_1000)))
plot_best(select_best(words_down,calculate_five_year(word_freq_per_1000)))

In [None]:
sns.set(style='white')

# yearly frequencies for the two word groups
# NOTE(review): select_best already moves 'index' into the row index, so .iloc[:,1:]
# drops the first remaining column — confirm that is intended
gains = select_best(words_up,word_freq_per_1000).iloc[:,1:]
losses = select_best(words_down,word_freq_per_1000).iloc[:,1:]

# two stacked heatmaps sharing the x axis and a single colorbar axis
fig,ax = plt.subplots(2,1,figsize=(10,6),sharex=True)
cbar_ax = fig.add_axes([.94, .3, .015, .4])
sns.heatmap(gains,ax=ax[0],vmin=0,vmax=8,cbar_ax=cbar_ax,cmap="Reds")
sns.heatmap(losses,ax=ax[1],vmin=0,vmax=8,cbar_ax=cbar_ax,cmap="Reds")

#remove y label
ax[0].set_ylabel('')
ax[1].set_ylabel('')

In [None]:
# power normalized
from matplotlib.colors import PowerNorm

sns.set(style='white')

# normalize colorbar
# The limits are set on the norm itself so that both heatmaps share the fixed
# 0-10 scale even when the plotting library ignores vmin/vmax because an
# explicit norm is supplied.
power_norm = PowerNorm(gamma=0.65, vmin=0, vmax=10)

# plot: two stacked heatmaps sharing the x axis and a single colorbar axis
fig,ax = plt.subplots(2,1,figsize=(10,6),sharex=True)
cbar_ax = fig.add_axes([.94, .3, .015, .4])
hm1 = sns.heatmap(gains,ax=ax[0],vmin=0,vmax=10,norm=power_norm,cbar_ax=cbar_ax,cmap="Reds")
hm2 = sns.heatmap(losses,ax=ax[1],vmin=0,vmax=10,norm=power_norm,cbar_ax=cbar_ax,cmap="Reds")
# fig.colorbar(hm1,ax=cbar_ax, extend='max')

# remove y label
ax[0].set_ylabel('')
ax[1].set_ylabel('')

#### bigrams

In [None]:
# bigrams cleaned
# first six / last six rows of bigram_gains (which is sorted by '% gain', descending)
bigrams_up = bigram_gains.head(6).index
bigrams_down = bigram_gains.tail(6).index

bigram_five_year = calculate_five_year(bigram_freq_per_1000)
# NOTE(review): set_index followed immediately by reset_index looks like a no-op round trip — confirm
bigram_five_year.set_index('index',inplace=True)
bigram_five_year.reset_index(inplace=True)

# line plots of the five-year averages for both groups
plot_best(select_best(bigrams_up,bigram_five_year))
plot_best(select_best(bigrams_down,bigram_five_year))



In [None]:
sns.set(style='white')

bigram_freq_tweaked = bigram_freq_per_1000.copy()
bigram_freq_tweaked.set_index('index',inplace=True)
bigram_freq_tweaked.reset_index(inplace=True)

# yearly frequencies for the two bigram groups
gains = select_best(bigrams_up,bigram_freq_tweaked).iloc[:,1:]
losses = select_best(bigrams_down,bigram_freq_tweaked).iloc[:,1:]

# normalize colorbar
# PowerNorm must already be imported (from matplotlib.colors) by an earlier cell.
# The limits are set on the norm itself so that both heatmaps share the fixed
# 0-2 scale even when the plotting library ignores vmin/vmax because an
# explicit norm is supplied.
power_norm = PowerNorm(gamma=0.5, vmin=0, vmax=2)

# plot: two stacked heatmaps sharing the x axis and a single colorbar axis
fig,ax = plt.subplots(2,1,figsize=(10,6),sharex=True)
cbar_ax = fig.add_axes([.94, .3, .015, .4])
sns.heatmap(gains,ax=ax[0],vmin=0,vmax=2,norm=power_norm, cbar_ax=cbar_ax,cmap="Reds")
sns.heatmap(losses,ax=ax[1],vmin=0,vmax=2,norm=power_norm, cbar_ax=cbar_ax,cmap="Reds")

#remove y label
ax[0].set_ylabel('')
ax[1].set_ylabel('')

#### trigrams

In [None]:
# trigrams cleaned
# first six / last six rows of trigram_gains (which is sorted by '% gain', descending)
trigram_up = trigram_gains.head(6).index
trigram_down = trigram_gains.tail(6).index

trigram_five_year = calculate_five_year(trigram_freq_per_1000)
# NOTE(review): set_index followed immediately by reset_index looks like a no-op round trip — confirm
trigram_five_year.set_index('index',inplace=True)
trigram_five_year.reset_index(inplace=True)

# line plots of the five-year averages for both groups
plot_best(select_best(trigram_up,trigram_five_year))
plot_best(select_best(trigram_down,trigram_five_year))

In [None]:
sns.set(style='white')

trigram_freq_tweaked = trigram_freq_per_1000.copy()
trigram_freq_tweaked.set_index('index',inplace=True)
trigram_freq_tweaked.reset_index(inplace=True)

# yearly frequencies for the two trigram groups
# (the selections are named trigram_up / trigram_down where they are defined;
# the plural spellings used here before were undefined and raised NameError)
gains = select_best(trigram_up,trigram_freq_tweaked).iloc[:,1:]
losses = select_best(trigram_down,trigram_freq_tweaked).iloc[:,1:]

# normalize colorbar
# PowerNorm must already be imported (from matplotlib.colors) by an earlier cell.
# The limits are set on the norm itself so that both heatmaps share the fixed
# 0-2 scale even when the plotting library ignores vmin/vmax because an
# explicit norm is supplied.
power_norm = PowerNorm(gamma=0.5, vmin=0, vmax=2)

# plot: two stacked heatmaps sharing the x axis and a single colorbar axis
fig,ax = plt.subplots(2,1,figsize=(10,6),sharex=True)
cbar_ax = fig.add_axes([.94, .3, .015, .4])
sns.heatmap(gains,ax=ax[0],vmin=0,vmax=2,norm=power_norm, cbar_ax=cbar_ax,cmap="Reds")
sns.heatmap(losses,ax=ax[1],vmin=0,vmax=2,norm=power_norm, cbar_ax=cbar_ax,cmap="Reds")

#remove y label
ax[0].set_ylabel('')
ax[1].set_ylabel('')