In [1]:
import warnings
warnings.filterwarnings("ignore")

from gensim.matutils import kullback_leibler
from gensim.models import ldamodel
from gensim.corpora import Dictionary

from datetime import datetime
from dateutil.relativedelta import relativedelta
import numpy as np
import pandas as pd
from tqdm import tqdm
from math import e

from utils import dataset, load_bow

In [7]:
# Columns kept from the raw dataset.
col = [
    'Publication Type',
    'Venue',
    'Author Full Names',
    'Article Title',
    'Abstract',
    'Publication Date',
    'Publication Year',
]

# Rows without an abstract are dropped *before* the lemmatized abstracts are
# attached; presumably `load_bow('25_venues')` yields one entry per remaining
# row in the same order -- TODO confirm against `utils.load_bow`.
venues = dataset()[col].dropna(subset=['Abstract'])
venues['Abstract Lemmatized'] = load_bow('25_venues')

# A publication month is required to place each document on the timeline.
venues = venues.dropna(subset=['Publication Date'])

# Combine month and year into "<Mon>-<YYYY>" and parse it; every document is
# therefore dated to the first day of its publication month.
month_year = venues['Publication Date'] + '-' + venues['Publication Year']
venues['Date'] = month_year.apply(lambda text: datetime.strptime(text, '%b-%Y'))

# Chronological order with a fresh 0..n-1 index, so that index labels and row
# positions coincide in the cells that follow.
venues = venues.sort_values(by='Date').reset_index(drop=True)

In [8]:
# Name of the saved LDA model under models/lda_models/ (the name presumably
# encodes the training hyper-parameters -- TODO confirm).
model_name = '60_asymmetric_auto_0.5_0.0075'
model_path = f'models/lda_models/{model_name}'
model = ldamodel.LdaModel.load(model_path)

In [9]:
def get_topics(bow):
    """Return the topic distribution of one document under the loaded `model`.

    Parameters
    ----------
    bow
        The document in the form accepted by `model.id2word.doc2bow`
        (presumably a list of lemmatized tokens -- TODO confirm).

    Returns
    -------
    The result of `model.get_document_topics` for the document, requested with
    `minimum_probability=0` so that low-probability topics are not filtered out.
    """
    bag_of_words = model.id2word.doc2bow(bow)
    return model.get_document_topics(bag_of_words, minimum_probability=0)
# One topic distribution per document, inferred from its lemmatized abstract.
venues['LDA Distribution'] = venues['Abstract Lemmatized'].apply(get_topics)

In [10]:
# Work on an explicit copy: metric columns are added to `df` later, and an
# independent frame keeps those writes away from a selection of `venues`.
df = venues[['Date', 'LDA Distribution']].copy()

# Map every distinct date to the (first, last) index label of its rows.
# A single groupby pass replaces re-filtering the whole column twice per
# distinct date. The window functions use these labels as row positions, which
# relies on `venues` being sorted by date with a 0..n-1 index.
date_dict = {
    pd.Timestamp(date): (labels[0], labels[-1])
    for date, labels in df.groupby('Date').groups.items()
}

In [11]:
def Novelty(index, w):
    """Mean KL divergence (in bits) of one document from the preceding `w` months.

    The window runs from `w` months before the document's date up to and
    including the month immediately before it. The divergence of the document
    from every document in the window is averaged per month and then across
    months.

    Parameters
    ----------
    index : int
        Index label of the document in the module-level `df`.
    w : int
        Window length in months.

    Returns
    -------
    float
        The averaged divergence, or `np.nan` when the window start is not
        strictly later than the earliest date in `df`.

    Raises
    ------
    KeyError
        If the first or the last month of the window has no entry in
        `date_dict`.
    """
    doc_date = df.at[index, 'Date']
    start = doc_date - relativedelta(months=w)
    end = doc_date - relativedelta(months=1)

    # NOTE(review): the strict comparison also excludes documents whose window
    # starts exactly at the earliest month; kept as in the original -- confirm.
    if start <= df['Date'].min():
        return np.nan

    first_row = date_dict[start][0]
    last_row = date_dict[end][1]
    # `iloc` slices exclude the stop position, so add 1 to keep the final
    # document of the end month inside the window.
    window = df.iloc[first_row:last_row + 1]

    doc_topics = df.at[index, 'LDA Distribution']
    # Build a standalone Series keyed by date instead of adding a column to a
    # slice of `df`.
    kld = pd.Series(
        [kullback_leibler(doc_topics, other) for other in window['LDA Distribution']],
        index=window['Date'].to_numpy(),
    )
    # Average within each month first, then across months; the log2(e) factor
    # rescales a natural-log divergence to bits.
    return kld.groupby(level=0).mean().mean() * np.log2(np.e)

def Transience(index, w):
    """Mean KL divergence (in bits) of one document from the following `w` months.

    The window runs from the month immediately after the document's date up to
    and including the month `w` months after it. The divergence of the document
    from every document in the window is averaged per month and then across
    months.

    Parameters
    ----------
    index : int
        Index label of the document in the module-level `df`.
    w : int
        Window length in months.

    Returns
    -------
    float
        The averaged divergence, or `np.nan` when the window end is not
        strictly earlier than the latest date in `df`.

    Raises
    ------
    KeyError
        If the first or the last month of the window has no entry in
        `date_dict`.
    """
    doc_date = df.at[index, 'Date']
    start = doc_date + relativedelta(months=1)
    end = doc_date + relativedelta(months=w)

    # NOTE(review): the strict comparison also excludes documents whose window
    # ends exactly at the latest month; kept as in the original -- confirm.
    if end >= df['Date'].max():
        return np.nan

    first_row = date_dict[start][0]
    last_row = date_dict[end][1]
    # `iloc` slices exclude the stop position, so add 1 to keep the final
    # document of the end month inside the window.
    window = df.iloc[first_row:last_row + 1]

    doc_topics = df.at[index, 'LDA Distribution']
    # Build a standalone Series keyed by date instead of adding a column to a
    # slice of `df`.
    kld = pd.Series(
        [kullback_leibler(doc_topics, other) for other in window['LDA Distribution']],
        index=window['Date'].to_numpy(),
    )
    # Average within each month first, then across months; the log2(e) factor
    # rescales a natural-log divergence to bits.
    return kld.groupby(level=0).mean().mean() * np.log2(np.e)

In [None]:
# Window length in months, shared by the three measures below.
w = 1

# Novelty: divergence of each document from the preceding window.
df[f'Novelty_{w}'] = [
    Novelty(i, w)
    for i in tqdm(range(len(df)), desc=f'Novelty_{w}')
]
# The frame is written out after every stage so that a finished stage is
# already on disk while the next one runs.
df.to_csv(f'KLD/distributions_{w}.csv', index=False)

# Transience: divergence of each document from the following window.
df[f'Transience_{w}'] = [
    Transience(i, w)
    for i in tqdm(range(len(df)), desc=f'Transience_{w}')
]
df.to_csv(f'KLD/distributions_{w}.csv', index=False)

# Resonance: novelty minus transience.
df[f'Resonance_{w}'] = df[f'Novelty_{w}'].sub(df[f'Transience_{w}'])
df.to_csv(f'KLD/distributions_{w}.csv', index=False)
