In [42]:
import re
import numpy as np
from collections import defaultdict
from textblob import TextBlob 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from os.path import isfile, join, expanduser
import pickle
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import AutoDateFormatter, AutoDateLocator
from gensim import corpora, models, utils
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaMallet
import pyLDAvis
import gzip
import os
import pandas as pd
import sklearn.preprocessing

### Constants

In [43]:
num_topics = 55 # one model at a time

In [44]:
sns.set(style="darkgrid")
std_date_format = '%Y-%m-%d'    

### Helper methods

In [45]:
dataDir = ""

def extract_params(statefile):
    """Extract the alpha and beta values from the statefile.

    Only the first three lines of the gzipped file are read (a header line,
    the alpha line and the beta line), so the token-level body of the state
    file is never decompressed into memory.

    Args:
        statefile (str): Path to statefile produced by MALLET.
    Returns:
        tuple: ``(alpha, beta)``. ``alpha`` is the list of raw string tokens
        found after the colon on the second line, split on single spaces (so
        it begins with an empty string when a space follows the colon).
        ``beta`` is the float found after the colon on the third line.
    Raises:
        IndexError: if the second or third line is missing or has no colon.
        ValueError: if the beta value cannot be parsed as a float.
    """
    with gzip.open(statefile, 'rb') as state:
        state.readline()  # first line is not a parameter line; skip it
        alpha_line = state.readline().decode('utf8').strip()
        beta_line = state.readline().decode('utf8').strip()
    alpha = alpha_line.split(":")[1].split(" ")
    beta = float(beta_line.split(":")[1])
    return (alpha, beta)


def state_to_df(statefile):
    """Load a MALLET state file into a pandas dataframe.

    The file is read as gzip-compressed, space-separated text. The first line
    supplies the column names; the two lines after it (the alpha and beta
    hyperparameters) are skipped.

    Args:
        statefile (str): Path to statefile produced by MALLET.
    Returns:
        dataframe: topic assignment for each token in each document of the model
    """
    read_options = {
        'compression': 'gzip',
        'sep': ' ',
        'skiprows': [1, 2],  # hyperparameter lines that follow the header
    }
    return pd.read_csv(statefile, **read_options)

In [46]:
# Topic-term matrix from state file
# https://ldavis.cpsievert.me/reviews/reviews.html

import sklearn.preprocessing

def pivot_and_smooth(df, smooth_value, rows_variable, cols_variable, values_variable):
    """
    Pivot a long-form count table into a smoothed, row-normalized matrix.

    Args:
        df (dataframe): aggregated dataframe with one row per (row, column) pair
        smooth_value (float): value added to every cell to account for the priors
            (anything numpy can broadcast against the pivoted matrix)
        rows_variable (str): name of dataframe column to use as the rows in the matrix
        cols_variable (str): name of dataframe column to use as the columns in the matrix
        values_variable(str): name of the dataframe column to use as the values in the matrix
    Returns:
        dataframe: matrix whose rows have been L1-normalized. The row and
        column labels of the pivot are not carried over; the result uses
        default integer labels.
    """
    counts = df.pivot(index=rows_variable, columns=cols_variable, values=values_variable)
    # Pairs absent from the long table count as zero before the prior is added.
    smoothed = counts.fillna(value=0).values + smooth_value

    return pd.DataFrame(
        sklearn.preprocessing.normalize(smoothed, norm='l1', axis=1)
    )

## Main code

In [47]:
# load the model
lda_model = LdaMallet.load("lda_model_"+str(num_topics)+"_topics_BUS_only.sav")
lda_model_state_file_path = "lda_model_"+str(num_topics)+"_topics_file_BUS_only.gzip"

In [48]:
topic_table = lda_model.show_topics(num_topics, 15)

Fix topic indices to start from 1

In [49]:
# Map each 1-based topic number to its description: the first element of every
# topic_table entry is shifted up by one and used as the key, the second
# element is stored as the value.
topic_dict = {entry[0] + 1: entry[1] for entry in topic_table}

### Fix data for pyLDAVis

In [50]:
# Read the alpha/beta hyperparameters stored at the top of the MALLET state file.
params = extract_params(os.path.join(dataDir, lda_model_state_file_path))
# Drop the first token of the alpha list before converting to floats
# (extract_params splits on single spaces, so the first token is presumably the
# empty string left by the space after the colon — TODO confirm).
alpha = [float(x) for x in params[0][1:]]
beta = params[1]
#print("{}, {}".format(alpha, beta))

# Token-level table from the state file; every 'type' value is cast to str.
df = state_to_df(os.path.join(dataDir, lda_model_state_file_path))
df['type'] = df.type.astype(str)
#df[:10]

# Get document lengths from statefile: number of rows per '#doc' value.
docs = df.groupby('#doc')['type'].count().reset_index(name ='doc_length')
#docs[:10]

# Get vocab and term frequencies from statefile, ordered by term.
# NOTE(review): the ordering presumably has to match the column order of the
# pivoted phi matrix below, whose labels are discarded by pivot_and_smooth — confirm.
vocab = df['type'].value_counts().reset_index()
vocab.columns = ['type', 'term_freq']
vocab = vocab.sort_values(by='type', ascending=True)
#vocab[:10]

# Number of rows per (topic, term) pair.
phi_df = df.groupby(['topic', 'type'])['type'].count().reset_index(name ='token_count')
phi_df = phi_df.sort_values(by='type', ascending=True)
#phi_df[:10]

# Topic-term matrix: beta added to every cell, then each topic row normalized.
phi = pivot_and_smooth(phi_df, beta, 'topic', 'type', 'token_count')
#phi[:10]

# Number of rows per (document, topic) pair.
theta_df = df.groupby(['#doc', 'topic'])['topic'].count().reset_index(name ='topic_count')
theta_df[:10]  # leftover preview; not the last expression of the cell, so nothing is displayed
theta_df= theta_df.sort_values("topic", ascending=True)

# Document-topic matrix: the alpha list is added to every document row
# (one value per topic column), then each document row is normalized.
theta = pivot_and_smooth(theta_df, alpha , '#doc', 'topic', 'topic_count')
#theta[:10]

### pyLDAVis Visualization

In [51]:
data = {'topic_term_dists': phi, 
        'doc_topic_dists': theta,
        'doc_lengths': list(docs['doc_length']),
        'vocab': list(vocab['type']),
        'term_frequency': list(vocab['term_freq'])
       }

Confirmed that this document looks ok; why are the topic #'s not corresponding in MDS?

I used the code examples I mentioned earlier for setting the keys of the data dictionary. Keeping the data in sorted dataframes helps ensure that the order is consistent and preserved as it is moved into a list for analysis.

This data is then passed to the visualization library, first for preparation and then for display. In preparing the data, the library computes the distances between topics and then projects those distances into a two-dimensional space using "multidimensional scaling" or principal component analysis. In the resulting visualization, the overlap in words of the topics is represented by shared location in space, and the greater the distance between topics, the larger the dissimilarity between the weights of the words that comprise them. The second element that the library computes is the most relevant terms for each topic, with a sliding scale that ranks either on the overall frequency of the word (the default) or on the distinctiveness of the word (lambda = 0.0 on the resulting slider). This provides a richer view of the topic assignments and is useful in labeling and in distinguishing between topics.

In [52]:
vis_data = pyLDAvis.prepare(**data, sort_topics=False)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [53]:
pyLDAvis.display(vis_data)

In [54]:
# save to HTML
# Open the output file in a context manager so it is flushed and closed as soon
# as the visualization has been written (the handle was previously never closed).
with open("AOM_annals_pyldavis_"+str(num_topics)+"_topics_BUS_only.html", "w") as html_file:
    pyLDAvis.save_html(vis_data, html_file)

### Now, processing topic-document matrix

Note: for simplicity, starting indexing at 1 (not 0) for topics

In [55]:
# load the df article list (ensure the right file is being used here)
df_article_list = pd.read_csv("article_list_cleaned_BUS_only.csv")

In [56]:
# Intent (original note): df_article_list index will start from 1, not 0.
# NOTE(review): range(1, n) yields the n-1 labels 1..n-1, and DataFrame.reindex
# selects rows by their existing labels rather than renumbering them. With the
# default 0-based index from read_csv this looks like it drops the row labeled 0
# instead of shifting every label up by one — confirm this is intended.
# The dtm frame is reindexed the same way further down, so any change has to be
# made to both frames together or their rows will no longer line up.
len_df_article_list = len(df_article_list)
new_index_df_article_list = [i for i in range(1,len_df_article_list)]
df_article_list = df_article_list.reindex(new_index_df_article_list)

In [57]:
# going with best topic model as lda_model = models[j]
j = num_topics
document_topics = lda_model.read_doctopics("lda_model_"+str(num_topics)+"_doc_topics_file_BUS_only.txt")
topic_docs_matrix = list(document_topics)

# Each document is a list of (topic_number, weight) tuples. Key every weight by
# its topic number instead of by its position in the list, so that a document
# with a missing entry (e.g. one the reader left out) cannot shift the
# remaining weights into the wrong topic columns.
topic_docs_by_id = [dict(doc) for doc in topic_docs_matrix]
dtm = pd.DataFrame(topic_docs_by_id)

# Fail loudly rather than silently dropping data if the file uses topic numbers
# outside the expected 0..j-1 range.
unexpected_topics = set(dtm.columns) - set(range(j))
if unexpected_topics:
    raise ValueError("unexpected topic numbers in doc-topics file: %s" % sorted(unexpected_topics))

# One column per topic in numeric order; topics absent from a document get weight 0.
dtm = dtm.reindex(columns=range(j)).fillna(0.0)

# Column names use 1-based topic numbers: topic_1 .. topic_j.
new_column_names = [ "topic_"+str(i+1) for i in range(0,j)]
dtm.columns = new_column_names

In [58]:
# Intent (original note): dtm index will start from 1, not 0.
# NOTE(review): range(1, lenDtm) yields the lenDtm-1 labels 1..lenDtm-1. The next
# cell passes this list to dtm.reindex, which selects rows by existing label
# rather than renumbering, so this looks like it drops the row labeled 0 —
# confirm this is intended. df_article_list is treated the same way above, so
# the two frames must be changed together to stay aligned.
lenDtm = len(dtm)
new_index = [i for i in range(1,lenDtm)]

In [59]:
dtm = dtm.reindex(new_index)

In [60]:
dtm.head(2)

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_46,topic_47,topic_48,topic_49,topic_50,topic_51,topic_52,topic_53,topic_54,topic_55
1,0.005543,0.005543,0.005543,0.011641,0.060421,0.072616,0.005543,0.005543,0.011641,0.036031,...,0.011641,0.005543,0.005543,0.023836,0.005543,0.005543,0.023836,0.011641,0.011641,0.011641
2,0.012987,0.012987,0.006184,0.006184,0.006184,0.006184,0.006184,0.006184,0.012987,0.006184,...,0.01979,0.074212,0.01979,0.006184,0.006184,0.006184,0.012987,0.006184,0.026592,0.026592


In [61]:
# Document-topic table: a copy of dtm with every weight rounded to two decimals,
# keeping dtm's row and column labels.
df_document_topic = dtm.round(2)

In [62]:
dominant_topic = np.argmax(dtm.values, axis=1)

In [63]:
df_document_topic['dominant_topic'] = dominant_topic+1 # using index starting from 1

In [64]:
df_document_topic.head(2)

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_47,topic_48,topic_49,topic_50,topic_51,topic_52,topic_53,topic_54,topic_55,dominant_topic
1,0.01,0.01,0.01,0.01,0.06,0.07,0.01,0.01,0.01,0.04,...,0.01,0.01,0.02,0.01,0.01,0.02,0.01,0.01,0.01,39
2,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.07,0.02,0.01,0.01,0.01,0.01,0.01,0.03,0.03,33


In [65]:
# joining dtm and abstract_and_titles_df
#df_article_list_slim = df_article_list.drop(columns=['document'])
df_document_topic_full = pd.concat([df_document_topic, df_article_list], axis=1)

Now, combine with article dataframe; note: this could be merged with a more full dataframe of document metadata

In [66]:
df_document_topic_full = df_document_topic_full.drop(columns=["Unnamed: 0"])

In [67]:
df_document_topic_full.rename(columns={'id': 'article_id'}, inplace=True)

In [68]:
df_document_topic = df_document_topic_full # better name to type with

In [69]:
df_document_topic.head(2)

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_53,topic_54,topic_55,dominant_topic,abstract,date,article_id,publicationName,title,abstract_cleaned
1,0.01,0.01,0.01,0.01,0.06,0.07,0.01,0.01,0.01,0.04,...,0.01,0.01,0.01,39,The rapid growth of the Chinese economy has re...,2018-11-20,2-s2.0-85053080182,Journal of Cleaner Production,Heterogeneity evaluation of China's provincial...,The rapid growth of the Chinese economy has re...
2,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.03,0.03,33,The aim of this study is to introduce a decisi...,2018-11-10,2-s2.0-85007493460,Total Quality Management and Business Excellence,Topic modelling-based decision framework for a...,The aim of this study is to introduce a decisi...


#### Manual validation using the Jha & Beckman 2015 paper

In [70]:
df_document_topic.iloc[98]

topic_1                                                          0.01
topic_2                                                          0.01
topic_3                                                          0.01
topic_4                                                          0.01
topic_5                                                          0.03
topic_6                                                          0.01
topic_7                                                          0.02
topic_8                                                          0.01
topic_9                                                          0.01
topic_10                                                         0.01
topic_11                                                         0.02
topic_12                                                         0.01
topic_13                                                         0.01
topic_14                                                         0.01
topic_15            

In [71]:
topic_dict[48]

'0.076*"type" + 0.070*"risk" + 0.058*"organizational" + 0.048*"find" + 0.042*"textual" + 0.039*"form" + 0.033*"call" + 0.030*"increase" + 0.027*"disclosure" + 0.027*"identity" + 0.027*"perception" + 0.024*"analyst" + 0.024*"show" + 0.021*"conference" + 0.021*"report"'

In [72]:
# confirmed that Harsh's paper is correctly indexed

In [73]:
df_document_topic['index_pos'] = df_document_topic.index

In [74]:
df_document_topic_output = df_document_topic.copy()
df_document_topic_output.index=df_document_topic_output['article_id']

In [75]:
df_document_topic_output.to_csv("df_document_topic_"+str(num_topics)+"topics_BUS_only.csv")

#### Review topics distribution across documents as a way of determining the category list

In [76]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution[:10]

Unnamed: 0,Topic Num,Num Documents
0,40,8
1,2,7
2,17,7
3,39,7
4,29,7
5,30,7
6,8,6
7,18,6
8,33,6
9,26,6


Dominant topics overall were 40, 2, 17, 39, 29, 30

In [77]:
def returnTop_articles_featuring_topic(topic_id, df):
    """Return the articles dominated by a topic, strongest first.

    Args:
        topic_id (int): 1-based topic number.
        df (dataframe): document-topic table with a 'dominant_topic' column
            and one 'topic_<n>' weight column per topic.
    Returns:
        dataframe: rows of ``df`` whose 'dominant_topic' equals ``topic_id``,
        sorted by that topic's weight column in descending order.
    """
    is_dominant = df['dominant_topic'] == topic_id
    weight_column = "topic_" + str(topic_id)
    return df[is_dominant].sort_values(weight_column, ascending=False)

##### Now looking at most dominant topics

### Topic 40

In [110]:
topic_id = 40
topic_dict[topic_id]

'0.118*"network" + 0.094*"community" + 0.040*"correlation" + 0.040*"factor" + 0.033*"matrix" + 0.028*"similar" + 0.028*"factorization" + 0.026*"capture" + 0.021*"preference" + 0.019*"friend" + 0.017*"effect" + 0.017*"people" + 0.017*"discovering" + 0.014*"structured" + 0.014*"prediction"'

In [111]:
docs_featuring_topic = returnTop_articles_featuring_topic(topic_id, df_document_topic)
for i,row in docs_featuring_topic[0:3].iterrows():
    print ('Article title:',row['title'],'\nAbstract:',row['abstract_cleaned']+"\n------\n")

Article title: Improving user recommendation by extracting social topics and interest topics of users in uni-directional social networks 
Abstract: With the rapid growth of population on social networks, people are confronted with information overload problem. This clearly makes filtering the targeted users a demanding and key research task. Uni-directional social networks are the scenarios where users provide limited follow or not binary features. Related works prefer to utilize these follower-followee relations for recommendation. However, a major problem of these methods is that they assume every follower-followee user pairs are equally likely, and this leads to the coarse user following preferences inferring. Intuitively, a user's adoption of others as followees may be motivated by her interests as well as social connections, hence a good recommender should be able to separate the two situations and take both factors into account for better recommendation results. In this regard, w

##### This is topic modeling customer product reviews; opinion mining

### Topic 2

In [113]:
topic_id = 2
topic_dict[topic_id]

'0.157*"sentiment" + 0.093*"aspect" + 0.084*"review" + 0.046*"product" + 0.035*"word" + 0.031*"opinion" + 0.029*"model" + 0.024*"simultaneously" + 0.022*"joint" + 0.020*"mining" + 0.020*"existing" + 0.015*"component" + 0.013*"summary" + 0.013*"sentence" + 0.013*"ws-tswe"'

In [114]:
docs_featuring_topic = returnTop_articles_featuring_topic(topic_id, df_document_topic)
for i,row in docs_featuring_topic[0:3].iterrows():
    print ('Article title:',row['title'],'\nAbstract:',row['abstract_cleaned']+"\n------\n")

Article title: Sentence retrieval with sentiment-specific topical anchoring for review summarization 
Abstract: We propose Topic Anchoring-based Review Summarization (TARS), a two-step extractive summarization method, which creates review summaries from the sentences that represent the most important aspects of a review. In the first step, the proposed method utilizes Topic Aspect Sentiment Model (TASM), a novel sentiment-topic model, to identify aspects of sentiment-specific topics in a collection of reviews. The output of TASM is utilized in the second step of TARS to rank review sentences based on how representative of the most important review aspects their words are. Qualitative and quantitative evaluation of review summaries using two collections indicate the effectiveness of structuring review summaries around aspects of sentiment-specific topics.
------

Article title: Weakly supervised topic sentiment joint model with word embeddings 
Abstract: Topic sentiment joint model aims

##### This is sentiment analysis on product reviews

### Topic 17

In [125]:
topic_id = 17
topic_dict[topic_id]

'0.300*"social" + 0.135*"medium" + 0.114*"twitter" + 0.051*"message" + 0.034*"tweet" + 0.025*"audience" + 0.023*"specific" + 0.023*"network" + 0.019*"large-scale" + 0.017*"activity" + 0.017*"diffusion" + 0.015*"lead" + 0.013*"share" + 0.011*"determine" + 0.011*"engagement"'

In [126]:
docs_featuring_topic = returnTop_articles_featuring_topic(topic_id, df_document_topic)
for i,row in docs_featuring_topic[0:3].iterrows():
    print ('Article title:',row['title'],'\nAbstract:',row['abstract_cleaned']+"\n------\n")

Article title: Relationship between audience engagement on social media and broadcast media ratings 
Abstract: People often share their opinions or impressions about TV shows (e.g., dramas) with other viewers through social media such as personal blogs and Twitter. As such, broadcast media, especially TV, lead to audience engagement on social media. Moreover, the audience engagement, in turn, impacts broadcast media ratings. Social TV analyzes audience's TV-related social media behaviors and tries to use the behaviors in marketing activities such as advertisement; however, this is purely based on the quantity o f engagement in social media. In this study, we analyze the subjects of the audience engagement on social media about specific TV dramas through topic modeling, and examines the relationship between changes in the topics and viewer ratings of the TV dramas.
------

Article title: Identifying topical influencers on twitter based on user behavior and network topology 
Abstract: So

##### This is breaking down audience structure on social media

### Topic 39

In [120]:
topic_id = 39
topic_dict[topic_id]

'0.180*"technology" + 0.038*"transfer" + 0.032*"emergence" + 0.032*"technological" + 0.030*"association" + 0.030*"pattern" + 0.024*"china" + 0.024*"alternative" + 0.022*"track" + 0.022*"suggestion" + 0.022*"complex" + 0.019*"count" + 0.019*"technical" + 0.019*"past" + 0.016*"innovation"'

In [122]:
docs_featuring_topic = returnTop_articles_featuring_topic(topic_id, df_document_topic)
for i,row in docs_featuring_topic[0:3].iterrows():
    print ('Article title:',row['title'],'\nAbstract:',row['abstract_cleaned']+"\n------\n")

Article title: Heterogeneity evaluation of China's provincial energy technology based on large-scale technical text data mining 
Abstract: The rapid growth of the Chinese economy has resulted in great pressure on the environment: technological innovation is the fundamental pathway for improvement the efficiency in the process of energy saving and emission reduction. Based on large-scale technical text data in 31 Chinese provinces from 1985 to 2017, the Latent Dirichlet Allocation (LDA) topic model is introduced to technology content analysis. Then the LDA provincial-topic model is constructed, the subject and object of energy technology are jointly modelled, and the relationship between technology subject and technology have been region studied. The energy saving and emission reduction technology research direction in 31 provinces of China in the past 30 years has been examined: the status and level of technical reserves in each province have been evaluated, and the heterogeneity of pr

##### This is using LDA for technology emergence

### Topic 29

In [131]:
topic_id = 29
topic_dict[topic_id]

'0.246*"research" + 0.095*"article" + 0.073*"journal" + 0.054*"trend" + 0.035*"year" + 0.035*"field" + 0.028*"published" + 0.022*"decade" + 0.017*"history" + 0.017*"evolution" + 0.017*"uncover" + 0.015*"discussed" + 0.013*"understand" + 0.013*"accounting" + 0.011*"scholar"'

In [132]:
docs_featuring_topic = returnTop_articles_featuring_topic(topic_id, df_document_topic)
for i,row in docs_featuring_topic[0:3].iterrows():
    print ('Article title:',row['title'],':\nAbstract:',row['abstract_cleaned']+"\n------\n")

Article title: The journal of consumer research at 40: A historical analysis :
Abstract: This article reviews 40 years of the Journal of Consumer Research (JCR). Using text mining, we uncover the key phrases associated with consumer research. We use a topic modeling procedure to uncover 16 topics that have been featured in the journal since its inception and to show the trends in topics over time. For example, we highlight the decline in family decision-making research and the flourishing of social identity and influence research since the journal’s inception. A citation analysis shows which JCR articles have had the most impact and compares the topics in top-cited articles with all JCR journal articles. We show that methodological and consumer culture articles tend to be heavily cited. We conclude by investigating the scholars who have been the top contributors to the journal across the four decades of its existence. And to better understand which schools have contributed most to the 

##### This is using LDA for literature reviews in academic journals

### Topic 30

In [129]:
topic_id = 30
topic_dict[topic_id]

'0.124*"patent" + 0.107*"knowledge" + 0.068*"study" + 0.051*"area" + 0.034*"strategy" + 0.031*"explore" + 0.023*"boundary" + 0.020*"novelty" + 0.020*"trademark" + 0.020*"theory" + 0.020*"diverse" + 0.020*"classification" + 0.017*"offer" + 0.017*"finally" + 0.017*"emerging"'

In [130]:
docs_featuring_topic = returnTop_articles_featuring_topic(topic_id, df_document_topic)
for i,row in docs_featuring_topic[0:3].iterrows():
    print ('Article title:',row['title'],':\nAbstract:',row['abstract_cleaned']+"\n------\n")

Article title: Identifying emerging Research and Business Development (R&BD) areas based on topic modeling and visualization with intellectual property right data :
Abstract: Although investments of R&D by government and firms have enlarged and the amount of patents has increased rapidly, R&D almost fails to commercialize for various reasons. For the purpose of decreasing failure rate of technology commercialization, it is important to identify emerging business based on technology in advance and establish appropriate strategy, leading to surviving at the market. Therefore, this paper aims to explore emerging Research and Business Development (R&BD) areas, and establish a business strategy based on valuable patents by comprehensively analyzing IPRs - patent as well as design and trademark. First, unrevealed but potential R&BD areas are explored by analyzing the relation between patent and trademark through topic modeling and network analysis, which aims to preferentially find potential

##### This is using LDA to study emerging knowledge areas in patents

#### What are most dominant topics in the last year? Note: this code needs to be updated

In [None]:
# convert to date object for better visualizing, sorting
df_document_topic_year_only = df_document_topic_output.copy()
df_document_topic_year_only['year'] = pd.to_datetime(df_document_topic_year_only['date']).dt.year
df_document_topic_year_only = df_document_topic_year_only.set_index(['year'])

In [None]:
df_document_topic_year_only_ = df_document_topic_year_only.copy()
df_document_topic_year_only_ = df_document_topic_year_only_.sort_values('year', ascending=True)

In [None]:
# Select each year with a one-element list of labels so the result is always a
# DataFrame. A bare scalar label returns a Series when exactly one article
# matches, which would break the ['dominant_topic'] lookups in the cells below.
df_document_topic_2018_only = df_document_topic_year_only.loc[[2018]]
df_document_topic_2013_only = df_document_topic_year_only.loc[[2013]]
df_document_topic_2009_only = df_document_topic_year_only.loc[[2009]]

In [None]:
### Review topics distribution across documents 2009
df_document_topic_2009_only = df_document_topic_2009_only['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_document_topic_2009_only.columns = ['Topic Num', 'Num Documents']
df_document_topic_2009_only[:10]

In [None]:
### Review topics distribution across documents 2013
df_document_topic_2013_only = df_document_topic_2013_only['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_document_topic_2013_only.columns = ['Topic Num', 'Num Documents']
df_document_topic_2013_only[:10]

In [None]:
### Review topics distribution across documents 2018
df_document_topic_2018_only = df_document_topic_2018_only['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_document_topic_2018_only.columns = ['Topic Num', 'Num Documents']
df_document_topic_2018_only[:10]

Dominant topics in the first week were 17, 13, 27

In [None]:
df_document_topic_2018_only['Num Documents'].plot()

Dominant topics in 2018 are 17, 13, 27, then drops off

In [None]:
topic_dict[17]

In [None]:
topic_dict[13]

In [None]:
topic_dict[27]

## Analyze topics

### Prep for visualizations

In [None]:
# df_document_topic is in wide form (one row per article, one column per topic).
# Flatten it to long form: one record per (article, topic) pair carrying
# 'article_id', 'topic_id', 'topic_weight' and 'date'.
output_list = [
    {
        'article_id': row['article_id'],
        'topic_id': topic_number,
        'topic_weight': row['topic_' + str(topic_number)],
        'date': row['date'],
    }
    for _, row in df_document_topic.iterrows()
    for topic_number in range(1, num_topics + 1)
]

In [None]:
df_long = pd.DataFrame(output_list)

In [None]:
df_long.head()

In [None]:
# convert to date object for better visualizing, sorting
df_long['date'] = pd.to_datetime(df_long['date'])

## Visualizations

In [None]:
label_size = 10
plt.rcParams['xtick.labelsize'] = label_size 

In [None]:
# Denominator for the average topic weights: the number of distinct articles
# for each value of 'date'.
articles_per_date = df_long.groupby('date')['article_id']
total_docs = articles_per_date.apply(lambda ids: len(ids.unique())).reset_index()
total_docs.columns = ['date', 'total_articles']

In [None]:
# Group by year and topic id
df_avg = df_long.groupby(['date', 'topic_id']).agg({'topic_weight': 'sum'}).reset_index()

In [None]:
# Merge dataframes
df_avg = df_avg.merge(total_docs, on="date", how="left")

In [None]:
# Compute the mean per topic
df_avg['average_weight'] = df_avg['topic_weight'] / df_avg['total_articles']

In [None]:
df_avg.head()

### Determining the top topics over the whole period, using a .05 threshold on the peak average weight

In [None]:
df_avg_top_topics = df_avg.copy()

In [None]:
df_avg_top_topics[df_avg_top_topics['topic_id']==1]['average_weight'].max()

In [None]:
# Keep only topics whose highest average weight reaches the threshold;
# every topic that never does is removed and reported.
for topicid in range(1, num_topics + 1):
    topic_rows = df_avg_top_topics['topic_id'] == topicid
    peak_weight = df_avg_top_topics.loc[topic_rows, 'average_weight'].max()
    if peak_weight < .05:
        df_avg_top_topics = df_avg_top_topics[~topic_rows]
        print("dropping topicID", topicid)

In [None]:
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only produces an empty extra figure. The intended
# 25x25 size is requested through height/aspect instead.
# Tick labels are taken from the plotted frame itself (distinct dates in order of
# appearance) so that there is exactly one label per category even when a
# particular topic has been filtered out of df_avg_top_topics.
date_labels = df_avg_top_topics["date"].dropna().drop_duplicates().dt.strftime(std_date_format)
ax = sns.catplot(x="date", y="average_weight", hue="topic_id", data=df_avg_top_topics,
                 height=25, aspect=1)
ax.set_xticklabels(date_labels, rotation='vertical')

## alternative template derivation; using average weights in 2018

In [None]:
df_avg_2018 = df_avg.copy()
df_avg_2018.head()

df_avg_2018['year'] = pd.to_datetime(df_avg_2018['date']).dt.year

df_avg_2018 = df_avg_2018.set_index('year')
df_avg_topic10 = df_avg[(df_avg['topic_id']==10)] # to be used for labels

df_avg_2018.head()

In [None]:
# Select with a one-element list so the result stays a DataFrame even when only
# a single row belongs to 2018 (a scalar label would return a Series then).
df_avg_2018 = df_avg_2018.loc[[2018]]

In [None]:
topic_names = [ "topic"+str(i) for i in range(1,num_topics+1)]

In [None]:
dates_for_ticks = df_avg_top_topics[df_avg_top_topics['topic_id']==1]['date'].dt.strftime(std_date_format)

In [None]:
# sns.factorplot has been replaced by sns.catplot (already used earlier in this
# notebook); kind="point" keeps factorplot's default plot type. Being a
# figure-level function it creates its own figure, so the 25x5 size is passed as
# height/aspect rather than through plt.figure.
# Tick labels come from the frame being plotted so they match its dates
# one-to-one (distinct dates in order of appearance).
dates_2018 = df_avg_2018['date'].dropna().drop_duplicates().dt.strftime(std_date_format)
ax = sns.catplot(x="date", y="average_weight", hue="topic_id", data=df_avg_2018,
                 kind="point", height=5, aspect=5)
ax.set_xticklabels(dates_2018, rotation='vertical')

Topic 3, 4, 30 are clearly dominant in the first 7 days of coverage

### visually graphing average topic weights

In [None]:
# Average weight of topic 13 over time.
df_avg_topic_13 = df_avg[(df_avg['topic_id']==13)]
plt.figure(figsize=(15,5))
ax = sns.lineplot(x="date", y="average_weight", data=df_avg_topic_13)
ax.set_title("Topic 13, Social entrepreneurship")
# One major tick per calendar year, labelled with the four-digit year.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
# Rotate the labels through tick_params. Assigning the raw dates with
# set_xticklabels attached labels to ticks they did not correspond to and was
# overridden by the formatter above anyway.
ax.tick_params(axis='x', labelrotation=90)

### Multiple series in the same plot

In [None]:
def slideDF_byTopicColumn(df, topic_id, label=""):
    """Extract one topic's average-weight series, indexed by date.

    Args:
        df (dataframe): long-form table with 'topic_id', 'date' and
            'average_weight' columns.
        topic_id (int): topic whose rows are kept.
        label (str): text appended to the resulting column name.
    Returns:
        dataframe: a single column named ``"topic <topic_id> <label>"`` holding
        the average weights, with the 'date' values as the index.
    """
    column_name = "topic " + str(topic_id) + " " + label
    topic_rows = df['topic_id'] == topic_id
    return (
        df.loc[topic_rows, ["date", "average_weight"]]
        .set_index('date')
        .rename(columns={"average_weight": column_name})
    )

In [None]:
# One column per topic, aligned on date, so the series can share a single plot.
data = pd.concat([slideDF_byTopicColumn(df_avg, 13, " Social entrepreneurship" ),slideDF_byTopicColumn(df_avg, 27, " Crowdsourcing" ), slideDF_byTopicColumn(df_avg, 17, " Family Business" )], axis=1)
plt.figure(figsize=(25,10))
ax = sns.lineplot(data=data)
# One major tick per calendar year, labelled with the four-digit year.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
# Rotate the labels through tick_params. Assigning the raw index values with
# set_xticklabels attached labels to ticks they did not correspond to and was
# overridden by the formatter above anyway.
ax.tick_params(axis='x', labelrotation=90)

In [None]:
# Average weight of topic 30 over time, as a single-column frame indexed by date.
data = pd.concat([slideDF_byTopicColumn(df_avg, 30, " topic 30" )], axis=1)
plt.figure(figsize=(25,10))
ax = sns.lineplot(data=data)
# One major tick per calendar year, labelled with the four-digit year.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
# Rotate the labels through tick_params. Assigning the raw index values with
# set_xticklabels attached labels to ticks they did not correspond to and was
# overridden by the formatter above anyway.
ax.tick_params(axis='x', labelrotation=90)