# Load dependancies

In [1]:
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

# topic modeling libraries
import pyLDAvis.gensim 

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# supporting libraries
import pandas as pd
import time
import pickle
import topic_modeling_v6 as tm

  from collections import Iterable
  from collections import Mapping
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
num_topics_1 = 10
num_topics_2 = 5
num_topics_3 = 3

# Get FIRST level of topics (LDA)

In [3]:
# load data for LDA
df_data = pd.read_csv("./data/train_grouped.tsv", sep="\t")
    
print(df_data.shape)
df_data.columns

(33982, 16)


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3'],
      dtype='object')

In [4]:
df_data['list_of_nouns'] = df_data['list_of_nouns'].str.lower().str[1:-1].str.split(", ")
df_data['list_of_nouns'].head()

0    [rise, economies, march, globalisation, surge,...
1    [pfizer, commitment, responsibility, drugs, ta...
2    [weeks, interest, rates, time, years, world, b...
3    [cruise, lines, wave, months, year, holidays, ...
4    [calendar, year, mood, events, consensus, resp...
Name: list_of_nouns, dtype: object

In [5]:
df_data['list_of_lemmas'] = df_data['list_of_lemmas'].str.lower().str[1:-1].str.split(", ")
df_data['list_of_lemmas'].head()

0    [rise, big, emerging, economies, steady, march...
1    [pfizer, prided, commitment, corporate, social...
2    [weeks, raised, interest, rates, time, years, ...
3    [cruise, lines, brace, wave, months, year, nea...
4    [start, calendar, year, buoyant, mood, caught,...
Name: list_of_lemmas, dtype: object

In [6]:
start_time = time.time()
df_data_1 = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                               params={"TEXT_prepared_df": df_data,
                                                       "save_LDA_dictionary_path": "./output/lda/dictionary1.pickle"
                                                       },
                                        verbose=2)
end_time = time.time()
print("Processing time in minutes:", round((end_time - start_time)/60,2))

loaded data shape: (33982, 16)

Total number of unique Lemmas:  82802

Distribution of lemmas' document counts: 
     count       mean         std  min  50%  55%  65%  75%   85%   95%    97%  \
0  82802.0  26.127642  154.452831  1.0  1.0  2.0  3.0  6.0  17.0  94.0  175.0   

     99%      max  
0  512.0  11676.0  

Deleting too frequent and too rare words...
Lemma count upper bound: 512.0
Lemma count lower bound: 3

List of words for topic modeling dictionary is reduced from 82802 to 26768
LDA dictionary file is saved to: ./output/lda/dictionary1.pickle

Number of texts processed:  33980
Number of extracted lemmas:  26768

Each text is represented by list of  26768  tuples: 
		(lemma's index in bag-of-words dictionary, lemma's term frequency)
Processing time in minutes: 0.07


In [7]:
start_time = time.time()
df_first_level = tm.train_model(model_type="LDA",
                            params={"num_topics": num_topics_1,
                                    "LDA_prepared_df": df_data_1,
                                    "LDA_dictionary_path": "./output/lda/dictionary1.pickle",
                                    "save_LDA_model_path": "./output/lda/LDA_model1"
                                    },
                               verbose=2)
end_time = time.time()
print("Processing time in minutes:", round((end_time - start_time)/60,2))

Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
loaded data shape: (33980, 18)

Creating document-term matrix for LDA...

Training LDA model with  10  topics...
LDA model file is saved to: ./output/lda/LDA_model1
Top topic indexes are selected. NOTE "-1" corresponds to top topic with probability < 20%
Processing time in minutes: 1.61


In [8]:
#value count of TOP level topics
df_first_level['first_level_topic'] = df_first_level['top_topic']
df_first_level['first_level_topic_proba'] = df_first_level['top_topic_proba']
df_first_level['first_level_topic'].value_counts().sort_index()

0    1457
1    4999
2    1962
3    3133
4    2096
5    2862
6    4345
7    6634
8    3142
9    3350
Name: first_level_topic, dtype: int64

In [9]:
df_first_level = df_first_level.drop(columns=['selected_words', 'doc2bow',
       'infered_topics', 'top_topic', 'top_topic_proba'])

***
# Get SECOND level topics (LDA)

In [10]:
first_level_topics = list(set(df_first_level['first_level_topic']))
first_level_topics

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [11]:
start = time.time()
list_dfs = []
for topic in first_level_topics:
    print("\nSelected topic index:", topic)
    df_topic = df_first_level[df_first_level['first_level_topic'] == topic].copy()
    save_dict_path = "./output/lda/dictionary1_"+str(topic+1)+".pickle"
    save_LDA_model_path = "./output/lda/LDA_model1_" + str(topic + 1)
    
    df_data_tmp = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                       params={"TEXT_prepared_df": df_topic,
                                               "save_LDA_dictionary_path": save_dict_path
                                               },
                                       verbose=1)

    df_2nd_tmp = tm.train_model(model_type="LDA",
                                params={"num_topics": num_topics_2,
                                        "LDA_prepared_df": df_data_tmp,
                                        "LDA_dictionary_path": save_dict_path,
                                        "save_LDA_model_path": save_LDA_model_path
                                        },
                                verbose=1)

    #value counts of SECOND level topics
    print("\nValue counts of SECOND level topics:")
    df_2nd_tmp['second_level_topic'] = df_2nd_tmp['top_topic']
    df_2nd_tmp['second_level_topic_proba'] = df_2nd_tmp['top_topic_proba']
    print(df_2nd_tmp['second_level_topic'].value_counts().sort_index())

    print("#"*50)
    df_2nd_tmp = df_2nd_tmp.drop(columns=['selected_words', 'doc2bow',
                                           'infered_topics', 'top_topic', 'top_topic_proba'])
    list_dfs.append(df_2nd_tmp)
finish = time.time()


Selected topic index: 0
loaded data shape: (1457, 18)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_1

Value counts of SECOND level topics:
0    247
1    319
2    235
3    297
4    359
Name: second_level_topic, dtype: int64
##################################################

Selected topic index: 1
loaded data shape: (4999, 18)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_2

Value counts of SECOND level topics:
0     743
1    1391
2     742
3    1078
4    1045
Name: second_level_topic, dtype: int64
##################################################

Selected topic index: 2
loaded data shape: (1962, 18)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_3

Value counts of SECOND level topics:
0    247
1    469
2    231
3    480
4    535
Name: second_level_topic, dtype: int64
#################

In [12]:
print("Time of gettig Second level topics in minutes:", round((finish-start)/60,2))
df_second_level = pd.concat(list_dfs)
df_second_level.columns

Time of gettig Second level topics in minutes: 8.13


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3', 'first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba'],
      dtype='object')

***
# Get THIRD level topics

In [13]:
df_second_level[['first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba']].describe()

Unnamed: 0,first_level_topic,first_level_topic_proba,second_level_topic,second_level_topic_proba
count,33979.0,33979.0,33979.0,33979.0
mean,4.967863,0.678492,2.050855,0.739948
std,2.773743,0.195395,1.403456,0.19557
min,0.0,0.213854,0.0,0.242436
25%,3.0,0.519647,1.0,0.568842
50%,6.0,0.659102,2.0,0.733796
75%,7.0,0.849835,3.0,0.966375
max,9.0,0.990098,4.0,0.991709


In [14]:
start = time.time()
list_dfs = []

for topic_1st in first_level_topics:
    print("\nSelected FIRST level topic index:",topic_1st)
    df_1st_tmp = df_second_level[df_second_level['first_level_topic'] == topic_1st].copy()
    second_level_topics = list(set(df_1st_tmp['second_level_topic']))
    print("second_level_topics", second_level_topics)
    
    for topic_2nd in second_level_topics:
        print("\nSelected topics' indexes:", (topic_1st, topic_2nd))
        
        save_dict_path = "./output/lda/dictionary1_"+str(topic_1st+1)+"_"+str(topic_2nd+1)+".pickle"
        save_LDA_model_path = "./output/lda/LDA_model1_"+str(topic_1st+1)+"_"+str(topic_2nd+1)
        
        df_2nd_tmp = df_1st_tmp[df_1st_tmp['second_level_topic'] == topic_2nd].copy()
        
        df_data_tmp = tm.prepare_for_modeling(data_path="", model_type="LDA",
                                           params={"TEXT_prepared_df": df_2nd_tmp,
                                                   "save_LDA_dictionary_path": save_dict_path
                                                   },
                                           verbose=1)

        df_3d_tmp = tm.train_model(model_type="LDA",
                                    params={"num_topics": num_topics_3,
                                            "LDA_prepared_df": df_data_tmp,
                                            "LDA_dictionary_path": save_dict_path,
                                            "save_LDA_model_path": save_LDA_model_path,
                                            },
                                    verbose=1)

        #value counts of SECOND level topics
        print("\nValue counts of SECOND level topics:")
        df_3d_tmp['third_level_topic'] = df_3d_tmp['top_topic']
        df_3d_tmp['third_level_topic_proba'] = df_3d_tmp['top_topic_proba']
        print(df_3d_tmp['second_level_topic'].value_counts().sort_index())

        print("#"*50)
        df_3d_tmp = df_3d_tmp.drop(columns=['selected_words', 'doc2bow',
                                               'infered_topics', 'top_topic', 'top_topic_proba'])
        list_dfs.append(df_3d_tmp)
finish = time.time()


Selected FIRST level topic index: 0
second_level_topics [0, 1, 2, 3, 4]

Selected topics' indexes: (0, 0)
loaded data shape: (247, 20)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_1_1

Value counts of SECOND level topics:
0    247
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (0, 1)
loaded data shape: (319, 20)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_1_2

Value counts of SECOND level topics:
1    319
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (0, 2)
loaded data shape: (235, 20)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_1_3

Value counts of SECOND level topics:
2    235
Name: second_level_topic, dtype: int64
###########################

Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_6_1

Value counts of SECOND level topics:
0    784
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (5, 1)
loaded data shape: (479, 20)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_6_2

Value counts of SECOND level topics:
1    479
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (5, 2)
loaded data shape: (493, 20)
Training LDA with only lemmas of NOUNs, VERBs, ADJs and ADVs
LDA model file is saved to: ./output/lda/LDA_model1_6_3

Value counts of SECOND level topics:
2    493
Name: second_level_topic, dtype: int64
##################################################

Selected topics' indexes: (5, 3)
loaded data shape: (609, 20)
Training LDA with only lemmas of NOUNs, VERBs, AD

In [15]:
print("Time of gettig Third level topics in minutes:", round((finish-start)/60,2))
df_third_level = pd.concat(list_dfs)
df_third_level.columns

Time of gettig Third level topics in minutes: 11.38


Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3', 'first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba', 'third_level_topic',
       'third_level_topic_proba'],
      dtype='object')

# Evaluate 

In [16]:
df_result = df_third_level.copy()
df_result[['first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba', 'third_level_topic',
       'third_level_topic_proba']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
first_level_topic,33979.0,4.967863,2.773743,0.0,3.0,6.0,7.0,9.0
first_level_topic_proba,33979.0,0.678492,0.195395,0.213854,0.519647,0.659102,0.849835,0.990098
second_level_topic,33979.0,2.050855,1.403456,0.0,1.0,2.0,3.0,4.0
second_level_topic_proba,33979.0,0.739948,0.19557,0.242436,0.568842,0.733796,0.966375,0.991709
third_level_topic,33979.0,1.018953,0.813496,0.0,0.0,1.0,2.0,2.0
third_level_topic_proba,33979.0,0.819664,0.181521,0.336766,0.655216,0.909433,0.980982,0.991845


In [17]:
df_result['second_level_topic'] = df_result['first_level_topic'].apply(str) + "." +\
                                  df_result['second_level_topic'].apply(str)
df_result['third_level_topic'] = df_result['second_level_topic'].apply(str) + "." +\
                                  df_result['third_level_topic'].apply(str)
df_result[['second_level_topic','third_level_topic']].iloc[::1000].head()

Unnamed: 0,second_level_topic,third_level_topic
221,0.0,0.0.1
24575,0.3,0.3.1
20545,1.0,1.0.0
11355,1.1,1.1.0
23974,1.2,1.2.1


In [18]:
print("Number of first level clusters per publication section:")
pd.DataFrame(df_result.groupby('section')['first_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of first level clusters per publication section:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
first_level_topic,14.0,8.357143,3.387923,1.0,2.2,10.0,10.0,10.0,10.0,10.0


In [19]:
df_result['section'].value_counts()

health                   8237
business                 6930
culture                  3246
science                  2910
tech                     2527
gear                     2108
security                 1840
transportation           1666
finance-and-economics    1648
Space                    1641
Health                   1193
movies                     31
style                       1
music                       1
Name: section, dtype: int64

In [20]:
#test single section
section = "health"
df_result[df_result['section'] == section][['first_level_topic',
                                            'second_level_topic',
                                            'third_level_topic']].describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])

Unnamed: 0,first_level_topic
count,8237.0
mean,4.748938
std,2.110383
min,0.0
10%,2.0
25%,4.0
50%,5.0
75%,6.0
90%,7.0
max,9.0


In [21]:
print("Number of first level clusters per 30% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_1')['first_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of first level clusters per 30% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
first_level_topic,571.0,3.565674,2.878318,1.0,1.0,1.0,2.0,5.0,9.0,10.0


In [22]:
print("Number of second level clusters per 50% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_2')['second_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of second level clusters per 50% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
second_level_topic,6767.0,2.274716,2.984201,1.0,1.0,1.0,1.0,2.0,5.0,41.0


In [23]:
print("Number of third level clusters per 70% semantic similarity group:")
pd.DataFrame(df_result.groupby('group_level_3')['third_level_topic'].nunique().describe(percentiles=[0.1,
                                                                                   0.25,0.5,0.75,0.9])).T

Number of third level clusters per 70% semantic similarity group:


Unnamed: 0,count,mean,std,min,10%,25%,50%,75%,90%,max
third_level_topic,22065.0,1.257738,0.921828,1.0,1.0,1.0,1.0,1.0,2.0,27.0


In [24]:
with open('./output/df_result.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(df_result, f, pickle.HIGHEST_PROTOCOL)

# Name Topics 

In [6]:
# load df_result
with open('./output/df_result.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    df_result = pickle.load(f)

In [7]:
df_result.head().T

Unnamed: 0,221,288,289,306,448
date,2016-04-09 00:00:00,2016-05-12 00:00:00,2016-05-12 00:00:00,2016-05-19 00:00:00,2016-07-23 00:00:00
author,,,,,
title,Dumping and tub-thumping - Free exchange,Snappy dressers - Crocodile farming,Snuffed out - Tobacco firms,The leeward side of fortune - Pacific economies,Silicon Valley 1.0 - Schumpeter
url,https://www.economist.com/finance-and-economic...,https://www.economist.com/business/2016/05/12/...,https://www.economist.com/business/2016/05/12/...,https://www.economist.com/finance-and-economic...,https://www.economist.com/business/2016/07/23/...
section,finance-and-economics,business,business,finance-and-economics,business
publication,Economist,Economist,Economist,Economist,Economist
first_10_sents,IT WAS a flood of cheap steel from an intimida...,"SOME 30,000 crocodiles bask at Izintaba, a far...",THE interests of cigarettemakers and regulator...,THE phrase Pacific island conjures images of w...,WHEN the Republican Party decided to hold its ...
list_of_first_10_sents,['IT WAS a flood of cheap steel from an intimi...,"['SOME 30,000 crocodiles bask at Izintaba, a f...",['THE interests of cigarettemakers and regulat...,['THE phrase Pacific island conjures images of...,['WHEN the Republican Party decided to hold it...
list_of_verb_lemmas,"[intimidating, prompted, angered, soaring, imp...","[bask, sprawled, Sold, watch, fetch, requires,...","[align, came, announced, expanding, grew, acco...","[conjures, served, halved, run, contend, risin...","[decided, hold, dreamed, suited, think, illust..."
noun_phrases,"['flood', 'cheap steel', 'intimidating new eco...","['Izintaba', 'farm', 'acre', 'South African ci...","['interest', 'cigarettemaker', 'regulator', 'd...","['phrase Pacific island conjures image', 'whit...","['Republican Party', 'national convention', 'C..."


In [8]:
#get first level topic names
df = df_result.copy()

LDA_model_path = "./output/lda/LDA_model1"
num_topics = num_topics_1
df['first_level_topic_name'] = tm.get_topic_names(df, 'first_level_topic', 'list_of_nouns',
                                                 LDA_model_path, num_topics, num_words = 50)
df['first_level_topic_name'].value_counts()

Music      6633
Hackers    4999
Flu        4345
Rocket     3350
Fund       3142
Videos     3133
Drivers    2862
Measles    2096
Gun        1962
Tobacco    1457
Name: first_level_topic_name, dtype: int64

In [9]:
list_dfs = []
for topic_1st in range(num_topics_1):
    print("\nSelected FIRST level topic index:",topic_1st)
    df_1st_tmp = df[df['first_level_topic'] == topic_1st].copy()
    LDA_model_path = "./output/lda/LDA_model1_"+str(topic_1st+1)

    df_1st_tmp['second_level_topic_name'] = tm.get_topic_names(df_1st_tmp, 
                                                                   'second_level_topic', 
                                                                   'list_of_nouns',
                                                               LDA_model_path, num_topics_2, num_words = 50)

    #value counts of SECOND level topics
    print("\nValue counts of SECOND level topics:")
    print(df_1st_tmp['second_level_topic_name'].value_counts().sort_index())
    print("#"*50)
    list_dfs.append(df_1st_tmp)
df_2_named = pd.concat(list_dfs)
df_2_named.columns


Selected FIRST level topic index: 0

Value counts of SECOND level topics:
Abortions     319
Beer          247
Chemicals     359
Neutrality    297
Recall        235
Name: second_level_topic_name, dtype: int64
##################################################

Selected FIRST level topic index: 1

Value counts of SECOND level topics:
Ads                1078
Documents          1045
Exports             743
Rates              1391
Vulnerabilities     742
Name: second_level_topic_name, dtype: int64
##################################################

Selected FIRST level topic index: 2

Value counts of SECOND level topics:
Coal           480
Encryption     469
Shootings      535
Transgender    247
Workers        231
Name: second_level_topic_name, dtype: int64
##################################################

Selected FIRST level topic index: 3

Value counts of SECOND level topics:
Ads       875
Board     560
Deals     384
Stores    619
Union     695
Name: second_level_topic_name, dtype: in

Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3', 'first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba', 'third_level_topic',
       'third_level_topic_proba', 'first_level_topic_name',
       'second_level_topic_name'],
      dtype='object')

In [10]:
list_dfs = []
second_level_topics = list(set(df['second_level_topic']))
for topic_2nd in second_level_topics:
    print("\nSelected SECOND level topic index:",topic_2nd)
    df_2nd_tmp = df_2_named[df_2_named['second_level_topic'] == topic_2nd].copy()
    LDA_model_path = "./output/lda/LDA_model1_"+str(int(topic_2nd[0])+1)+\
                                                "_"+str(int(topic_2nd[-1])+1)

    df_2nd_tmp['third_level_topic_name'] = tm.get_topic_names(df_2nd_tmp, 
                                                                   'third_level_topic', 
                                                                   'list_of_nouns',
                                                               LDA_model_path, num_topics_3, num_words = 50)

    #value counts of THIRD level topics
    print("\nValue counts of THIRD level topics:")
    print(df_2nd_tmp['third_level_topic_name'].value_counts().sort_index())
    print("#"*50)
    list_dfs.append(df_2nd_tmp)
df_3_named = pd.concat(list_dfs)
df_3_named.columns


Selected SECOND level topic index: 5.2

Value counts of THIRD level topics:
Boys        145
School      177
Scooters    171
Name: third_level_topic_name, dtype: int64
##################################################

Selected SECOND level topic index: 9.3

Value counts of THIRD level topics:
Exoplanet    294
Rings        303
Rover        337
Name: third_level_topic_name, dtype: int64
##################################################

Selected SECOND level topic index: 8.0

Value counts of THIRD level topics:
Bankruptcy    176
Brands        248
Card          182
Name: third_level_topic_name, dtype: int64
##################################################

Selected SECOND level topic index: 0.3

Value counts of THIRD level topics:
Cannabis    132
Lung         56
Women       109
Name: third_level_topic_name, dtype: int64
##################################################

Selected SECOND level topic index: 7.2

Value counts of THIRD level topics:
Bike        321
Children    278
Episod


Value counts of THIRD level topics:
Concussion    119
Dementia       77
Ear           130
Name: third_level_topic_name, dtype: int64
##################################################

Selected SECOND level topic index: 1.3

Value counts of THIRD level topics:
App              281
Campaigns        404
Investigation    393
Name: third_level_topic_name, dtype: int64
##################################################

Selected SECOND level topic index: 2.4

Value counts of THIRD level topics:
Fiction    162
Stores     158
Student    215
Name: third_level_topic_name, dtype: int64
##################################################

Selected SECOND level topic index: 7.3

Value counts of THIRD level topics:
Console    524
Network    473
Stores     590
Name: third_level_topic_name, dtype: int64
##################################################

Selected SECOND level topic index: 2.1

Value counts of THIRD level topics:
Attack    142
Gun       117
Women     210
Name: third_level_topic_name, 

Index(['date', 'author', 'title', 'url', 'section', 'publication',
       'first_10_sents', 'list_of_first_10_sents', 'list_of_verb_lemmas',
       'noun_phrases', 'list_of_nouns', 'list_of_lemmas', 'ID',
       'group_level_1', 'group_level_2', 'group_level_3', 'first_level_topic',
       'first_level_topic_proba', 'second_level_topic',
       'second_level_topic_proba', 'third_level_topic',
       'third_level_topic_proba', 'first_level_topic_name',
       'second_level_topic_name', 'third_level_topic_name'],
      dtype='object')

In [11]:
df_3_named[['publication', 
    'section',
    'first_level_topic','first_level_topic_name',
    'second_level_topic','second_level_topic_name',
    'third_level_topic', 'third_level_topic_name'
   ]].iloc[::1000].head(10).T

Unnamed: 0,278,14367,32118,25058,19153,7030,739,26492,10627,23541
publication,Economist,Gizmodo,CNN,Wired,Wired,CNN,Economist,Wired,CNN,Wired
section,finance-and-economics,Space,tech,transportation,culture,health,finance-and-economics,science,health,science
first_level_topic,5,9,8,7,9,6,3,9,4,6
first_level_topic_name,Drivers,Rocket,Fund,Music,Rocket,Flu,Videos,Rocket,Measles,Flu
second_level_topic,5.2,9.3,8.0,7.2,9.4,6.0,3.0,9.1,4.4,6.3
second_level_topic_name,Transit,Comet,Film,Mph,Galaxies,Sugar,Board,Internet,Fentanyl,Flu
third_level_topic,5.2.2,9.3.1,8.0.0,7.2.0,9.4.0,6.0.0,3.0.2,9.1.0,4.4.1,6.3.0
third_level_topic_name,Scooters,Rings,Brands,Episode,Wave,Meat,Harassment,Fires,Study,Mosquitoes


In [32]:
df_topics = df_3_named[[
    'first_level_topic','first_level_topic_name',
    'second_level_topic','second_level_topic_name',
    'third_level_topic', 'third_level_topic_name'
   ]].copy()
df_topics = df_topics.drop_duplicates()
print(df_topics.shape)
df_topics.head().T

(150, 6)


Unnamed: 0,30,328,496,159,1820
first_level_topic,5,5,5,2,2
first_level_topic_name,Drivers,Drivers,Drivers,Gun,Gun
second_level_topic,5.3,5.3,5.3,2.0,2.0
second_level_topic_name,Jets,Jets,Jets,Transgender,Transgender
third_level_topic,5.3.2,5.3.1,5.3.0,2.0.2,2.0.1
third_level_topic_name,Production,Pilots,Drone,Laws,Identity


In [33]:
with open('./output/lda/topics.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(df_topics, f, pickle.HIGHEST_PROTOCOL)

# Process unseen text

In [34]:
ind = 10

text = df['first_10_sents'].iloc[ind]
text

'MOST people like to eat meat. As they grow richer they eat more of it. For individuals, that is good. Meat is nutritious. In particular, it packs much more protein per kilogram than plants do. But animals have to eat plants to put on weightso much so that feeding livestock accounts for about a third of harvested grain. Farm animals consume 8 of the worlds water supply, too. And they produce around 15 of unnatural greenhousegas emissions. More farm animals, then, could mean more environmental trouble. Some consumers, particularly in the rich West, get this.'

In [35]:
tm.predict_topics(text,
                  params={"topics_df_path": './output/lda/topics.pickle',
                          "first_dictionary_path": "./output/lda/dictionary1.pickle" ,
                          "first_LDA_model_path": "./output/lda/LDA_model1"
                         }
              )  

{'first_level_topic': 0,
 'first_level_topic_name': 'Tobacco',
 'first_level_topic_proba': 0.48858285,
 'second_level_topic': 0,
 'second_level_topic_name': 'Beer',
 'second_level_topic_proba': 0.6123519,
 'third_level_topic': 0,
 'third_level_topic_name': 'Corn',
 'third_level_topic_proba': 0.9637301}