In [76]:
import pandas as pd 
import numpy as np 
import configparser
import os
import seaborn as sns
import plotly_express as px

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

import nltk 

config = configparser.ConfigParser()
config.read("..\\env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
base_path = config['DEFAULT']['base_path']
code_dir = config['DEFAULT']['code_dir']

data_prefix = 'Maha'
out_path = f'{output_dir}/{data_prefix}'
OHCO = ['book_id','chap_id','sec_id','para_num', 'sent_num', 'token_num']


os.chdir(code_dir)
import preprocess
import bow_analysis
os.chdir(base_path + "\\notebooks")

In [77]:
CORPUS = pd.read_csv(fr"{output_dir}\F2\{data_prefix}-CORPUS.csv", sep="|").set_index(OHCO)
LIB = pd.read_csv(fr"{output_dir}\F2\{data_prefix}-LIB.csv", sep="|").set_index('book_id')
LIB2 = pd.read_csv(fr"{output_dir}\F2\{data_prefix}-LIB2.csv", sep="|").set_index(OHCO[:2])
VOCAB = pd.read_csv(fr"{output_dir}\CHAP_BOW\{data_prefix}-VOCAB2.csv", sep="|")
TFIDF = pd.read_csv(fr"{output_dir}\CHAP_BOW\{data_prefix}-TFIDF_REDUCED_CHAPS_L2.csv", sep="|").set_index(OHCO[:2])

In [78]:
PARA = OHCO[:4]
SECS = OHCO[:3]
CHAP = OHCO[:2]
BOOK = OHCO[:1]

In [79]:
BAG = CHAP

In [80]:
ngram_range = (1, 2)
n_terms = 1008
n_topics = 10
max_iter = 100
n_top_terms = 7
colors = "YlGnBu"

In [103]:
DOCS = CORPUS[CORPUS.pos.str.match(r'^NNS?$')]\
    .groupby(BAG).term_str\
    .apply(lambda x: ' '.join(map(str,x)))\
    .to_frame()\
    .rename(columns={'term_str':'doc_str'})

DOCS.to_csv(fr"{output_dir}\LDA_Results\{data_prefix}-LDA_DOC.csv",sep = "|")
DOCS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_str
book_id,chap_id,Unnamed: 2_level_1
1,1,male goddess word son humility day sages vows ...
1,2,son account place descriptions ye men deserve ...
1,3,son brothers sacrifice plains brothers sacrifi...
1,4,son forest twelve years sacrifice attendance d...
1,5,reason tiger kings royal determine lives snake...


In [82]:
from sklearn.feature_extraction import text
custom_stopwords = list(text.ENGLISH_STOP_WORDS.union([
    'thou', 'thee', 'hath', 'thy', 'art', 'ye', 'hast',
    'king', 'son', 'sons', 'men',  
]))

In [83]:
count_engine = CountVectorizer(max_features=n_terms, ngram_range=ngram_range, stop_words=custom_stopwords)
count_model = count_engine.fit_transform(DOCS.doc_str)
TERMS = count_engine.get_feature_names_out()

In [84]:
VOCAB = pd.DataFrame(index=TERMS)
VOCAB.index.name = 'term_str'
DTM = pd.DataFrame(count_model.toarray(), index=DOCS.index, columns=TERMS)
DTM

Unnamed: 0_level_0,Unnamed: 1_level_0,ablutions,abode,absence,abstention,accomplishment,accomplishments,account,achievements,acquisition,act,...,worship,worthy,wrath,wretch,year,years,yoga,yore,youth,yudhishthira
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0,5,0,0,1,0,2,0,1,1,...,0,1,0,1,0,2,0,0,0,2
1,2,0,3,0,0,0,0,4,0,2,1,...,0,0,8,1,1,0,1,0,1,2
1,3,2,3,1,0,0,0,1,0,0,2,...,0,0,1,2,5,0,0,0,0,0
1,4,1,2,0,0,0,0,2,0,0,1,...,0,0,3,0,0,2,0,2,0,0
1,5,3,10,0,0,3,0,8,4,2,19,...,0,1,10,9,1,11,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,2,0,3,0,0,2,0,5,1,0,1,...,1,0,3,0,1,4,0,0,0,0
15,3,1,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,1
16,1,0,2,0,0,0,0,2,0,0,1,...,0,0,5,1,3,0,1,0,0,0
17,1,0,0,0,0,1,0,0,1,0,3,...,0,0,0,0,0,0,0,0,0,2


In [85]:
VOCAB['doc_count'] = DTM.astype('bool').astype('int').sum()
DOCS['term_count'] = DTM.sum(1)
VOCAB

Unnamed: 0_level_0,doc_count
term_str,Unnamed: 1_level_1
ablutions,24
abode,86
absence,27
abstention,20
accomplishment,55
...,...
years,81
yoga,24
yore,44
youth,47


In [86]:
DOCS.term_count.describe()

count       99.000000
mean      3808.525253
std       6709.625842
min         41.000000
25%        863.000000
50%       1430.000000
75%       3359.000000
max      40880.000000
Name: term_count, dtype: float64

In [87]:
lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)

In [88]:
TNAMES = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
TNAMES

['T00', 'T01', 'T02', 'T03', 'T04', 'T05', 'T06', 'T07', 'T08', 'T09']

In [89]:
lda_model = lda_engine.fit_transform(count_model)

In [90]:
THETA = pd.DataFrame(lda_model, index=DOCS.index)
THETA.columns.name = 'topic_id'
THETA.columns = TNAMES

In [104]:
THETA.to_csv(fr"{output_dir}\LDA_Results\{data_prefix}-LDA_THETA.csv",sep = "|")
THETA.sample(10).T.style.background_gradient(cmap=colors, axis=None)

book_id,3,1,8,6,1,1,4,6,2,1
chap_id,15,10,1,1,1,9,2,2,4,7
topic_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
T00,0.000219,0.000134,4.2e-05,0.030613,0.430669,0.000145,0.000658,0.000372,0.000113,0.000136
T01,0.301276,0.000134,0.003005,0.144272,0.190198,0.000145,0.000658,0.000372,0.016062,0.006503
T02,0.000219,0.000134,0.037719,0.023628,0.015132,0.000145,0.000658,0.046735,0.043989,0.038788
T03,0.000219,0.000134,0.031478,0.137247,0.040144,0.000145,0.453847,0.252418,0.680495,0.123256
T04,0.281898,0.313204,0.027248,6.7e-05,0.000108,0.000145,0.000658,0.072139,0.124396,0.137864
T05,0.071731,0.685721,0.002705,6.7e-05,0.217017,0.833144,0.312147,0.000372,0.010436,0.644352
T06,0.03569,0.000134,0.024885,0.196771,0.000108,0.032284,0.000658,0.23527,0.000113,0.011096
T07,0.000219,0.000134,0.032288,6.7e-05,0.000108,0.051467,0.061627,0.000372,0.000113,9e-06
T08,0.308311,0.000134,8e-06,0.23278,0.106409,0.000145,0.000658,0.391579,0.000113,9e-06
T09,0.000219,0.000134,0.840621,0.234489,0.000108,0.082237,0.168431,0.000372,0.124171,0.037988


In [92]:
PHI = pd.DataFrame(lda_engine.components_, columns=TERMS, index=TNAMES)
PHI.index.name = 'topic_id'
PHI.columns.name = 'term_str'

PHI.T.sample(10).style.background_gradient(cmap=colors, axis=None)

topic_id,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
regard,27.474779,53.037214,0.100025,15.28665,41.033538,43.21853,11.713699,6.04941,21.75295,35.333205
tusks,0.100031,1.243047,0.100027,3.522063,0.100025,7.541778,0.100051,0.100019,4.454926,88.738033
fowler,5.553156,0.100013,0.100015,0.100005,43.021726,0.100019,23.035074,0.100003,33.789984,0.100005
kinds weapons,3.905778,0.100056,0.100028,0.100032,7.53293,6.893004,0.100038,11.915623,0.100007,61.252505
roar,0.100031,0.100048,0.100039,11.422542,0.100018,45.811978,3.332196,0.100028,9.268529,211.664591
repair,0.100047,5.237097,0.100016,17.372752,11.854872,0.100023,13.76477,10.554961,35.060534,3.854928
living creatures,0.10003,116.239676,0.100053,0.100014,5.584467,0.100026,72.905304,0.100061,0.100008,30.670362
respect,28.866248,329.19296,9.285956,95.016776,200.365135,55.164429,182.481671,0.100027,12.003341,21.523458
regenerate persons,0.100054,46.822877,0.100019,0.100026,0.100011,0.100029,39.376921,0.100041,0.100006,0.100016
portion,0.100032,48.637047,5.285683,23.444643,21.70228,32.913946,67.607763,0.100044,8.85613,10.352431


In [102]:
PHI.to_csv(fr"{output_dir}\LDA_Results\{data_prefix}-LDA_PHI.csv",sep = "|")

In [93]:
TOPICS = PHI.stack().groupby('topic_id')\
    .apply(lambda x: ' '.join(x.sort_values(ascending=False).head(n_top_terms).reset_index().term_str))\
    .to_frame('top_terms')

In [94]:
TOPICS.style

Unnamed: 0_level_0,top_terms
topic_id,Unnamed: 1_level_1
T00,words race battle virtue kings earth peace
T01,acts knowledge mind creatures body senses viz
T02,weapons weapon energy gods celestials creatures lord
T03,kings race monarch wealth city foremost words
T04,person life duties wealth world man persons
T05,daughter wife words time father race monarch
T06,deities gifts man food creatures person persons
T07,grief words battle foremost monarch hero heroes
T08,gods tirtha lord merit earth race sacrifice
T09,battle arrows shafts car steeds warriors elephants


In [95]:
TOPICS['doc_weight_sum'] = THETA.sum()
TOPICS['term_freq'] = PHI.sum(1) / PHI.sum(1).sum()
TOPICS.sort_values('doc_weight_sum', ascending=False).style.background_gradient(cmap=colors)

Unnamed: 0_level_0,top_terms,doc_weight_sum,term_freq
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
T05,daughter wife words time father race monarch,20.178928,0.115214
T09,battle arrows shafts car steeds warriors elephants,17.905639,0.263142
T03,kings race monarch wealth city foremost words,16.338473,0.067753
T00,words race battle virtue kings earth peace,8.299319,0.052024
T01,acts knowledge mind creatures body senses viz,7.379867,0.143879
T04,person life duties wealth world man persons,7.31069,0.10869
T07,grief words battle foremost monarch hero heroes,6.94404,0.04599
T08,gods tirtha lord merit earth race sacrifice,5.999075,0.050832
T06,deities gifts man food creatures person persons,5.117901,0.132359
T02,weapons weapon energy gods celestials creatures lord,3.526068,0.020116


In [101]:
TOPICS.to_csv(fr"{output_dir}\LDA_Results\{data_prefix}-LDA_TOPICS.csv",sep = "|")

In [96]:
LIB2

Unnamed: 0_level_0,Unnamed: 1_level_0,chap_name,book_name,source_file_path,chap_len,n_secs,n_sents,n_tokens
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,Anukramanika Parva,Adi Parva,maha01.txt,7511,1,374,7511
1,2,Sangraha Parva,Adi Parva,maha01.txt,8655,1,524,8655
1,3,Paushya Parva,Adi Parva,maha01.txt,6137,1,396,6137
1,4,Pauloma Parva,Adi Parva,maha01.txt,4661,9,261,4661
1,5,Astika Parva,Adi Parva,maha01.txt,30619,46,1753,30619
...,...,...,...,...,...,...,...,...
15,2,Putradarsana Parva,Asramavasika Parva,maha15.txt,6789,8,417,6789
15,3,Naradagamana Parva,Asramavasika Parva,maha15.txt,2551,3,149,2551
16,1,Mausala Parva,Mausala Parva,maha16.txt,8072,7,432,8072
17,1,Mahaprasthanika Parva,Mahaprasthanika Parva,maha17.txt,3062,2,206,3062


In [97]:
THETA2 = THETA.join(LIB2)
THETA2

Unnamed: 0_level_0,Unnamed: 1_level_0,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,chap_name,book_name,source_file_path,chap_len,n_secs,n_sents,n_tokens
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1,0.430669,0.190198,0.015132,0.040144,0.000108,0.217017,0.000108,0.000108,0.106409,0.000108,Anukramanika Parva,Adi Parva,maha01.txt,7511,1,374,7511
1,2,0.128488,0.036560,0.000095,0.000095,0.081291,0.461678,0.000095,0.094435,0.099255,0.098008,Sangraha Parva,Adi Parva,maha01.txt,8655,1,524,8655
1,3,0.000136,0.257815,0.000136,0.000136,0.000136,0.741095,0.000136,0.000136,0.000136,0.000136,Paushya Parva,Adi Parva,maha01.txt,6137,1,396,6137
1,4,0.000181,0.206850,0.000181,0.000181,0.000181,0.725856,0.000181,0.000181,0.066026,0.000181,Pauloma Parva,Adi Parva,maha01.txt,4661,9,261,4661
1,5,0.000023,0.074023,0.043232,0.000052,0.019705,0.712365,0.010297,0.009791,0.122437,0.008074,Astika Parva,Adi Parva,maha01.txt,30619,46,1753,30619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,2,0.000093,0.186171,0.000093,0.087954,0.000093,0.116340,0.276979,0.332091,0.000093,0.000093,Putradarsana Parva,Asramavasika Parva,maha15.txt,6789,8,417,6789
15,3,0.000267,0.088439,0.000267,0.186253,0.000267,0.234647,0.167281,0.322042,0.000267,0.000268,Naradagamana Parva,Asramavasika Parva,maha15.txt,2551,3,149,2551
16,1,0.000085,0.000085,0.000085,0.144461,0.000085,0.056427,0.133580,0.557320,0.000085,0.107786,Mausala Parva,Mausala Parva,maha16.txt,8072,7,432,8072
17,1,0.038644,0.000226,0.021121,0.375651,0.000226,0.000226,0.359210,0.171288,0.033182,0.000226,Mahaprasthanika Parva,Mahaprasthanika Parva,maha17.txt,3062,2,206,3062


In [139]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
DCM = pd.DataFrame(pca.fit_transform(THETA), index=THETA.index)
DCM.columns = ['PC{}'.format(i) for i in DCM.columns]
DCM = DCM.join(LIB2, on=OHCO[:2])
DCM = DCM.merge(LIB, on=OHCO[:1], suffixes=["_chap","_book"])
DCM


Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,n_chaps,n_secs_book,n_sents_book,n_tokens_book,n_chars,parva_name,timeline,key_focus,main_theme,broad_theme
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.162477,0.075465,-0.136649,-0.069213,0.268612,-0.261047,0.013997,-0.019064,-0.034997,-3.956810e-18,...,19,235,12422,228684,1055885,Adi,Pre-War,"Origins, family lineages, childhood of heroes",lineage,Mythology
1,-0.153595,0.277915,0.000953,0.000845,0.044509,0.028878,0.062043,-0.012683,-0.049029,3.953096e-17,...,19,235,12422,228684,1055885,Adi,Pre-War,"Origins, family lineages, childhood of heroes",lineage,Mythology
1,-0.357425,0.467218,0.081411,0.012572,-0.133037,-0.067870,-0.162326,-0.066609,-0.032976,1.682677e-16,...,19,235,12422,228684,1055885,Adi,Pre-War,"Origins, family lineages, childhood of heroes",lineage,Mythology
1,-0.351753,0.452091,0.082703,0.011289,-0.138169,-0.063480,-0.080664,-0.064838,-0.046736,8.370968e-17,...,19,235,12422,228684,1055885,Adi,Pre-War,"Origins, family lineages, childhood of heroes",lineage,Mythology
1,-0.335731,0.437104,0.109029,0.012445,-0.125140,-0.010049,0.048811,-0.027285,-0.013201,-1.127907e-16,...,19,235,12422,228684,1055885,Adi,Pre-War,"Origins, family lineages, childhood of heroes",lineage,Mythology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,-0.100426,-0.068080,-0.212613,0.344834,-0.023873,0.037140,-0.107012,0.053330,-0.019969,2.663656e-17,...,3,38,1742,30376,141565,Ashramavasika,Post-War,Retirement and death of elders,renunciation,Spirituality
15,-0.159882,-0.041264,-0.042756,0.289599,0.002485,0.101686,-0.064603,0.003766,-0.028519,2.563973e-17,...,3,38,1742,30376,141565,Ashramavasika,Post-War,Retirement and death of elders,renunciation,Spirituality
16,0.031598,-0.120713,-0.110665,0.436124,0.114883,0.240703,-0.001574,-0.061594,-0.038828,9.636784e-17,...,1,7,432,8072,37596,Mausala,Post-War,"Destruction of Yadava clan, Krishna's death",destruction,Sentiment
17,-0.102922,-0.351837,-0.021306,0.238787,-0.035516,0.007368,0.002324,0.209697,0.006013,-3.378254e-17,...,1,2,206,3062,14086,Mahaprasthanika,Post-War,"Pandavas renounce kingdom, begin final journey",detachment,Spirituality


In [140]:
LOADINGS = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_))
LOADINGS.index = THETA.columns
LOADINGS.columns = ["PC{}".format(i) for i in LOADINGS.columns]
LOADINGS.index.name = 'topic_id'

LOADINGS = LOADINGS.join(TOPICS)
LOADINGS.T

topic_id,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09
PC0,-0.015886,-0.019697,0.000393,-0.058509,-0.023107,-0.136651,-0.01122,0.008338,-0.019729,0.276068
PC1,0.004156,0.003201,0.001344,-0.162214,-0.012543,0.166906,-0.026099,-0.006755,-0.013806,0.04581
PC2,-0.009951,-0.080574,0.002735,0.127655,-0.082365,0.093441,-0.040851,-0.030276,-0.036545,0.05673
PC3,-0.030302,-0.004645,0.005431,-0.013896,-0.089103,-0.002989,0.055235,0.102864,-0.007249,-0.015345
PC4,0.118295,-0.032255,-0.00426,-0.010559,-0.006197,-0.017078,-0.037155,0.041612,-0.040075,-0.012329
PC5,-0.04826,-0.056465,-0.00981,0.00569,0.082615,0.010757,-0.017914,0.063991,-0.032289,0.001686
PC6,0.007541,-0.079693,0.021161,-0.0188,-0.006119,-0.008361,-0.004995,-0.002016,0.098316,-0.007033
PC7,0.018019,-0.041145,-0.013987,-0.007294,0.01367,0.003632,0.085905,-0.032665,-0.028121,0.001986
PC8,-0.007297,-0.007316,0.075947,-0.007589,-0.000548,-0.008163,-0.001663,-0.010314,-0.024784,-0.008272
PC9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [186]:
def vis_pcs(M, a, b,color ="book_name" ,hover_name='chap_name'):
    fig =  px.scatter(M, f"PC{a}", f"PC{b}", 
                      color = color ,
                      title = f"PCA Components Visualization - PC{a+1} vs PC{b+1}",
                    hover_name=hover_name, 
                    # symbol=symbol, size=size,
                    marginal_x='box', height=800, width = 1008)
    
    fig.update_layout(
    xaxis_title=f'PC{a+1}',
    yaxis_title=f'PC{b+1}'
    )

    return fig



def vis_loadings(LOADINGS, a=0, b=1, hover_name='topic_num', size = "doc_weight_sum"):
    fig =  px.scatter(LOADINGS.reset_index(), f"PC{a}", f"PC{b}", 
                      title = f"PCA Loadings Visualization - PC{a+1} vs PC{b+1}",
                      text='topic_id', 
                      # hover_name='top_terms',
                      size = size,
                      #  color='max_pos_group', 
                      marginal_x='box', height=800)
    
    fig.update_layout(
    xaxis_title=f'PC{a+1}',
    yaxis_title=f'PC{b+1}'
    )

    return fig




In [187]:
theta_pcs = vis_pcs(DCM, 0, 1, "timeline")
theta_pcs

In [188]:

theta_loadings = vis_loadings(LOADINGS, 0, 1, "top_terms")
theta_loadings
# LOADINGS

In [183]:
PHI

topic_id,ablutions,abode,absence,abstention,accomplishment,accomplishments,account,achievements,acquisition,act,...,worship,worthy,wrath,wretch,year,years,yoga,yore,youth,yudhishthira
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,0.100014,48.100949,17.352084,0.10003,12.632443,0.100029,26.496687,0.100035,7.163552,90.805413,...,13.440308,13.678089,163.073764,95.669821,0.100042,72.137791,0.100012,10.373931,6.374577,9.792383
T01,5.388142,43.839453,68.718646,71.889218,39.833584,0.100022,44.947041,0.100027,138.643825,127.695668,...,67.616099,9.696392,199.187321,0.100019,0.100031,43.880104,100.314455,45.791166,11.406024,0.100018
T02,0.100012,92.205542,0.100022,0.100022,0.100031,0.100021,7.950853,0.100041,4.814613,53.588551,...,15.319053,0.100025,0.10003,13.884747,0.100051,0.100038,0.100043,19.5907,0.100045,0.100018
T03,0.100024,51.25255,0.10003,0.100018,28.433377,13.195506,0.10003,16.246168,13.40121,34.467061,...,101.293394,19.36129,0.100029,31.229427,45.713986,85.707718,0.100025,0.10002,0.100032,54.60333
T04,8.609045,57.54214,34.257268,27.483052,34.569108,18.249422,48.855761,9.511987,109.697516,272.9663,...,53.682797,10.826055,132.622355,32.62639,32.153929,113.247566,9.076973,0.100024,15.380582,29.112311
T05,33.846999,116.787358,0.100032,0.100015,17.285295,19.226548,143.571681,62.427721,0.100029,121.619728,...,0.450054,15.860712,148.689847,61.954043,81.232171,166.258823,0.100036,9.335029,163.422956,0.100024
T06,54.70115,86.027359,26.204823,54.927597,15.999751,33.091477,9.466184,21.003931,3.077686,146.487692,...,89.733243,15.705506,70.400318,13.01941,90.45934,276.282974,0.100033,64.973053,6.421734,3.864056
T07,0.100025,18.995327,0.100037,0.100014,28.057328,0.100016,51.288721,0.100017,1.541685,44.582127,...,0.10002,0.100039,63.51425,46.517897,0.100038,46.575578,9.908344,0.10003,0.100056,15.53254
T08,27.954585,75.462764,0.100029,0.100012,0.100011,0.100002,40.526498,5.796522,4.271379,20.51672,...,11.264997,6.976605,28.598896,0.100017,15.940381,51.026052,0.100028,45.511873,0.100038,34.086339
T09,0.100005,159.786558,2.967028,0.100022,19.989071,8.736957,40.796544,21.613552,2.288506,62.270741,...,0.100036,18.695286,721.71319,36.898229,0.100032,53.783357,0.10005,51.124174,18.593956,55.708982


In [180]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
DCM2 = pd.DataFrame(pca.fit_transform(PHI), index=PHI.index)
DCM2.columns = ['PC{}'.format(i) for i in DCM2.columns]
DCM2 = DCM2.join(TOPICS)
DCM2


Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,top_terms,doc_weight_sum,term_freq
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
T00,-634.088787,-757.718244,-297.504239,-157.088317,317.917074,110.20635,374.536353,732.042129,141.756881,6.31756e-13,words race battle virtue kings earth peace,8.299319,0.052024
T01,-1000.44052,2379.499312,-1089.539386,430.575584,-343.912465,175.695107,-33.768183,32.05821,16.827187,6.31756e-13,acts knowledge mind creatures body senses viz,7.379867,0.143879
T02,-968.431708,-1018.2113,-469.181491,-383.893011,-489.944331,-438.629556,-223.95822,-218.312512,521.945468,6.31756e-13,weapons weapon energy gods celestials creature...,3.526068,0.020116
T03,-713.467432,-808.804403,87.271929,-30.364755,241.15207,1089.006849,-370.237769,-166.593708,29.121985,6.31756e-13,kings race monarch wealth city foremost words,16.338473,0.067753
T04,-950.32478,702.2552,74.876624,-221.819651,1385.205104,-411.728789,-102.996094,-194.61449,1.571307,6.31756e-13,person life duties wealth world man persons,7.31069,0.10869
T05,-796.532694,-340.813814,895.303438,1493.316312,-104.853853,-192.311237,55.996182,-12.770325,82.777206,6.31756e-13,daughter wife words time father race monarch,20.178928,0.115214
T06,-971.953324,1299.525714,1358.197072,-847.236443,-488.357876,53.668534,119.947114,47.92812,18.982639,6.31756e-13,deities gifts man food creatures person persons,5.117901,0.132359
T07,-643.304123,-823.601052,-391.845393,-134.38332,-166.981472,30.017187,753.892594,-404.420242,-288.339556,6.31756e-13,grief words battle foremost monarch hero heroes,6.94404,0.04599
T08,-956.179156,-732.433558,-170.060348,-133.263754,-335.210387,-384.165056,-541.130044,192.381465,-528.333366,6.31756e-13,gods tirtha lord merit earth race sacrifice,5.999075,0.050832
T09,7634.722525,100.302144,2.481794,-15.842645,-15.013865,-31.759389,-32.281933,-7.698646,3.690251,6.31756e-13,battle arrows shafts car steeds warriors eleph...,17.905639,0.263142


In [177]:
VOCAB2 = pd.read_csv(fr"{output_dir}\CHAP_BOW\{data_prefix}-VOCAB2.csv", sep="|").set_index('term_str')

LOADINGS2 = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_))
LOADINGS2.index = PHI.columns
LOADINGS2.columns = ["PC{}".format(i) for i in LOADINGS2.columns]
LOADINGS2.index.name = 'topic_id'

LOADINGS2 = LOADINGS2.join(VOCAB2)
LOADINGS2.T

topic_id,ablutions,abode,absence,abstention,accomplishment,accomplishments,account,achievements,acquisition,act,...,worship,worthy,wrath,wretch,year,years,yoga,yore,youth,yudhishthira
PC0,-4.961722,29.118451,-4.77855,-6.25104,0.167529,-0.328881,0.008151,2.828067,-10.4393,-13.787969,...,-12.902018,2.710263,199.613989,2.572153,-9.537918,-13.550124,-4.760624,8.463409,-1.027888,12.651212
PC1,5.675215,-0.980839,20.529706,25.740618,7.89198,3.878774,0.841759,-0.250303,37.981746,44.103777,...,19.604976,1.599469,47.609201,-12.571432,6.283744,28.754183,23.315725,13.210566,-1.252389,-5.731884
PC2,15.126329,16.799817,-6.702009,-0.272107,-2.543265,10.316063,9.217343,13.666833,-22.905989,19.07791,...,7.734617,3.548507,-8.781105,4.014013,31.85759,71.052576,-17.982747,4.289095,22.697104,-0.405102
PC3,0.147032,8.684029,-0.525577,-3.912448,2.670427,-0.714958,35.247987,12.975474,5.19701,1.148515,...,-12.802622,1.473015,30.644485,8.350783,6.153035,-3.603902,7.21127,-6.779455,43.212124,-3.671844
PC4,-5.863495,-7.537967,3.253542,-2.683376,6.020669,1.993038,2.841526,-0.405522,20.657426,48.691695,...,4.88526,1.717289,17.056844,10.923763,-0.504301,1.444745,-4.69277,-14.005208,-0.241254,7.885487
PC5,-4.205763,-11.655812,0.300713,1.069551,5.691431,1.150835,-13.766585,-0.006997,-2.262524,-22.475604,...,23.796038,3.576568,-14.587266,3.501816,4.906104,6.312278,3.548941,-4.184555,-8.811557,8.574163
PC6,-2.32137,-13.233249,1.106494,0.402106,3.808906,-0.594256,7.895099,-1.427091,-6.094131,6.551295,...,-11.758528,-1.522105,20.080344,17.10354,-4.039457,9.157523,0.93162,-6.032897,2.928869,-8.344004
PC7,2.186128,1.799918,3.395851,-0.144309,-4.293196,-1.603375,-1.826767,-1.221607,-5.274532,-3.280766,...,-4.416543,2.409616,27.297954,13.066274,-2.704631,7.058166,-0.902511,5.797306,0.020488,-2.735811
PC8,-4.414261,9.745666,1.702816,0.938103,-1.203003,1.085238,-6.239514,1.273365,1.554615,13.838286,...,4.025098,0.28669,3.936292,5.655771,0.589803,-3.066803,-0.461593,-3.851798,6.062697,-7.827338
PC9,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,...,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0


In [184]:
vis_pcs(DCM2, 0, 1, "top_terms", "doc_weight_sum")

In [170]:
vis_loadings(LOADINGS2, 0, 1, "dfidf", None)