In [1]:
import pandas as pd
import numpy as np
import math
import operator
import os

In [40]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from keybert._mmr import mmr
from keybert._maxsum import max_sum_distance
from keybert._highlight import highlight_document
from keybert.backend._utils import select_backend

In [3]:
model = SentenceTransformer('all-mpnet-base-v2')

In [4]:
pd.options.display.max_colwidth = 250
texts_path = '../../../Georgian Texts/Vazha Pshavela/'
poems_path = texts_path + 'Poems/revised/'

In [5]:
poem_names = []
sources = ['ge/', 'en/', 'ggl/', 'gem/', 'gpt/']
ge_poems_directory = os.fsencode(poems_path + 'ge/')
for file in sorted(os.listdir(ge_poems_directory)):
    poem_names.append(file.decode())

In [6]:
print('Number of verses')
print('======================================================================+========================')
print('ID\tVazha\tHewitt\tGoogle\tGemini\tChatGPT\tPassed?\tPoem Name')
for poem in poem_names:
    output = [poem[:2]]
    for source in sources:
        with open(poems_path + source + poem, "r") as file:
            output.append(str(sum(1 for _ in file)))
    if len(set(output[1:])) > 1:
        output.append('No')
    else:
        output.append('Yes')
    output.append(poem[3:-3].replace('_', ' ').title())
    print('\t'.join(output))

Number of verses
ID	Vazha	Hewitt	Google	Gemini	ChatGPT	Passed?	Poem Name
01	4	4	4	4	4	Yes	I Believe I Always Have Believed
02	36	36	36	36	36	Yes	Bakur
03	9	9	9	9	9	Yes	Some Day It Will Happen I Shall Die
04	14	14	14	14	14	Yes	I Feel Like Singing And I Shall Sing
05	18	18	18	18	18	Yes	Voice From The Grave
06	10	10	10	10	10	Yes	That In Truth Is Not Manliness
07	17	17	17	17	17	Yes	The Law Of The World Is Thus
08	32	32	32	32	32	Yes	Amiran
09	29	29	29	29	29	Yes	The Old Song Of Cavaliers
10	17	17	17	17	17	Yes	Consciences Song
11	12	12	12	12	12	Yes	As Once You Did O Lady As Once You Did
12	17	17	17	17	17	Yes	Loneliness
13	15	15	15	15	15	Yes	What Created Me A Human Being
14	27	27	27	27	27	Yes	Yet Again Shall I See The Spring
15	34	34	34	34	34	Yes	Thrush Its The Same Song You Sing


In [7]:
verses = []
poem_verse_counts = []
df = pd.DataFrame(
    columns=[
        'Poem ID',
        'Poem Name',
        'Verse',
        'Vazha',
        'Hewitt',
        'Google',
        'Gemini',
        'ChatGPT',
        'Hewitt - Google',
        'Hewitt - Gemini',
        'Hewitt - ChatGPT'
    ]
)

for poem in poem_names:
    poem_id = int(poem[:2])
    poem_name = poem[3:-3].replace('_', ' ').title()

    poem_vazha = [line.strip() for line in open(poems_path + 'ge/' + poem, "r")]
    poem_hewitt = [line.strip() for line in open(poems_path + 'en/' + poem, "r")]
    poem_google = [line.strip() for line in open(poems_path + 'ggl/' + poem, "r")]
    poem_gemini = [line.strip() for line in open(poems_path + 'gem/' + poem, "r")]
    poem_chatgpt = [line.strip() for line in open(poems_path + 'gpt/' + poem, "r")]

    verses.extend(poem_hewitt)
    verses.extend(poem_google)
    verses.extend(poem_gemini)
    verses.extend(poem_chatgpt)

    poem_verse_counts.append(len(poem_vazha))
    for i in range(len(poem_vazha)):
        row = {
            'Poem ID': [poem_id],
            'Poem Name': [poem_name],
            'Verse': [i + 1],
            'Vazha': [poem_vazha[i]],
            'Hewitt': [poem_hewitt[i]],
            'Google': [poem_google[i]],
            'Gemini': [poem_gemini[i]],
            'ChatGPT': [poem_chatgpt[i]]
        }
        df_new = pd.DataFrame(row)
        df = pd.concat([df, df_new], ignore_index = True)

df.shape

(291, 11)

In [8]:
verse_embeddings = model.encode(verses)
verse_embeddings.shape

(1164, 768)

In [9]:
for i, row in df.iterrows():
    poem_id = row['Poem ID']
    verse = row['Verse']
    offset = sum(poem_verse_counts[:poem_id-1]) * 4
    j = offset + verse - 1

    hewitt_google = cosine_similarity(
        [verse_embeddings[j]],
        [verse_embeddings[j + 1 * poem_verse_counts[poem_id - 1]]]
    )[0][0]
    
    hewitt_gemini = cosine_similarity(
        [verse_embeddings[j]],
        [verse_embeddings[j + 2 * poem_verse_counts[poem_id - 1]]]
    )[0][0]
    
    hewitt_chatgpt = cosine_similarity(
        [verse_embeddings[j]],
        [verse_embeddings[j + 3 * poem_verse_counts[poem_id - 1]]]
    )[0][0]

    df.at[i,'Hewitt - Google'] = hewitt_google
    df.at[i,'Hewitt - Gemini'] = hewitt_gemini
    df.at[i,'Hewitt - ChatGPT'] = hewitt_chatgpt

In [10]:
df.head()

Unnamed: 0,Poem ID,Poem Name,Verse,Vazha,Hewitt,Google,Gemini,ChatGPT,Hewitt - Google,Hewitt - Gemini,Hewitt - ChatGPT
0,1,I Believe I Always Have Believed,1,"მრწამს, მარად მიწამებია მუდმივ სიცოცხლე სულისა, კარგისა, ქვეყნის მოყვარის, ქვეყნის ბედისგან წყლულისა.","I believe, I always have believed In the eternal life of the soul, The good, the lover of the world, Scarred by the fate of this same world.","I believe, there are always lands eternal life of the soul, Good, country lover, Ulcer from the fate of the country.","I believe, I've always believed In the eternal life of the soul, Of the good, the lover of the country, The one pained by the country's fate.","I believe, I have always believed in the eternal life of the soul, of the good, the lover of the homeland, the one suffering from the fate of the country.",0.534823,0.741352,0.767628
1,1,I Believe I Always Have Believed,2,"მრწამს, ფერფლნი კარგის გულისა ქარმ რო გაფანტოს ხმელადა, თვითოში მაინც ენთება ტიალ-სურვილი ცხელადა, –","I believe that, though the ashes of kind hearts Be scattered over the land by the wind, Yet in each of us there will burn Hotly a wish forlorn —","I believe that ashes are the heart of a good person Let the wind scatter the land, It still burns in itself Tial-desire was hot, –","I believe, even when the ashes Of a good heart are scattered by the wind, Within each one, a spark still ignites, A burning desire for warmth.","I believe, that even if the ashes of a good heart are scattered by the wind, they will still ignite a burning desire within, —",0.731435,0.758544,0.825589
2,1,I Believe I Always Have Believed,3,"ავის მჩაგრავად, კეთილის მუდამ იქნება მცველადა; ბეჩავის, გაჭირებულის მომხმარედ, მეშვლად, მხსნელადა.","To repress evil, and that this Will ever be a defence of the good, Help, succour and saviour Of the downtrodden and needy.","Evil oppressor, good There will always be a guard; Bechav, needy He helped me, helped me, saved me.","To suppress evil, to protect the good, It will always be a guardian; For the poor, the troubled, A helper, a savior, a rescuer.","as the protector against evil, the good will always be a guardian; for the helpless, the downtrodden a helper, a supporter, a savior.",0.419882,0.725808,0.590861
3,1,I Believe I Always Have Believed,4,"კარგს გულს არა ჰკლავს ბუნება, თან დააქვს ძველის-ძველადა!","Nature does not totally extinguish a kind heart, She carries it along with her from the very dawn of time.",Nature does not kill a good heart Brings old things with him!,"Nature does not kill a good heart, It carries it along, from time immemorial!","Nature does not kill a good heart, it carries the wisdom of the ancients!",0.661742,0.806899,0.761201
4,2,Bakur,1,"– გვიამბე, ელიზბარაო, რა ამბავ მოხდა ეღოსა?","Tell us, Elizbar, In Egho what did occur.","– Tell us, Elizabeth What happened to Egosa?","– Tell us, Elizbara, What happened at dawn?","Tell us, Elizbar, what happened at Egho?",0.457603,0.68623,0.907964


In [11]:
# df.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/cosine_similarity.csv')
# maximum_difference_easwaran_gandhi.to_csv('/content/drive/MyDrive/religious_texts/semantic analysis/all-mpnet-base-v2/easwaran gandhi.csv')
# maximum_difference_gandhi_purohit.to_csv('/content/drive/MyDrive/religious_texts/semantic analysis/all-mpnet-base-v2/gandhi purohit.csv')
# maximum_difference_purohit_easwaran.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/purohit_google.csv')
# df.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/cosine similarity - sklearn.csv')
# df_max_sum_cs.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/max_sum_in_every_chapter.csv')
# df_min_sum_cs.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/least_sum_in_every_chapter.csv')
# df = pd.read_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/cosine_similarity.csv')

In [12]:
maximum_difference_hewitt_google = df.loc[(df['Hewitt - Google'] < 0.5)]
maximum_difference_hewitt_google.shape

(87, 11)

In [13]:
maximum_difference_hewitt_gemini = df.loc[(df['Hewitt - Gemini'] < 0.5)]
maximum_difference_hewitt_gemini.shape

(22, 11)

In [14]:
maximum_difference_hewitt_chatgpt = df.loc[(df['Hewitt - ChatGPT'] < 0.5)]
maximum_difference_hewitt_chatgpt.shape

(19, 11)

In [25]:
maximum_difference = df.loc[(df['Hewitt - Google']<0.5) & (df['Hewitt - Gemini'] < 0.5) & (df['Hewitt - ChatGPT'] < 0.5)]
maximum_difference.shape

(7, 14)

In [16]:
df['Sum CS'] = df['Hewitt - Google'] + df['Hewitt - Gemini'] + df['Hewitt - ChatGPT']
df['Max Sum CS'] = df.groupby(['Poem Name'])['Sum CS'].transform('max')
df['Min Sum CS'] = df.groupby(['Poem Name'])['Sum CS'].transform('min')

In [17]:
df_max_sum_cs = df.loc[(df['Max Sum CS'] == df['Sum CS'])]
df_max_sum_cs.head()

Unnamed: 0,Poem ID,Poem Name,Verse,Vazha,Hewitt,Google,Gemini,ChatGPT,Hewitt - Google,Hewitt - Gemini,Hewitt - ChatGPT,Sum CS,Max Sum CS,Min Sum CS
1,1,I Believe I Always Have Believed,2,"მრწამს, ფერფლნი კარგის გულისა ქარმ რო გაფანტოს ხმელადა, თვითოში მაინც ენთება ტიალ-სურვილი ცხელადა, –","I believe that, though the ashes of kind hearts Be scattered over the land by the wind, Yet in each of us there will burn Hotly a wish forlorn —","I believe that ashes are the heart of a good person Let the wind scatter the land, It still burns in itself Tial-desire was hot, –","I believe, even when the ashes Of a good heart are scattered by the wind, Within each one, a spark still ignites, A burning desire for warmth.","I believe, that even if the ashes of a good heart are scattered by the wind, they will still ignite a burning desire within, —",0.731435,0.758544,0.825589,2.315568,2.315568,1.736551
26,2,Bakur,23,"უძილო, მშიერ-მწყურვალნი, – მტერს, რომ ჩვენ მაშინ ვიყვენით!","Without sleep, food or water — How we were then, one day the enemy will ponder! —","sleepless, hungry and thirsty, To the enemy that we were then!","Sleepless, hungry and thirsty, We were then against the enemy!","Sleepless, hungry, and thirsty, – we were to the enemy at that time!",0.763763,0.772805,0.802282,2.33885,2.33885,1.05735
42,3,Some Day It Will Happen I Shall Die,3,"ჩემს გულსა დაეხვევიან გაშმაგებულნი ჭიანი,","Frenzied worms Will wrap themselves about my heart,",They will hug my heart crazy worms,"Around my heart will coil Frenzied worms,","My heart will be encircled By frenzied worms,",0.727994,0.845164,0.891474,2.464631,2.464631,1.438483
60,4,I Feel Like Singing And I Shall Sing,12,"გაღრღნილი ჯაჭვიც მთელია, ჰხედავს, მეორე დილასა;","The gnawed-through chain is whole again, He sees it so every morning;","The worn chain is also whole, He sees it, the next morning;","The gnawed chain remains whole, He sees, on the next morning;","The gnawed chain remains whole, he sees it the next morning;",0.816771,0.870759,0.856012,2.543543,2.543543,1.458503
77,5,Voice From The Grave,15,"სული ღმერთთანა, – იქ, სადაც ბრწყინვენ ციური ძალები.","My soul is with God — there, where The heavenly powers shine.","The soul is with God, where The celestial powers shine.","My soul is with God, there, where Heavenly powers shine.","my soul is with God, – where the celestial powers shine.",0.813066,0.960912,0.926211,2.700189,2.700189,1.63788


In [18]:
df_min_sum_cs = df.loc[(df['Min Sum CS'] == df['Sum CS'])]
df_min_sum_cs.head()

Unnamed: 0,Poem ID,Poem Name,Verse,Vazha,Hewitt,Google,Gemini,ChatGPT,Hewitt - Google,Hewitt - Gemini,Hewitt - ChatGPT,Sum CS,Max Sum CS,Min Sum CS
2,1,I Believe I Always Have Believed,3,"ავის მჩაგრავად, კეთილის მუდამ იქნება მცველადა; ბეჩავის, გაჭირებულის მომხმარედ, მეშვლად, მხსნელადა.","To repress evil, and that this Will ever be a defence of the good, Help, succour and saviour Of the downtrodden and needy.","Evil oppressor, good There will always be a guard; Bechav, needy He helped me, helped me, saved me.","To suppress evil, to protect the good, It will always be a guardian; For the poor, the troubled, A helper, a savior, a rescuer.","as the protector against evil, the good will always be a guardian; for the helpless, the downtrodden a helper, a supporter, a savior.",0.419882,0.725808,0.590861,1.736551,2.315568,1.736551
7,2,Bakur,4,მე რა ვსთქვა? უნდა კაცობა ბაკურმა დაიკვეხოსა…,Who am I to speak? It is for Bak’ur To vaunt his own valour...,what can i say Must be a man Bakur boasted...,What can I say? It's for men To lament Bakuri's fate...,What should I say? It’s manhood that Bakur should boast of…,0.388787,0.320493,0.34807,1.05735,2.33885,1.05735
44,3,Some Day It Will Happen I Shall Die,5,ტყუილად აშფოთდებიან ჩემნი წაწალნი იანი.,My friends the violets In vain will be indignant.,"They worry for nothing My friends, Ian.",My swift falcons Will be troubled in vain.,My dear ones will be disturbed In vain.,0.351775,0.489707,0.597001,1.438483,2.464631,1.438483
54,4,I Feel Like Singing And I Shall Sing,6,დაჟინებულის მის ცქერით შევიქენ დასაძრახავი.,By stubborn gazing at it I have been turned into an object of reproach.,With his stubbornness I will have to move.,"With persistent gazing, I became blameworthy.",By persistently gazing at it I have become blameworthy.,0.130767,0.603262,0.724474,1.458503,2.543543,1.458503
74,5,Voice From The Grave,12,ერთხელ სჯობია სიკვდილი შავს ყოფნას ქვეყანაზედა!,A sudden death is preferable To a bleak existence on earth!,Better to die once Black presence in the country!,It's better to die once Than to live in shame on this earth!,It is better to die once than to live in darkness on earth!,0.425475,0.575076,0.637329,1.63788,2.700189,1.63788


In [19]:
print('Relation\tmean\t\t\tstd')
print(f"Hewitt-Google\t{df['Hewitt - Google'].mean()}\t{df['Hewitt - Google'].std()}")
print(f"Hewitt-Gemini\t{df['Hewitt - Gemini'].mean()}\t{df['Hewitt - Gemini'].std()}")
print(f"Hewitt-ChatGPT\t{df['Hewitt - ChatGPT'].mean()}\t{df['Hewitt - ChatGPT'].std()}")

Relation	mean			std
Hewitt-Google	0.5807760441835803	0.16068689923002036
Hewitt-Gemini	0.7074239409666291	0.1333006364700414
Hewitt-ChatGPT	0.7307519552224281	0.13619296838138836


In [26]:
df_mean_hewitt_google = df.groupby('Poem Name')['Hewitt - Google'].mean().reset_index()
df_mean_hewitt_gemini = df.groupby('Poem Name')['Hewitt - Gemini'].mean().reset_index()
df_mean_hewitt_chatgpt = df.groupby('Poem Name')['Hewitt - ChatGPT'].mean().reset_index()

df_std_hewitt_google = df.groupby('Poem Name')['Hewitt - Google'].std().reset_index()
df_std_hewitt_gemini = df.groupby('Poem Name')['Hewitt - Gemini'].std().reset_index()
df_std_hewitt_chatgpt = df.groupby('Poem Name')['Hewitt - ChatGPT'].std().reset_index()

In [30]:

df_std_hewitt_google

Unnamed: 0,Poem Name,Hewitt - Google
0,Amiran,0.138174
1,As Once You Did O Lady As Once You Did,0.191696
2,Bakur,0.144706
3,Consciences Song,0.139759
4,I Believe I Always Have Believed,0.13796
5,I Feel Like Singing And I Shall Sing,0.184503
6,Loneliness,0.186056
7,Some Day It Will Happen I Shall Die,0.125999
8,That In Truth Is Not Manliness,0.158653
9,The Law Of The World Is Thus,0.117547


In [31]:
df_std_hewitt_gemini

Unnamed: 0,Poem Name,Hewitt - Gemini
0,Amiran,0.118844
1,As Once You Did O Lady As Once You Did,0.152344
2,Bakur,0.130029
3,Consciences Song,0.115832
4,I Believe I Always Have Believed,0.035142
5,I Feel Like Singing And I Shall Sing,0.097183
6,Loneliness,0.145895
7,Some Day It Will Happen I Shall Die,0.113904
8,That In Truth Is Not Manliness,0.138101
9,The Law Of The World Is Thus,0.122122


In [32]:
df_std_hewitt_chatgpt

Unnamed: 0,Poem Name,Hewitt - ChatGPT
0,Amiran,0.102408
1,As Once You Did O Lady As Once You Did,0.136832
2,Bakur,0.146714
3,Consciences Song,0.14171
4,I Believe I Always Have Believed,0.101204
5,I Feel Like Singing And I Shall Sing,0.077673
6,Loneliness,0.112824
7,Some Day It Will Happen I Shall Die,0.150555
8,That In Truth Is Not Manliness,0.155026
9,The Law Of The World Is Thus,0.118905


In [33]:
df_mean_hewitt_google

Unnamed: 0,Poem Name,Hewitt - Google
0,Amiran,0.608679
1,As Once You Did O Lady As Once You Did,0.64519
2,Bakur,0.537513
3,Consciences Song,0.546356
4,I Believe I Always Have Believed,0.586971
5,I Feel Like Singing And I Shall Sing,0.67313
6,Loneliness,0.651978
7,Some Day It Will Happen I Shall Die,0.526339
8,That In Truth Is Not Manliness,0.375967
9,The Law Of The World Is Thus,0.593416


In [34]:
df_mean_hewitt_gemini

Unnamed: 0,Poem Name,Hewitt - Gemini
0,Amiran,0.739079
1,As Once You Did O Lady As Once You Did,0.729183
2,Bakur,0.646156
3,Consciences Song,0.664082
4,I Believe I Always Have Believed,0.758151
5,I Feel Like Singing And I Shall Sing,0.773672
6,Loneliness,0.758392
7,Some Day It Will Happen I Shall Die,0.720158
8,That In Truth Is Not Manliness,0.628519
9,The Law Of The World Is Thus,0.703569


In [35]:
df_mean_hewitt_chatgpt

Unnamed: 0,Poem Name,Hewitt - ChatGPT
0,Amiran,0.750303
1,As Once You Did O Lady As Once You Did,0.735404
2,Bakur,0.65244
3,Consciences Song,0.688849
4,I Believe I Always Have Believed,0.73632
5,I Feel Like Singing And I Shall Sing,0.811298
6,Loneliness,0.791746
7,Some Day It Will Happen I Shall Die,0.702109
8,That In Truth Is Not Manliness,0.64647
9,The Law Of The World Is Thus,0.716877


In [38]:
print('Google', df_std_hewitt_google['Hewitt - Google'].mean())
print('Gemini', df_std_hewitt_gemini['Hewitt - Gemini'].mean())
print('ChatGPT', df_std_hewitt_chatgpt['Hewitt - ChatGPT'].mean())

Google 0.15406655603340805
Gemini 0.12116039422014982
ChatGPT 0.12653621062249046


In [41]:

kw_model = KeyBERT(model = 'all-mpnet-base-v2')

In [42]:
####### METHODOLOGY FOR OBTAINING KEYWORDS

# We encode verses using the MPNet-base model to compute the verse by verse semantic similarity. 
# We use the MPNet-base model for extraction of keywords (using KeyBERT) from all chapters. 
# However, given the constraint in the MPNet-base model that number of tokens should not exceed 384, it would not be possible to encode large chapters directly. 
# Hence, we propose a method to overcome this limitation by breaking each chapter into paragraphs of 15 verses. 
# We include 3 verses from the previous paragraph into the current paragraph to retain some context and maintain continuity. 
# For example, in the first paragraph, verses 1-15 are included, and in the second paragraph verses 13-27, then 25-39, and so on.  
# We keep the top 20 keywords because keywords that have a lower similarity score in the original paragraph may be more relevant when the entire paragraph is considered.
 
# Next, we extract the keywords for all paragraphs i with 20 candidate keywords of paragraph j such that i!=j. 
# For each keyword, we add up its cosine similarity score across paragraphs. 
# Finally, we obtain the top 10 keywords having the highest cumulative scores. 
# The key idea here is that if a term is a keyword in a certain paragraph, it also needs to be sufficiently close to other paragraphs in the higher dimensional vector space
# to qualify as a keyword for the entire chapter. 
# We use MMR with a diversity value of 0.5 to prevent the selection of similar meaning keywords.

In [None]:


top_n = 20
final_df = pd.DataFrame()

for chapter in range(1, 19):
  df = pd.read_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/Translated_slokas/predicted sentiment/chapter '+ str(chapter))
  #df= pd.read_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/Sentiment Predictions/mahatma gandhi/chapter ' + str(chapter))
  df = df.drop(['Unnamed: 0', 'Official report'], axis=1)

  score_list = []
  text_list = []
  string_list = []
  keyword_score_map = {}
  keyword_count_map = {}

  for i in range(math.ceil(df.shape[0]/15)):

    if i==0:
      text = ' '.join(df[0:15].Tweet)

    else:
      if df.shape[0] >= 15*(i+1) - 3:
        text = ' '.join(df[(15*i - 3):(15*(i+1)-3)].Tweet)

      else:
        text = ' '.join(df[(15*i-3):df.shape[0]].Tweet)

    list1 = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words='english', top_n = top_n, use_mmr=True, diversity=0.5)
    score_list.append(list1)
    text_list.append(text)

    # print(text)
    # print(list1)

    for jj in range(0, top_n):
      # print('keyword - score ', score_list[0][jj])
      # print('keyword ', score_list[0][jj][0])
      # print('score ', score_list[0][jj][1])

      keyword = score_list[0][jj][0]
      string_list.append(keyword)
      keyword_score_map[keyword] = 0
      keyword_count_map[keyword] = 0

  for ii in range(math.ceil(df.shape[0]/15)):
    for jj in range(math.ceil(df.shape[0]/15)):
      # if chapter==12 or chapter==15 or ii!=jj:

        list2 = kw_model.extract_keywords(text_list[jj], candidates=string_list[top_n*ii : top_n*(ii+1)], keyphrase_ngram_range=(1, 1), stop_words='english', top_n = top_n,
                                          use_mmr=True, diversity=0.3)
        #print(chapter , df.shape[0])
        for kk in range(0, len(list2)):
         # if not list2[kk][0]:
          #  break
          #print(list2[kk][0])
          keyword = list2[kk][0]
          keyword_score = list2[kk][1]
          keyword_score_map[keyword]+=keyword_score
          keyword_count_map[keyword]+=1


  keyword_score_map = dict( sorted(keyword_score_map.items(), key=operator.itemgetter(1),reverse=True))      # sort in descending order according to scores
  keyword_df = pd.DataFrame(keyword_score_map.items(), columns=['Keyword', 'Score'])                         # convert dictionary to dataframe
  keyword_df = keyword_df[:10]                                                                               # TOP 10 KEYWORDS most relevant to text 
  keyword_df['Chapter'] = chapter
  keyword_df['Author'] = 'Google Translator'

  final_df = pd.concat([final_df, keyword_df], axis=0)

final_df

Unnamed: 0,Keyword,Score,Chapter,Author
0,arjuna,7.9260,1,Google Translator
1,dhṛtarāṣṭra,5.6120,1,Google Translator
2,battle,4.9512,1,Google Translator
3,charioteers,3.3168,1,Google Translator
4,dyāsa,2.6628,1,Google Translator
...,...,...,...,...
5,senses,3.7662,18,Google Translator
6,renounce,2.7978,18,Google Translator
7,delusion,2.5776,18,Google Translator
8,sacrifices,2.5278,18,Google Translator


In [None]:
final_df.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/google_keywords.csv')


In [None]:
df = pd.read_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/Google_keyword.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.drop_duplicates(subset='Keyword', keep='last', inplace=True)
onlykey = df.loc[(df['Author'] == 'Google Translator')]
onlykey=onlykey['Keyword']
onlykey

1      dhratarashtra
2             battle
3        charioteers
4              dyasa
7               sake
           ...      
175           senses
176         renounce
177         delusion
178       sacrifices
179             fear
Name: Keyword, Length: 105, dtype: object

In [None]:
onlykey.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/onlykey.tsv', sep='\t', index=None, header=None)

In [None]:
chapter_12 = final_df.loc[(final_df['Chapter'] == 15)]
chapter_12

Unnamed: 0,Keyword,Score,Chapter,Author
0,vedas,1.7496,15,Google Translator
1,godhead,1.5788,15,Google Translator
2,entities,1.2612,15,Google Translator
3,universe,1.2188,15,Google Translator
4,knowledge,1.0428,15,Google Translator
5,body,0.9134,15,Google Translator
6,nourish,0.8264,15,Google Translator
7,eternal,0.6714,15,Google Translator
8,perceive,0.5876,15,Google Translator
9,ancients,0.4842,15,Google Translator


In [None]:
keyword_score_map = dict( sorted(keyword_score_map.items(), key=operator.itemgetter(1),reverse=True))


In [None]:
df = pd.DataFrame(keyword_score_map.items(), columns=['Keyword', 'Score'])


In [None]:
df[:10]


Unnamed: 0,Keyword,Score
0,arjuna,10.3764
1,renunciation,7.0056
2,living,5.0796
3,described,4.8228
4,godhead,4.6542
5,senses,3.7662
6,renounce,2.7978
7,delusion,2.5776
8,sacrifices,2.5278
9,fear,2.3514


In [None]:
import numpy as np
from tqdm import tqdm
from typing import List, Union, Tuple
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

!pip install keybert


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keybert
  Downloading keybert-0.7.0.tar.gz (21 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.3 MB/s 
Collecting rich>=10.4.0
  Downloading rich-12.6.0-py3-none-any.whl (237 kB)
[K     |████████████████████████████████| 237 kB 26.2 MB/s 
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 7.9 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 74.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 58.1 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Do

In [None]:
!pip install keybert==0.7.0

In [None]:
from keybert import KeyBERT
# KeyBERT
from keybert._mmr import mmr
from keybert._maxsum import max_sum_distance
from keybert._highlight import highlight_document
from keybert.backend._utils import select_backend

In [None]:
model = KeyBERT(model = 'all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
model

In [None]:
df = pd.read_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/Semantic Analysis Results/KeyBERT/shri_purohit_swami.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.drop_duplicates(subset='Keyword', keep='last', inplace=True)
df

Unnamed: 0,Keyword,Score,Chapter,Author
2,duryodhana,4.2736,1,Shri Purohit Swami
3,generals,3.7944,1,Shri Purohit Swami
4,chariot,3.7908,1,Shri Purohit Swami
5,bowmen,3.6540,1,Shri Purohit Swami
6,valiant,3.4804,1,Shri Purohit Swami
...,...,...,...,...
175,relinquishment,10.1544,18,Shri Purohit Swami
176,desire,8.8344,18,Shri Purohit Swami
177,renounce,7.8690,18,Shri Purohit Swami
178,forgoing,7.2534,18,Shri Purohit Swami


In [None]:
candidates = df['Keyword'].tolist()

In [None]:
candidates

['duryodhana',
 'generals',
 'chariot',
 'bowmen',
 'valiant',
 'die',
 'trumpets',
 'commanded',
 'compassion',
 'endure',
 'battle',
 'kill',
 'profess',
 'bheeshma',
 'away',
 'refraining',
 'consciously',
 'duty',
 'yoga',
 'nourish',
 'reincarnate',
 'materialism',
 'changeless',
 'ancestors',
 'births',
 'meditating',
 'praise',
 'divine',
 'sin',
 'perceives',
 'spirituality',
 'meditation',
 'celibacy',
 'purification',
 'renounces',
 'virtuous',
 'vow',
 'sage',
 'spiritual',
 'righteousness',
 'natures',
 'unto',
 'consciousness',
 'eternity',
 'mysticism',
 'souls',
 'multitude',
 'immovable',
 'beings',
 'mankind',
 'holiest',
 'progenitors',
 'contentment',
 'omnipresent',
 'universe',
 'behold',
 'embraced',
 'radiance',
 'powers',
 'petal',
 'devotee',
 'worship',
 'meditate',
 'verily',
 'realise',
 'attention',
 'devotion',
 'vitality',
 'perception',
 'omniscient',
 'nature',
 'matter',
 'indolence',
 'divinity',
 'sinless',
 'infatuation',
 'spirit',
 'reborn',
 'kno

In [None]:
doc,candidate_embeddings = model.extract_embeddings(candidates)

In [None]:

print(len(candidate_embeddings))
print(len(candidate_embeddings[0]))

113
768


In [None]:
candidate_embeddings

array([[-0.00571294,  0.03313369, -0.01764511, ...,  0.01239058,
         0.04599014, -0.02001391],
       [-0.017692  ,  0.11096726, -0.04223696, ...,  0.05520196,
         0.01838507, -0.00521855],
       [ 0.03293337,  0.00913427, -0.03127398, ...,  0.02151289,
        -0.04312754, -0.02665976],
       ...,
       [-0.00420476,  0.06491704, -0.0065593 , ...,  0.0080807 ,
         0.00508294, -0.01001293],
       [ 0.01612801, -0.0735716 , -0.01101758, ...,  0.00219575,
        -0.03896737,  0.0096657 ],
       [-0.02084935, -0.02647869, -0.02503866, ...,  0.02810897,
        -0.03670445, -0.03136148]], dtype=float32)

In [None]:
candidate_embeddings = pd.DataFrame(candidate_embeddings)
candidate_embeddings.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/purohit_keywordsall.tsv', sep='\t', index=None, header=None)

In [None]:
df.to_csv('/content/drive/MyDrive/github/sentimentanalysis_bhagavadgita-main/SM_RESULTS/purohit_metadata.tsv',  index=False, sep='\t')

In [None]:


import numpy as np
!pip install scikit-learn==0.21.3
from sklearn.datasets import load_digits
from scipy.spatial.distance import pdist
#from sklearn.manifold.t_sne import _joint_probabilities
from scipy import linalg
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import squareform
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
palette = sns.color_palette("bright", 2)

In [None]:
tsne = TSNE(perplexity=30)
X_embedded = tsne.fit_transform(candidate_embeddings)

In [None]:
sns.scatterplot(X_embedded[:,0], X_embedded[:,1], legend='full')
plt.xlabel(xlabel='T-SNE dimension 1')
plt.ylabel(ylabel='T-SNE dimension 2')

plt.show()