## Initial paragraph division

### Import dependencies and data

In [None]:
# load required dependencies
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer

In [None]:
# load data 
df = pd.read_csv("final.csv")

In [None]:
# Set the display options
pd.set_option('display.max_colwidth', 100)
df.head(10)

Unnamed: 0,heading,train,target
0,Q1 2015 Accenture PLC Earnings Call - Final,OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call.\n(Operator Instruc...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
1,Q1 2015 ACE Ltd Earnings Call - Final,"OPERATOR: Good day, and welcome to the ACE Limited First Quarter 2015 Earnings Conference Call. ...",OVERVIEW\nCo. reported 1Q15 after-tax operating income of $745m or $2.25 per share.\nFINANCIAL D...
2,Q1 2015 Activision Blizzard Inc Earnings Call - Final,OPERATOR: Good day and welcome to the Activision Blizzard quarter one 2015 earnings conference c...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of $1.3b and 1Q15 GAAP EPS of $0.53. Expects 2015 GAA...
3,Q1 2015 Adobe Systems Inc Earnings Call - Final,OPERATOR: I would like to welcome you to Adobe Systems' first quarter FY15 earnings conference c...,OVERVIEW\nADBE reported 1Q15 revenue of $1.109b and GAAP diluted EPS of $0.17. Expects 2Q15 reve...
4,Q1 2015 Advanced Micro Devices Inc Earnings Call - Final,"OPERATOR: Good day, ladies and gentlemen, and thank you for your patience. You have joined AMD's...",OVERVIEW\nAMD reported 1Q15 revenue of $1.03b and net loss of $73m or $0.09 per share. Co. expec...
5,Q1 2015 Agilent Technologies Inc Earnings Call - Final,"OPERATOR: Good day ladies and gentlemen, and welcome to Agilent Technologies' first-quarter 2015...",OVERVIEW\nCo. reported 1Q15 revenue of $1.03b and non-GAAP EPS of $0.41. Expects FY15 revenue to...
6,Q1 2015 American Tower Corp Earnings Call - Final,OPERATOR: Good morning. My name is Steve and I will be your conference operator today. At this t...,OVERVIEW\nAMT reported 1Q15 net income attributable to Co. common stockholders of approx. $183m ...
7,Q1 2015 AmerisourceBergen Corp Earnings Call - Final,"OPERATOR: Ladies and gentlemen, thank you for standing by. Welcome to the AmerisourceBergen earn...","OVERVIEW\nCo. reported 1Q15 revenues of $33.6b, adjusted operating income of $436m and adjusted ..."
8,Q1 2015 Amgen Inc Earnings Call - Final,,OVERVIEW\nCo. reported 1Q15 revenues of $5b. Expects 2015 revenues to be $20.9-21.3b and adjuste...
9,Q1 2015 Analog Devices Inc Earnings Call - Final,"OPERATOR: Good afternoon. My name is Jennifer, and I will be your conference facilitator. At thi...","OVERVIEW\n\n\nADI reported 1Q15 sales of $772m and diluted EPS, excluding special items, of $0.6..."


In [None]:
df.isna().sum()

heading     0
train      18
target      1
dtype: int64

In [None]:
# Report the number of missing values per column, then discard every row
# that has at least one missing value and renumber the index from 0.
na_counts = df.isna().sum()
print("NA lines = ", na_counts)
df = df.dropna().reset_index(drop=True)
print("NA lines dropped")

NA lines =  heading     0
train      18
target      1
dtype: int64
NA lines dropped


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-j-6B")

In [None]:
#defining important variables
max_size_tokenized_paragraph = 256

# Splitting criteria: a newline that directly follows a dot, or a blank line (\n\n).
# The pattern is the same for every call, so it is defined once outside the loop.
split_pattern = r'(?<=\.)\n|\n\n'

output_columns = ["Earnings Call Name","Tokenized Size","Non-Tokenized Size","Macro Paragraph", "Tentative_Target"]

# One record per macro paragraph. The DataFrame is built once at the end instead
# of concatenating inside the loop, which re-copies all previous rows every time.
macro_paragraph_records = []

iterations = df.shape[0]

#selecting an earning call as a subset of the data
for i in tqdm(range(iterations), desc="Processing", unit="iteration"):
    earnings_call_title = df.loc[i,"heading"]
    tentative_target = df.loc[i,"target"]
    text = df.loc[i,"train"]

    # Split the text into paragraphs using the defined pattern
    paragraphs = re.split(split_pattern, text)

    # Group consecutive paragraphs into macro paragraphs whose tokenized size
    # stays within max_size_tokenized_paragraph. A single paragraph that is
    # already larger than the limit still becomes its own macro paragraph.
    macro_paragraphs = []
    current_macro_paragraph = ""

    for paragraph in paragraphs:
        tokenized_paragraph = tokenizer.encode(paragraph)
        if len(tokenized_paragraph) + len(tokenizer.encode(current_macro_paragraph)) <= max_size_tokenized_paragraph:
            current_macro_paragraph += paragraph + " "
        else:
            # Flush what has been accumulated so far. Skip it when it is empty
            # (e.g. the very first paragraph already exceeds the limit): an empty
            # macro paragraph would be written to the CSV and read back as NaN.
            if current_macro_paragraph.strip():
                macro_paragraphs.append(current_macro_paragraph.strip())
            current_macro_paragraph = paragraph + " "

    # Add the remaining paragraphs as a new macro_paragraph
    if current_macro_paragraph.strip():
        macro_paragraphs.append(current_macro_paragraph.strip())

    # Record each macro paragraph together with its sizes and the call it belongs to
    for macro_paragraph in macro_paragraphs:
        macro_paragraph_records.append({
            "Earnings Call Name": earnings_call_title,
            "Tokenized Size": len(tokenizer.encode(macro_paragraph)),
            "Non-Tokenized Size": len(macro_paragraph),
            "Macro Paragraph": macro_paragraph,
            "Tentative_Target": tentative_target,
        })

# Passing the columns explicitly keeps the expected layout even when there are no records
combined_df = pd.DataFrame(macro_paragraph_records, columns=output_columns)

print("combined_df.shape: ",combined_df.shape)
combined_df

Processing:   1%|          | 27/2441 [01:31<2:15:46,  3.37s/iteration]
Processing: 100%|██████████| 2441/2441 [07:07<00:00,  5.71iteration/s]

combined_df.shape:  (94370, 5)





Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instruct...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's call. Pierre will begin with an overview of our re...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certain non-GAAP financial measures, which we believe p...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in line with our expectations. We grew revenues 10% in...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, Pierre. Happy Holidays to all of you. Thank you fo...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
...,...,...,...,...,...
94365,Q4 2023 Salesforce Inc Earnings Call - Final,185,762,"Non-GAAP operating margin for fiscal '23 was 22.5%, significantly above our forecast, an improve...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...
94366,Q4 2023 Salesforce Inc Earnings Call - Final,224,1077,"I also want to call out the great progress we have made with MuleSoft and Tableau. As you know, ...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...
94367,Q4 2023 Salesforce Inc Earnings Call - Final,174,748,"I'm excited to announce that looking forward to fiscal year '24, we expect a non-GAAP operating ...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...
94368,Q4 2023 Salesforce Inc Earnings Call - Final,201,953,"We're also thrilled to welcome 3 new members to our Board, Mason Morfit, the CEO and Chief Inves...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...


In [None]:
combined_df['Matched_KPIs'] = ''
combined_df

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instruct...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's call. Pierre will begin with an overview of our re...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certain non-GAAP financial measures, which we believe p...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in line with our expectations. We grew revenues 10% in...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, Pierre. Happy Holidays to all of you. Thank you fo...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
...,...,...,...,...,...,...
94365,Q4 2023 Salesforce Inc Earnings Call - Final,185,762,"Non-GAAP operating margin for fiscal '23 was 22.5%, significantly above our forecast, an improve...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...,
94366,Q4 2023 Salesforce Inc Earnings Call - Final,224,1077,"I also want to call out the great progress we have made with MuleSoft and Tableau. As you know, ...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...,
94367,Q4 2023 Salesforce Inc Earnings Call - Final,174,748,"I'm excited to announce that looking forward to fiscal year '24, we expect a non-GAAP operating ...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...,
94368,Q4 2023 Salesforce Inc Earnings Call - Final,201,953,"We're also thrilled to welcome 3 new members to our Board, Mason Morfit, the CEO and Chief Inves...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...,


In [None]:
combined_df.to_csv('paragraph_div.csv', index=False)

In [None]:
# Count how many macro paragraphs each earnings call produced
# (one group per distinct 'Earnings Call Name' value).
paragraphs_per_call = combined_df.groupby('Earnings Call Name').size()
grouped_df = paragraphs_per_call.reset_index(name='Paragraph Count')
grouped_df

Unnamed: 0,Earnings Call Name,Paragraph Count
0,Q1 2015 ACE Ltd Earnings Call - Final,55
1,Q1 2015 Accenture PLC Earnings Call - Final,13
2,Q1 2015 Activision Blizzard Inc Earnings Call - Final,45
3,Q1 2015 Adobe Systems Inc Earnings Call - Final,29
4,Q1 2015 Advanced Micro Devices Inc Earnings Call - Final,5
...,...,...
2436,Q4 2022 Texas Instruments Inc Earnings Call - Final,26
2437,Q4 2023 Autodesk Inc Earnings Call - Final,4
2438,Q4 2023 CrowdStrike Holdings Inc Earnings Call - Final,55
2439,Q4 2023 NVIDIA Corp Earnings Call - Final,37


## Keep going from here because we already have paragraph division

In [None]:
# load required dependencies
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-j-6B")

In [None]:
combined_df = pd.read_csv('paragraph_div.csv')
combined_df.head()

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
df = pd.read_csv("final.csv")

ATTENTION!


Some earnings calls are not complete! For some of them, only the first part of the text (the first paragraphs) appears in the final CSV file, not the complete call.

## Now we will go into sentence similarity / entity recognition to pair the KPIs to their corresponding paragraph

In [None]:
#!pip install sentence_transformers

In [None]:
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

In [None]:
# Use a single macro paragraph (row 80) as a worked example
paragraph = combined_df.loc[80,"Macro Paragraph"]

# Break the paragraph into sentences with NLTK
paragraph_sentences = nltk.sent_tokenize(paragraph)

print("Num. of sentences on macro paragraph:", len(paragraph_sentences))
# Show every sentence followed by a blank line
for paragraph_sentence in paragraph_sentences:
    print(paragraph_sentence, end="\n\n")


Num. of sentences on macro paragraph: 8
Now to our 2015 full-year numbers.

For 2015 on a GAAP basis, we expect revenues of $4.25 billion, an increase
of $110 million versus our guidance in February; product costs of 23%; and operating expenses of 50%.

For both
GAAP and non-GAAP, we expect interest expense of $202 million.

Our GAAP tax rate is expected to be 22%.

We
expect 750 million fully diluted shares both for GAAP and non-GAAP.

And GAAP EPS is expected to be $0.98, up
$0.09 from our February guidance.

For 2015, on a non-GAAP basis, we expect revenues of $4.245 billion, $25 million higher than our February
guidance; product costs of 24%; operating expenses of 45%; and an operating margin of 31%, 100 basis points
better than our February guidance.

Our non-GAAP tax rate is expected to be 24%.



In [None]:
def preprocess_target(X):
    """Split a raw target summary into individual target sentences.

    The text is split at every newline that directly follows a dot. Each piece
    is then stripped of surrounding whitespace and a leading list number
    ("1. ", "10. ", ...) is removed if present. Pieces that are empty after
    stripping are dropped.

    Parameters
    ----------
    X : str
        Raw target text of one earnings call.

    Returns
    -------
    list of str
        The cleaned target sentences, in their original order.
    """
    #split text into individual targets
    pattern = r'(?<=\.)\n'
    target_sentences = re.split(pattern, X)

    #taking away the list characters
    target_sentences_filt = []
    for sentence in target_sentences:
        # Leading blank lines (e.g. '\n\n10. ...') would stop the '^' anchor
        # below from seeing the list number, so strip whitespace first.
        sentence = sentence.strip()
        if not sentence:
            # A trailing newline in X yields an empty last piece; it is not a target.
            continue
        # Use regular expressions to match and remove the listation if it exists
        sentence_without_listation = re.sub(r'^\d+\.\s+', '', sentence)
        target_sentences_filt.append(sentence_without_listation)

    return target_sentences_filt

In [None]:
target_paragraph = combined_df.loc[80,"Tentative_Target"]
target_sentences_filt = preprocess_target(target_paragraph)
print(len(target_sentences_filt))
target_sentences_filt

180


['OVERVIEW\nATVI reported 1Q15 GAAP revenues of $1.3b and 1Q15 GAAP EPS of $0.53. Expects 2015 GAAP revenues to\nbe $4.25b and GAAP EPS to be $0.98. Expected 2Q15 GAAP net revenues are $930m and GAAP EPS is $0.21.',
 'FINANCIAL DATA\n1. 1Q15 GAAP revenues = $1.3b.',
 '1Q15 non-GAAP revenues = $703m.',
 '1Q15 GAAP EPS = $0.53.',
 '1Q15 non-GAAP EPS = $0.16.',
 '1Q15 YoverY GAAP revenue growth = 15%.',
 '1Q15 GAAP operating margin = 43%.',
 '1Q15 non-GAAP operating margin = 29%.',
 '1Q15-end cash and investments = approx. $4.5b.',
 '1Q15-end total debt = $4.12b.',
 '\n\n10. 2015 GAAP revenue guidance = $4.25b.',
 '2015 non-GAAP revenue guidance = $4.425b.',
 '2Q15 GAAP net revenue guidance = $930m.',
 '2Q15 non-GAAP revenue guidance = $650m.',
 '2015 GAAP EPS guidance = $0.98.',
 '2015 non-GAAP EPS guidance = $1.20.',
 '2Q15 GAAP EPS guidance = $0.21.',
 '2Q15 non-GAAP EPS guidance = $0.07.',
 'PRESENTATION SUMMARY -\nOpening Remarks (B.K.)\n1. 1Q15:\n1. For first time, recognized by For

In [None]:
from sentence_transformers import SentenceTransformer, util
# Checkpoint used here: all-MiniLM-L6-v2 (alternative: paraphrase-MiniLM-L6-v2)
model = SentenceTransformer('all-MiniLM-L6-v2')

# model.encode() turns each list of sentences into its embeddings
emb1, emb2 = model.encode(paragraph_sentences), model.encode(target_sentences_filt)

# Cosine similarity between every paragraph sentence and every target sentence
cos_sim = util.cos_sim(emb1, emb2)

In [None]:
cos_sim.shape

torch.Size([9, 187])

In [None]:
#cos_sim[4] #[1646]

In [None]:
#Encode all sentences
embeddings = model.encode(paragraph_sentences)
target_embedings = model.encode(target_sentences_filt)

#Compute cosine similarity between every paragraph sentence and every target sentence
cos_sim = util.cos_sim(embeddings, target_embedings)

#Add all pairs to a list with their cosine similarity score.
# cos_sim is rectangular (paragraph sentences x target sentences), not a square
# self-similarity matrix, so every (i, j) cell is a distinct pair and all of
# them must be visited; an upper-triangle loop would skip most targets.
num_sentences, num_targets = cos_sim.shape
all_sentence_combinations = []
for i in range(num_sentences):
    for j in range(num_targets):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

#Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[0:5]:
    print("{} \t {} \t {:.4f}".format(paragraph_sentences[i], target_sentences_filt[j], cos_sim[i][j]))
    print()

Top-5 most similar pairs:
Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year. 	 1Q15 net income = $892m. 	 0.6062

Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year. 	 FINANCIAL DATA
1. 1Q15 net revenues = $7.9b. 	 0.6016

Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year. 	 1Q15 YoverY net revenue growth (US dollars) = 7%. 	 0.4978

Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year

**NOTE:** Cosine similarity is not really giving us a reliable way of matching the target KPI to the original paragraphs of the earnings call.

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
import torch
torch.cuda.is_available()

True

In [None]:
ea_name = combined_df.loc[80,"Earnings Call Name"]
ea_name

'Q1 2015 Activision Blizzard Inc Earnings Call - Final'

In [None]:
combined_df[combined_df["Earnings Call Name"]==ea_name]#.shape[0]

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
68,Q1 2015 Activision Blizzard Inc Earnings Call ...,180,780,OPERATOR: Good day and welcome to the Activisi...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
69,Q1 2015 Activision Blizzard Inc Earnings Call ...,184,969,These are forward-looking statements that are ...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
70,Q1 2015 Activision Blizzard Inc Earnings Call ...,237,1020,I'd like to note that certain numbers we will ...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
71,Q1 2015 Activision Blizzard Inc Earnings Call ...,209,1037,"Usually, I end my remarks by thanking our incr...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
72,Q1 2015 Activision Blizzard Inc Earnings Call ...,229,1058,"In the last 12 months, we had over 150 million...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
73,Q1 2015 Activision Blizzard Inc Earnings Call ...,254,1077,Our greatest achievement continues to be our a...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
74,Q1 2015 Activision Blizzard Inc Earnings Call ...,235,983,"Also, the numbers I'll be quoting are compared...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
75,Q1 2015 Activision Blizzard Inc Earnings Call ...,192,985,Blizzard Entertainment had steady and strong p...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
76,Q1 2015 Activision Blizzard Inc Earnings Call ...,202,898,"And new franchises, like Hearthstone and Heroe...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
77,Q1 2015 Activision Blizzard Inc Earnings Call ...,252,1037,"In terms of cash flow in Q1, we generated stro...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,


In [None]:
%%time
#now we´ll do it for a complete earning call of a company.. 
total_extracted_kpi = []
for i in range (68,113): #I know there are 12 rows in combined_df that belont to this company.. 
    # Iterate over each paragraph
    paragraph = combined_df.loc[i,"Macro Paragraph"] # one paragraph 
    print("------------------------------------------------------------")
    print("Macro paragraph: ", paragraph)
    
    # Tokenize the paragraph into sentences
    paragraph_sentences = nltk.sent_tokenize(paragraph)
    target_embedings = model.encode(target_sentences_filt)
   
    
    for sentence in paragraph_sentences:
        #initialize empty list of matched targets 
        matched_targets=[]
        
        # Encode the sentence in the paragraph
        embeddings = model.encode(sentence)
        
        # Compute cosine similarity between paragraph and target sentences
        cos_sim = util.cos_sim(embeddings, target_embedings)

        # Add all pairs to a list with their cosine similarity score
        all_sentence_combinations = []
        for j in range(len(cos_sim[0])):
            all_sentence_combinations.append([cos_sim[0][j], 0, j])

        # Sort list by the highest cosine similarity score
        all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)
        
        docs_paragraphs = nlp(paragraph)  
        # Print the top 5 most similar sentences for the current paragraph
        for score, i, j in all_sentence_combinations:
            docs_target = nlp(target_sentences_filt[j])
            # print([token.text for token in docs_target if token.like_num])
            # if a matched target sentence has numbers, make sure they are indeed in the paragraph
            if (any([token.text for token in docs_target if token.like_num])):
                numbers_target =  [token.text for token in docs_target if token.like_num]
                numbers_paragraph = [token.text for token in docs_paragraphs if token.like_num]
                if any(x in numbers_target for x in numbers_paragraph):
                    if score >0.6:
                        print("==================================")
                        print("TARGET like Num:", target_sentences_filt[j])
                        matched_targets.append(target_sentences_filt[j])
                        total_extracted_kpi.append(target_sentences_filt[j])
                        print("Similarity Score: {:.4f}".format(score))
            else:
                if score >0.6:
                        print("==================================")
                        print("TARGET not like Num:", target_sentences_filt[j])
                        matched_targets.append(target_sentences_filt[j])
                        total_extracted_kpi.append(target_sentences_filt[j])
                        print("Similarity Score: {:.4f}".format(score))
            

------------------------------------------------------------
Macro paragraph:  OPERATOR: Good day and welcome to the Activision Blizzard quarter one 2015 earnings conference call. Today's conference is being recorded. At this time, for opening remarks and introductions, I would like to turn today's call over to Amrita Ahuja. AMRITA AHUJA, SVP OF IR, ACTIVISION BLIZZARD INC: Good afternoon. Thank you for joining us today for
Activision Blizzard's first-quarter 2015 conference call. Speaking on this call today will be Bobby Kotick, CEO of Activision Blizzard; Dennis Durkin, CFO of Activision
Blizzard; Eric Hirshberg, CEO of Activision Publishing; Mike Morhaime, CEO of Blizzard Entertainment; and
Thomas Tippl, COO of Activision Blizzard. I would like to remind everyone that during this call, we will be making statements that are not historical facts.
------------------------------------------------------------
Macro paragraph:  These are forward-looking statements that are based on curren

In [None]:
# Targets that were never matched to any sentence of the call, in target order
extracted_kpis = set(total_extracted_kpi)
not_matched_kpi = [kpi for kpi in target_sentences_filt if kpi not in extracted_kpis]
print(f"KPIs not matched = {len(not_matched_kpi)} out of {len(target_sentences_filt)}")
print("KPIs not matched: ", not_matched_kpi)


KPIs not matched = 47 out of 180
KPIs not matched:  ['1Q15-end cash and investments = approx. $4.5b.', '2Q15 GAAP EPS guidance = $0.21.', 'Better than expected results.', '1Q record, absolute.', 'All-time high, percentage.', 'Relatively lighter slate in 1Q15.', 'Numbers vs. 1Q14, unless otherwise noted.', 'EPS $0.16.', 'Blizzard Entertainment:\n1. Had steady and strong performance on Hearthstone with ongoing engagement on 4Q expansion, Goblins vs\nGnomes.', 'Key Metrics:\n1. All percentages based on revenues, except tax rate.', 'OpEx 37%.', 'OpEx 50%.', 'Adjusted EBITDA $223m.', 'Operating cash flow $209m.', 'Free cash flow $188m after CapEx.', 'Net cash $360m.', "2Q15 Outlook:\n1. Activision Publishing's slate includes additional downloadable content releases from Call of Duty and Destiny\nand continued live operations during open beta for Call of Duty Online in China.", 'Product costs 20%.', 'OpEx 52%.', 'EPS $0.21.', 'Product costs 19%.', 'OpEx 63%.', 'Tax rate approx. 26%.', 'Produ

In [None]:
# alternative to finding all numbers....

# Matches integers or decimals: one or more digits, optionally followed by ".digits"
pattern = r"\d+(?:\.\d+)?"
# Collect every match of the pattern in the target sentence at index 8
a = re.compile(pattern).findall(target_sentences_filt[8])
print(a)

['11', '30', '14', '4.5']


#### **Attention** This could also be included in the function and would give us better results than the current ones.

## Now that we've seen how it works, let's create a function to match all KPIs to their origin.

In [None]:
#pull all dependencies and start from here... 
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-j-6B")
from pandarallel import pandarallel

In [None]:
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') #paraphrase-MiniLM-L6-v2 or all-MiniLM-L6-v2

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
#Import the data
combined_df = pd.read_csv('paragraph_div.csv')
combined_df.head()

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
def preprocess_target(X):
    """Split a raw target summary into individual target sentences.

    The text is split at every newline that directly follows a dot. Each piece
    is then stripped of surrounding whitespace and a leading list number
    ("1. ", "10. ", ...) is removed if present. Pieces that are empty after
    stripping are dropped.

    Parameters
    ----------
    X : str
        Raw target text of one earnings call.

    Returns
    -------
    list of str
        The cleaned target sentences, in their original order.
    """
    #split text into individual targets
    pattern = r'(?<=\.)\n'
    target_sentences = re.split(pattern, X)

    #taking away the list characters
    target_sentences_filt = []
    for sentence in target_sentences:
        # Leading blank lines (e.g. '\n\n10. ...') would stop the '^' anchor
        # below from seeing the list number, so strip whitespace first.
        sentence = sentence.strip()
        if not sentence:
            # A trailing newline in X yields an empty last piece; it is not a target.
            continue
        # Use regular expressions to match and remove the listation if it exists
        sentence_without_listation = re.sub(r'^\d+\.\s+', '', sentence)
        target_sentences_filt.append(sentence_without_listation)

    return target_sentences_filt

In [None]:
# now lets make it a function 
def match_kpis(row):
    """Find the target KPI sentences that match one macro paragraph.

    Every sentence of the macro paragraph is compared with every target
    sentence by the cosine similarity of their embeddings. A target is kept when:

    * it contains number-like tokens, at least one of them also occurs among
      the number-like tokens of the macro paragraph, and the score is > 0.55; or
    * it contains no number-like tokens and the score is > 0.65.

    A target matched by several sentences of the paragraph is listed once per
    matching sentence.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the keys 'Macro Paragraph' and 'Tentative_Target'.

    Returns
    -------
    str
        The matched target sentences joined with ' --- ', or an empty string
        when nothing matches or when either field is not a string.
    """
    #get the data
    paragraph = row['Macro Paragraph']
    raw_target = row['Tentative_Target']

    # Empty cells are read back from the CSV as NaN (a float), which the
    # tokenizer and the regex split cannot handle: there is nothing to match.
    if not isinstance(paragraph, str) or not isinstance(raw_target, str):
        return ""

    #preprocess the target
    target_sentences_filt = preprocess_target(raw_target)

    # Tokenize the paragraph into sentences
    paragraph_sentences = nltk.sent_tokenize(paragraph)
    if not target_sentences_filt or not paragraph_sentences:
        return ""

    target_embedings = model.encode(target_sentences_filt)

    # Number-like tokens of the paragraph and of each target do not depend on
    # the sentence being compared, so they are extracted once up front rather
    # than re-parsing the same texts for every sentence/target pair.
    numbers_paragraph = [token.text for token in nlp(paragraph) if token.like_num]
    target_numbers = [
        [token.text for token in nlp(target) if token.like_num]
        for target in target_sentences_filt
    ]

    #initialize empty list of matched targets
    matched_targets=[]

    for sentence in paragraph_sentences:

        # Encode the sentence in the paragraph
        embeddings = model.encode(sentence)

        # Similarity of this sentence to every target sentence
        scores = util.cos_sim(embeddings, target_embedings)[0]

        # Visit the targets from the highest to the lowest similarity score
        ranked_targets = sorted(range(len(scores)), key=lambda j: scores[j], reverse=True)

        for j in ranked_targets:
            numbers_target = target_numbers[j]
            if numbers_target:
                # if a matched target sentence has numbers, make sure at least
                # one of them is indeed in the paragraph
                if any(x in numbers_target for x in numbers_paragraph):
                    if scores[j] >0.55:
                        matched_targets.append(target_sentences_filt[j])
            else:
                if scores[j] >0.65:
                    matched_targets.append(target_sentences_filt[j])

    # Join the list elements into a single string
    joined_matched_targets = ' --- '.join(matched_targets)
    return joined_matched_targets
    

In [None]:
# Create a separate 15-row DataFrame for quick experiments.
# .copy() makes it independent of combined_df; assigning a column to a bare
# slice is what triggers pandas' SettingWithCopyWarning.
another_df = combined_df[0:15].copy()
another_df["Matched_KPIs"] = ""
another_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
%%time 
#without parallelization
another_df["Matched_KPIs"] = another_df.apply(lambda row: match_kpis(row), axis=1)
another_df.head(15)

CPU times: user 3min 47s, sys: 2.88 s, total: 3min 50s
Wall time: 2min 34s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Strong growth in consulting and outsourcing. -...
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,New bookings $7.7b. --- Grew revenues 10% in l...
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Durable revenue growth:\n\n\n1. Expanded busin...
5,Q1 2015 Accenture PLC Earnings Call - Final,225,1024,"With respect to sustainable margin expansion, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Sustainable margin expansion:\n1. Expanded ope...
6,Q1 2015 Accenture PLC Earnings Call - Final,205,923,We're pleased with the composition of our new ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,"Pleased with composition of new bookings, spec..."
7,Q1 2015 Accenture PLC Earnings Call - Final,179,978,"Before I cover the operating groups, let me pr...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Operating Groups:\n1. 15% growth in Communicat...
8,Q1 2015 Accenture PLC Earnings Call - Final,189,977,"In H&PS, the 13% growth in the quarter was lea...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,13% growth in H&PS. --- Led by significant gro...
9,Q1 2015 Accenture PLC Earnings Call - Final,195,915,"Resources grew 2%, up from last quarter, as we...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,2% growth in Resources. --- Ongoing challenges...


Everything works up to this point.

In [None]:
import os

# Hugging Face tokenizers run their own thread pool. Combining that with worker
# processes created later can deadlock (see the "process just got forked"
# warning printed further down), so switch the tokenizer-level parallelism off
# explicitly and let the process pool provide the parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import torch
# Select the 'spawn' start method for torch.multiprocessing; force=True
# replaces a start method that has already been set.
torch.multiprocessing.set_start_method('spawn', force=True)

In [None]:
# Imported here so the cell also works on a fresh kernel (the name was
# otherwise only imported in a later cell).
from multiprocessing import set_start_method

try:
    # force=True replaces a start method chosen by an earlier cell.
    set_start_method('spawn', force=True)
    print("spawned")
except RuntimeError as err:
    # Report the problem instead of hiding it; the notebook keeps running.
    print("could not set start method:", err)

spawned


In [None]:
%%time 
# Initialize pandarallel
# NOTE(review): `pandarallel` is not imported in this cell; it must already be
# available in the kernel.

pandarallel.initialize(progress_bar = True)

# Parallel counterpart of the sequential apply: match_kpis is called once per row.
another_df["Matched_KPIs"] = another_df.parallel_apply(lambda row: match_kpis(row), axis=1)
another_df.head(15)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

In [None]:
from multiprocessing import set_start_method


### While parallelization is not ready... I'll keep working on other things

In [None]:
# Display the experiment DataFrame.
# NOTE(review): this cell was labelled "reset index and make it a column",
# but it contains no reset_index() call; it only shows the frame.
another_df

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
5,Q1 2015 Accenture PLC Earnings Call - Final,225,1024,"With respect to sustainable margin expansion, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
6,Q1 2015 Accenture PLC Earnings Call - Final,205,923,We're pleased with the composition of our new ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
7,Q1 2015 Accenture PLC Earnings Call - Final,179,978,"Before I cover the operating groups, let me pr...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
8,Q1 2015 Accenture PLC Earnings Call - Final,189,977,"In H&PS, the 13% growth in the quarter was lea...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
9,Q1 2015 Accenture PLC Earnings Call - Final,195,915,"Resources grew 2%, up from last quarter, as we...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
# Preview the first rows of the experiment frame.
# NOTE(review): the recorded output contains an extra 'index' column that no
# visible cell creates; presumably a reset_index() was run in a cell that no
# longer exists. Confirm before relying on that column.
another_df.head()

Unnamed: 0,index,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
%%time 
# Initialize pandarallel
# NOTE(review): `pandarallel` is not imported in this cell; it must already be
# available in the kernel.
pandarallel.initialize(progress_bar = True)

# Run match_kpis once per row through pandarallel's parallel_apply.
another_df["Matched_KPIs"] = another_df.parallel_apply(lambda row: match_kpis(row), axis=1)
another_df.head(15)

------------------------------------------------------------
Macro paragraph:  OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instructions)
As a reminder, this conference is being recorded. I would now like to turn the conference over to our host, Head
of Investor Relations, Ms KC McClure. Please go ahead. KC MCCLURE, MANAGING DIRECTOR OF IR, ACCENTURE PLC: Thank you, Tom. Thanks everyone for
joining us today on our first-quarter FY15 earnings announcement. As Tom just mentioned, I'm KC McClure,
Managing Director, Head of Investor Relations. With me today are Pierre Nanterme, our Chairman and Chief
Executive Officer and David Rowland, our Chief Financial Officer. We hope you've had an opportunity to review
the news release we issued a short time ago.
------------------------------------------------------------
Macro paragraph:  Let me quickly outline the agenda for today's call. Pierre will begin with an overview of our results. David will take


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Strong growth in consulting and outsourcing. -...
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,New bookings $7.7b. --- Grew revenues 10% in l...
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Durable revenue growth:\n\n\n1. Expanded busin...
5,Q1 2015 Accenture PLC Earnings Call - Final,225,1024,"With respect to sustainable margin expansion, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Sustainable margin expansion:\n1. Expanded ope...
6,Q1 2015 Accenture PLC Earnings Call - Final,205,923,We're pleased with the composition of our new ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,"Pleased with composition of new bookings, spec..."
7,Q1 2015 Accenture PLC Earnings Call - Final,179,978,"Before I cover the operating groups, let me pr...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Operating Groups:\n1. 15% growth in Communicat...
8,Q1 2015 Accenture PLC Earnings Call - Final,189,977,"In H&PS, the 13% growth in the quarter was lea...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,13% growth in H&PS. --- Led by significant gro...
9,Q1 2015 Accenture PLC Earnings Call - Final,195,915,"Resources grew 2%, up from last quarter, as we...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,2% growth in Resources. --- Ongoing challenges...


In [None]:
from multiprocessing import set_start_method

# force=True is required here: earlier cells already selected a start method,
# and without it this call raises RuntimeError("context has already been set").
set_start_method('spawn', force=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
def add_columns(row):
    """Return the sum of the 'A' and 'B' entries of ``row``."""
    first, second = row['A'], row['B']
    return first + second

In [None]:
# Tiny toy frame for trying out row-wise apply
dff = pd.DataFrame(
    data={
        'A': [1, 2, 3],
        'B': [4, 5, 6],
        'F': [1, 2, 3],
    }
)
dff

Unnamed: 0,A,B,F
0,1,4,1
1,2,5,2
2,3,6,3


In [None]:
# Row-wise apply: each row is handed to add_columns as its only argument
dff['C'] = dff.apply(add_columns, axis=1)

# Show the frame including the new column
dff

Unnamed: 0,A,B,F,C
0,1,4,1,5
1,2,5,2,7
2,3,6,3,9


In [None]:
# Inspect the paragraph-level DataFrame (pandas abbreviates the middle rows).
combined_df

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
...,...,...,...,...,...,...
94366,Q4 2023 Salesforce Inc Earnings Call - Final,224,1077,I also want to call out the great progress we ...,OVERVIEW\nCo. reported FY23 revenues of $31.4b...,
94367,Q4 2023 Salesforce Inc Earnings Call - Final,174,748,I'm excited to announce that looking forward t...,OVERVIEW\nCo. reported FY23 revenues of $31.4b...,
94368,Q4 2023 Salesforce Inc Earnings Call - Final,201,953,We're also thrilled to welcome 3 new members t...,OVERVIEW\nCo. reported FY23 revenues of $31.4b...,
94369,Q4 2023 Salesforce Inc Earnings Call - Final,215,1019,"We know that we have the right team, the right...",OVERVIEW\nCo. reported FY23 revenues of $31.4b...,


In [None]:
# List the column names available for the matching step.
combined_df.columns

Index(['Earnings Call Name', 'Tokenized Size', 'Non-Tokenized Size',
       'Macro Paragraph', 'Tentative_Target', 'Matched_KPIs'],
      dtype='object')

In [None]:
# Run match_kpis on the first 15 rows of combined_df (a trial run, not the
# complete DataFrame).
pattern = r'(?<=\.)\n'  # a newline that directly follows a period
for i in range(15):
    text = combined_df.loc[i, 'Macro Paragraph']
    target = combined_df.loc[i, 'Tentative_Target']

    # Break the target summary into sentences with the regex above
    # (nltk.sent_tokenize did not work as expected here).
    target_sentences = re.split(pattern, target)

    # Strip a leading list marker such as "1. " from every sentence.
    target = [re.sub(r'^\d+\.\s+', '', sentence) for sentence in target_sentences]

    combined_df.loc[i, 'Matched_KPIs'] = match_kpis(text, target)

------------------------------------------------------------
Macro paragraph:  OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instructions)
As a reminder, this conference is being recorded. I would now like to turn the conference over to our host, Head
of Investor Relations, Ms KC McClure. Please go ahead. KC MCCLURE, MANAGING DIRECTOR OF IR, ACCENTURE PLC: Thank you, Tom. Thanks everyone for
joining us today on our first-quarter FY15 earnings announcement. As Tom just mentioned, I'm KC McClure,
Managing Director, Head of Investor Relations. With me today are Pierre Nanterme, our Chairman and Chief
Executive Officer and David Rowland, our Chief Financial Officer. We hope you've had an opportunity to review
the news release we issued a short time ago.


In [None]:
combined_df.head()

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instruct...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's call. Pierre will begin with an overview of our re...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certain non-GAAP financial measures, which we believe p...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in line with our expectations. We grew revenues 10% in...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, Pierre. Happy Holidays to all of you. Thank you fo...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]


In [None]:
# Try the matcher on a single macro paragraph (row 10).
tryout_text = combined_df.loc[10,"Macro Paragraph"]

# NOTE(review): target_sentences_filt is not defined in this cell; it must
# already exist in the kernel.
target = match_kpis(tryout_text,target_sentences_filt)
print(target)

------------------------------------------------------------
Macro paragraph:  Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year. This reflects a
12% year-over-year increase Turning to DSOs, our day services outstanding continue to be industry leading. There were 37 days up from 36
days last quarter Free cash flow in the quarter was $821 million resulting from cash generated by operating activities of $873
million net of property and equipment additions of $52 million. Cash flows in the quarter were positively impacted
by a shift in the timing of a portion of compensation payments, which were paid in quarter one in prior years and
beginning this year, will be paid in quarter two, with no impact to full year cash flow Moving to our level of cash. Our cash balance at November 30 was $4.5 billion compared with $4.9 billion at
August 31 a

In [None]:
# Now do it for one complete earnings call: rows 0-12 of combined_df
# (13 rows) all belong to the first company.

# The target sentences are the same for every paragraph, so embed them and
# collect their number-like tokens once instead of on every iteration.
target_embedings = model.encode(target_sentences_filt)
target_numbers = [
    [token.text for token in nlp(target_sentence) if token.like_num]
    for target_sentence in target_sentences_filt
]

for i in range(13):
    paragraph = combined_df.loc[i, "Macro Paragraph"]  # one macro paragraph
    print("PARAGRAPH SECTION")

    embeddings = model.encode(paragraph)

    # Cosine similarity between the paragraph and every target sentence
    cos_sim = util.cos_sim(embeddings, target_embedings)

    # Entries are [score, paragraph position (always 0), target index],
    # ordered from the highest similarity score down.
    all_sentence_combinations = sorted(
        ([cos_sim[0][j], 0, j] for j in range(len(cos_sim[0]))),
        key=lambda x: x[0],
        reverse=True,
    )
    print("---------------------------------------------")
    print("Paragraph: ", paragraph)
    print("Top-5 most similar sentences:")

    # Number-like tokens of the paragraph, extracted once per paragraph
    docs_paragraphs = nlp(paragraph)
    numbers_paragraph = [token.text for token in docs_paragraphs if token.like_num]

    # The middle element gets its own name so it no longer rebinds the row
    # index `i` of the outer loop.
    for score, _paragraph_pos, j in all_sentence_combinations:
        numbers_target = target_numbers[j]
        # Report a target only if it contains numbers, shares at least one of
        # them with the paragraph, and clears the similarity threshold.
        if (
            numbers_target
            and any(x in numbers_target for x in numbers_paragraph)
            and score > 0.450
        ):
            print("====================================")
            print("TARGET:", target_sentences_filt[j])
            print("Similarity Score: {:.4f}".format(score))


PARAGRAPH SECTION
---------------------------------------------
Paragraph:  OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call (Operator Instructions)
As a reminder, this conference is being recorded. I would now like to turn the conference over to our host, Head
of Investor Relations, Ms KC McClure. Please go ahead KC MCCLURE, MANAGING DIRECTOR OF IR, ACCENTURE PLC: Thank you, Tom. Thanks everyone for
joining us today on our first-quarter FY15 earnings announcement. As Tom just mentioned, I'm KC McClure,
Managing Director, Head of Investor Relations. With me today are Pierre Nanterme, our Chairman and Chief
Executive Officer and David Rowland, our Chief Financial Officer. We hope you've had an opportunity to review
the news release we issued a short time ago
Top-5 most similar sentences:
131
133
146
PARAGRAPH SECTION
---------------------------------------------
Paragraph:  Let me quickly outline the agenda for today's call. Pierre will begin with an overview

**NOTE:** We have to figure out a way to include only those targets whose figures/digits match the paragraph; otherwise the matches are meaningless. I'll try entity recognition next.

In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm


In [None]:
# Additional approach: keyword containment. It only works when the target uses
# exactly the same text as the transcript.
import spacy

nlp = spacy.load('en_core_web_sm')

# Keywords (nouns, proper nouns, adjectives and number-like tokens) of every
# target sentence. They do not depend on the paragraph, so parse each target
# once here instead of once per transcript sentence.
target_keywords = [
    (
        sentence,
        [token.text for token in nlp(sentence)
         if token.pos_ in ['NOUN', 'PROPN', 'ADJ'] or token.like_num],
    )
    for sentence in target_sentences_filt
]

# Process the paragraph
for paragraph_sentence in paragraph_sentences:
    doc = nlp(paragraph_sentence)
    print("ORIGINAL:",doc)

    # Collect the targets whose keywords all occur in one transcript sentence
    extracted_sentences = []
    for sent in doc.sents:
        sent_text = sent.text.strip()
        for sentence, keywords in target_keywords:
            # A target without keywords would trivially satisfy all(); skip it.
            if keywords and all(key in sent_text for key in keywords):
                extracted_sentences.append(sentence)

    # Print the extracted sentences
    for extracted_sentence in extracted_sentences:
        print("TARGET:",extracted_sentence)


ORIGINAL: Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year.
TARGET: Net income $892m
TARGET: Diluted EPS $1.29
ORIGINAL: This reflects a
12% year-over-year increase Turning to DSOs, our day services outstanding continue to be industry leading.
ORIGINAL: There were 37 days up from 36
days last quarter Free cash flow in the quarter was $821 million resulting from cash generated by operating activities of $873
million net of property and equipment additions of $52 million.
TARGET: Free cash flow $821m
TARGET: Free cash flow $821m
TARGET: Resulting from cash generated by operating activities of $873m, net of property and equipment additions of
$52m
ORIGINAL: Cash flows in the quarter were positively impacted
by a shift in the timing of a portion of compensation payments, which were paid in quarter one in prior years and
beginning this ye

**NOTE:** This is a better approach, but it still misses many matches.

From here on it is just trial and error. I believe we have to find a way to enhance cosine similarity, or pair it with entity recognition, to improve performance.

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Extract the entities of every transcript sentence and remember the sentence
# text, so a match can be traced back to where it came from.
paragraph_entities = []
paragraph_texts = []

for paragraph_sentence in paragraph_sentences:
    # Process the document
    doc = nlp(paragraph_sentence)
    for sent_span in doc.sents:
        entities = [(ent.text, ent.label_) for ent in sent_span.ents]
        paragraph_entities.append(entities)
        paragraph_texts.append(sent_span.text)
print(paragraph_entities)

# Match target sentences to the transcript sentence containing all their entities
relevant_sentences = []
for target_sentence in target_sentences_filt:
    # Targets are plain strings, so they must be parsed before .ents is available.
    sentence_entities = {(ent.text, ent.label_) for ent in nlp(target_sentence).ents}
    if not sentence_entities:
        # An empty set is a subset of everything; skip to avoid bogus matches.
        continue

    for entities, origin_text in zip(paragraph_entities, paragraph_texts):
        if sentence_entities.issubset(entities):
            relevant_sentences.append((target_sentence, origin_text))
            break

# Print relevant sentences and the transcript sentence they were matched to
for sentence, origin_text in relevant_sentences:
    print("Relevant Sentence:", sentence)
    print("Paragraph of Origin:", origin_text)
    print()


[[('$892 million', 'MONEY'), ('the first quarter', 'DATE'), ('$812 million', 'MONEY'), ('the same quarter last year', 'DATE'), ('Diluted', 'ORG'), ('1.29', 'MONEY'), ('EPS', 'ORG'), ('1.15', 'MONEY'), ('the first quarter last year', 'DATE')], [('12%', 'PERCENT'), ('year-over-year', 'DATE')], [('37 days', 'DATE'), ('36\ndays', 'DATE'), ('last quarter', 'DATE'), ('the quarter', 'DATE'), ('$821 million', 'MONEY'), ('873', 'MONEY'), ('$52 million', 'MONEY')], [('the quarter', 'DATE'), ('quarter one', 'DATE'), ('prior years', 'DATE'), ('this year', 'DATE'), ('quarter two', 'DATE')], [('November 30', 'DATE'), ('$4.5 billion', 'MONEY'), ('$4.9 billion', 'MONEY'), ('August 31', 'DATE'), ('this quarter', 'DATE'), ('November', 'DATE')]]


In [None]:
# Different approach: rule-based token patterns with spaCy's Matcher
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

sentences = [
    "1Q15 net income = $892m",
    "1Q15 diluted EPS = $1.29",
    "1Q15 YoverY net revenue growth (US dollars) = 7%"
]

# Define patterns for structural analysis
patterns = [
    [{"LOWER": "1q15"}, {"LOWER": "net"}, {"LOWER": "income"}],
    [{"LOWER": "1q15"}, {"LOWER": "diluted"}, {"LOWER": "eps"}]
]

# The matcher does not depend on the paragraph, so build it once.
matcher = Matcher(nlp.vocab)
matcher.add("PatternMatch", patterns)

# NOTE(review): the patterns are applied to the fixed `sentences` above, not to
# the transcript text, so the extracted list is the same for every paragraph.
# It is therefore computed once; the printed output is unchanged.
extracted_sentences = [
    sentence for sentence in sentences if matcher(nlp(sentence))
]

for paragraph_sentence in paragraph_sentences:
    doc = nlp(paragraph_sentence)
    print("ORIGINAL", doc)
    print()

    # Print the extracted sentences
    for extracted_sentence in extracted_sentences:
        print(extracted_sentence)


ORIGINAL Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year.

1Q15 net income = $892m
1Q15 diluted EPS = $1.29
ORIGINAL This reflects a
12% year-over-year increase Turning to DSOs, our day services outstanding continue to be industry leading.

1Q15 net income = $892m
1Q15 diluted EPS = $1.29
ORIGINAL There were 37 days up from 36
days last quarter Free cash flow in the quarter was $821 million resulting from cash generated by operating activities of $873
million net of property and equipment additions of $52 million.

1Q15 net income = $892m
1Q15 diluted EPS = $1.29
ORIGINAL Cash flows in the quarter were positively impacted
by a shift in the timing of a portion of compensation payments, which were paid in quarter one in prior years and
beginning this year, will be paid in quarter two, with no impact to full year cash flow Moving to ou

In [None]:
# Check where the first earnings call ends: in the output, rows 0-12 carry the
# Accenture title and row 13 is the first row of the next call.
combined_df.loc[0:13,"Earnings Call Name"]

0     Q1 2015 Accenture PLC Earnings Call - Final
1     Q1 2015 Accenture PLC Earnings Call - Final
2     Q1 2015 Accenture PLC Earnings Call - Final
3     Q1 2015 Accenture PLC Earnings Call - Final
4     Q1 2015 Accenture PLC Earnings Call - Final
5     Q1 2015 Accenture PLC Earnings Call - Final
6     Q1 2015 Accenture PLC Earnings Call - Final
7     Q1 2015 Accenture PLC Earnings Call - Final
8     Q1 2015 Accenture PLC Earnings Call - Final
9     Q1 2015 Accenture PLC Earnings Call - Final
10    Q1 2015 Accenture PLC Earnings Call - Final
11    Q1 2015 Accenture PLC Earnings Call - Final
12    Q1 2015 Accenture PLC Earnings Call - Final
13          Q1 2015 ACE Ltd Earnings Call - Final
Name: Earnings Call Name, dtype: object

## Initial paragraph division

### Import dependencies and data

In [None]:
# load required dependencies
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer

In [None]:
# Load the earnings-call dataset; the preview below shows the columns
# heading, train and target.
df = pd.read_csv("final.csv")

In [None]:
# Show up to 100 characters per cell when a DataFrame is displayed
pd.options.display.max_colwidth = 100
df.head(10)

Unnamed: 0,heading,train,target
0,Q1 2015 Accenture PLC Earnings Call - Final,OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call.\n(Operator Instruc...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
1,Q1 2015 ACE Ltd Earnings Call - Final,"OPERATOR: Good day, and welcome to the ACE Limited First Quarter 2015 Earnings Conference Call. ...",OVERVIEW\nCo. reported 1Q15 after-tax operating income of $745m or $2.25 per share.\nFINANCIAL D...
2,Q1 2015 Activision Blizzard Inc Earnings Call - Final,OPERATOR: Good day and welcome to the Activision Blizzard quarter one 2015 earnings conference c...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of $1.3b and 1Q15 GAAP EPS of $0.53. Expects 2015 GAA...
3,Q1 2015 Adobe Systems Inc Earnings Call - Final,OPERATOR: I would like to welcome you to Adobe Systems' first quarter FY15 earnings conference c...,OVERVIEW\nADBE reported 1Q15 revenue of $1.109b and GAAP diluted EPS of $0.17. Expects 2Q15 reve...
4,Q1 2015 Advanced Micro Devices Inc Earnings Call - Final,"OPERATOR: Good day, ladies and gentlemen, and thank you for your patience. You have joined AMD's...",OVERVIEW\nAMD reported 1Q15 revenue of $1.03b and net loss of $73m or $0.09 per share. Co. expec...
5,Q1 2015 Agilent Technologies Inc Earnings Call - Final,"OPERATOR: Good day ladies and gentlemen, and welcome to Agilent Technologies' first-quarter 2015...",OVERVIEW\nCo. reported 1Q15 revenue of $1.03b and non-GAAP EPS of $0.41. Expects FY15 revenue to...
6,Q1 2015 American Tower Corp Earnings Call - Final,OPERATOR: Good morning. My name is Steve and I will be your conference operator today. At this t...,OVERVIEW\nAMT reported 1Q15 net income attributable to Co. common stockholders of approx. $183m ...
7,Q1 2015 AmerisourceBergen Corp Earnings Call - Final,"OPERATOR: Ladies and gentlemen, thank you for standing by. Welcome to the AmerisourceBergen earn...","OVERVIEW\nCo. reported 1Q15 revenues of $33.6b, adjusted operating income of $436m and adjusted ..."
8,Q1 2015 Amgen Inc Earnings Call - Final,,OVERVIEW\nCo. reported 1Q15 revenues of $5b. Expects 2015 revenues to be $20.9-21.3b and adjuste...
9,Q1 2015 Analog Devices Inc Earnings Call - Final,"OPERATOR: Good afternoon. My name is Jennifer, and I will be your conference facilitator. At thi...","OVERVIEW\n\n\nADI reported 1Q15 sales of $772m and diluted EPS, excluding special items, of $0.6..."


In [None]:
# Count the missing values in each column.
df.isna().sum()

heading     0
train      18
target      1
dtype: int64

In [None]:
# Report the missing values per column, then discard incomplete rows and
# renumber the index from 0.
print("NA lines = ",df.isna().sum())
df = df.dropna().reset_index(drop=True)
print("NA lines dropped")

NA lines =  heading     0
train      18
target      1
dtype: int64
NA lines dropped


In [None]:
# Tokenizer used below to measure paragraph lengths in tokens.
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-j-6B")

In [None]:
# Maximum number of tokens (as counted by `tokenizer`) allowed in one macro paragraph.
max_size_tokenized_paragraph = 256

# Column layout of the result; passing it explicitly keeps the columns even if no row is produced.
combined_columns = ["Earnings Call Name","Tokenized Size","Non-Tokenized Size","Macro Paragraph", "Tentative_Target"]

# A paragraph ends at a newline that directly follows a dot, or at a blank line (\n\n).
split_pattern = r'(?<=\.)\n|\n\n'

# One record (dict) per macro paragraph. The DataFrame is built once after the loop,
# because calling pd.concat on every iteration re-copies all previous rows (quadratic cost).
macro_paragraph_records = []

iterations = df.shape[0]
progress_bar = tqdm(range(iterations), desc="Processing", unit="iteration")

# Process one earnings call per iteration (tqdm closes the bar when the loop finishes)
for i in progress_bar:
    earnings_call_title = df.loc[i,"heading"]
    tentative_target = df.loc[i,"target"]
    text = df.loc[i,"train"]

    # Split the transcript into individual paragraphs
    paragraphs = re.split(split_pattern, text)

    # Greedily pack consecutive paragraphs into macro paragraphs whose token count
    # stays within max_size_tokenized_paragraph.
    macro_paragraphs = []
    current_macro_paragraph = ""

    for paragraph in paragraphs:
        tokenized_paragraph = tokenizer.encode(paragraph)
        if len(tokenized_paragraph) + len(tokenizer.encode(current_macro_paragraph)) <= max_size_tokenized_paragraph:
            current_macro_paragraph += paragraph + " "
        else:
            # Flush the macro paragraph built so far. It is still empty when the very
            # first paragraph already exceeds the budget; never emit an empty row.
            if current_macro_paragraph.strip():
                macro_paragraphs.append(current_macro_paragraph.strip())
            # A single paragraph longer than the budget becomes its own macro paragraph.
            current_macro_paragraph = paragraph + " "

    # Flush whatever is left, again skipping whitespace-only content
    if current_macro_paragraph.strip():
        macro_paragraphs.append(current_macro_paragraph.strip())

    # One output row per macro paragraph, repeating the call title and its target
    for macro_paragraph in macro_paragraphs:
        macro_paragraph_records.append({
            "Earnings Call Name": earnings_call_title,
            "Tokenized Size": len(tokenizer.encode(macro_paragraph)),
            "Non-Tokenized Size": len(macro_paragraph),
            "Macro Paragraph": macro_paragraph,
            "Tentative_Target": tentative_target,
        })

# Build the combined DataFrame in a single step
combined_df = pd.DataFrame(macro_paragraph_records, columns=combined_columns)

print("combined_df.shape: ",combined_df.shape)
combined_df

Processing:   1%|          | 27/2441 [01:31<2:15:46,  3.37s/iteration]
Processing: 100%|██████████| 2441/2441 [07:07<00:00,  5.71iteration/s]

combined_df.shape:  (94370, 5)





Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instruct...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's call. Pierre will begin with an overview of our re...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certain non-GAAP financial measures, which we believe p...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in line with our expectations. We grew revenues 10% in...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, Pierre. Happy Holidays to all of you. Thank you fo...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29...."
...,...,...,...,...,...
94365,Q4 2023 Salesforce Inc Earnings Call - Final,185,762,"Non-GAAP operating margin for fiscal '23 was 22.5%, significantly above our forecast, an improve...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...
94366,Q4 2023 Salesforce Inc Earnings Call - Final,224,1077,"I also want to call out the great progress we have made with MuleSoft and Tableau. As you know, ...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...
94367,Q4 2023 Salesforce Inc Earnings Call - Final,174,748,"I'm excited to announce that looking forward to fiscal year '24, we expect a non-GAAP operating ...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...
94368,Q4 2023 Salesforce Inc Earnings Call - Final,201,953,"We're also thrilled to welcome 3 new members to our Board, Mason Morfit, the CEO and Chief Inves...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...


In [None]:
# Add an empty placeholder column; it is filled later with the KPIs matched to each macro paragraph
combined_df['Matched_KPIs'] = ''
combined_df

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instruct...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's call. Pierre will begin with an overview of our re...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certain non-GAAP financial measures, which we believe p...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in line with our expectations. We grew revenues 10% in...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, Pierre. Happy Holidays to all of you. Thank you fo...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",
...,...,...,...,...,...,...
94365,Q4 2023 Salesforce Inc Earnings Call - Final,185,762,"Non-GAAP operating margin for fiscal '23 was 22.5%, significantly above our forecast, an improve...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...,
94366,Q4 2023 Salesforce Inc Earnings Call - Final,224,1077,"I also want to call out the great progress we have made with MuleSoft and Tableau. As you know, ...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...,
94367,Q4 2023 Salesforce Inc Earnings Call - Final,174,748,"I'm excited to announce that looking forward to fiscal year '24, we expect a non-GAAP operating ...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...,
94368,Q4 2023 Salesforce Inc Earnings Call - Final,201,953,"We're also thrilled to welcome 3 new members to our Board, Mason Morfit, the CEO and Chief Inves...",OVERVIEW\nCo. reported FY23 revenues of $31.4b and 4Q23 revenues of $8.38b. Expects FY24 revenue...,


In [None]:
# Persist the paragraph division so later sections can reload it instead of recomputing it
combined_df.to_csv('paragraph_div.csv', index=False)

In [None]:
# Group the DataFrame by the 'Earnings Call Name' column and count the rows
# (macro paragraphs) in each group; the count is stored in 'Paragraph Count'
grouped_df = combined_df.groupby('Earnings Call Name').size().reset_index(name='Paragraph Count')
grouped_df

Unnamed: 0,Earnings Call Name,Paragraph Count
0,Q1 2015 ACE Ltd Earnings Call - Final,55
1,Q1 2015 Accenture PLC Earnings Call - Final,13
2,Q1 2015 Activision Blizzard Inc Earnings Call - Final,45
3,Q1 2015 Adobe Systems Inc Earnings Call - Final,29
4,Q1 2015 Advanced Micro Devices Inc Earnings Call - Final,5
...,...,...
2436,Q4 2022 Texas Instruments Inc Earnings Call - Final,26
2437,Q4 2023 Autodesk Inc Earnings Call - Final,4
2438,Q4 2023 CrowdStrike Holdings Inc Earnings Call - Final,55
2439,Q4 2023 NVIDIA Corp Earnings Call - Final,37


## Resume from here: the paragraph division has already been computed and saved to `paragraph_div.csv`

In [None]:
# load required dependencies
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-j-6B")

In [None]:
# Reload the macro paragraphs previously written to 'paragraph_div.csv'
combined_df = pd.read_csv('paragraph_div.csv')
combined_df.head()

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
# Reload the raw earnings-call data; note that rows with missing values are NOT dropped in this cell
df = pd.read_csv("final.csv")

ATTENTION!


Some earnings calls are incomplete: for some of them, `final.csv` contains only the beginning of the transcript (the first paragraphs) rather than the complete text.

## Now we will use sentence similarity / entity recognition to pair the KPIs with their corresponding paragraph

In [None]:
#!pip install sentence_transformers

In [None]:
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# NOTE: a later cell reassigns `model` to 'all-MiniLM-L6-v2' before the first model.encode() call
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

In [None]:
# Take a single macro paragraph (row 80) as a worked example
paragraph = combined_df.loc[80, "Macro Paragraph"]

# Break it into sentences with NLTK's sentence tokenizer
paragraph_sentences = nltk.sent_tokenize(paragraph)
print("Num. of sentences on macro paragraph:", len(paragraph_sentences))

# Show every sentence followed by one blank line
for paragraph_sentence in paragraph_sentences:
    print(paragraph_sentence, end="\n\n")


Num. of sentences on macro paragraph: 8
Now to our 2015 full-year numbers.

For 2015 on a GAAP basis, we expect revenues of $4.25 billion, an increase
of $110 million versus our guidance in February; product costs of 23%; and operating expenses of 50%.

For both
GAAP and non-GAAP, we expect interest expense of $202 million.

Our GAAP tax rate is expected to be 22%.

We
expect 750 million fully diluted shares both for GAAP and non-GAAP.

And GAAP EPS is expected to be $0.98, up
$0.09 from our February guidance.

For 2015, on a non-GAAP basis, we expect revenues of $4.245 billion, $25 million higher than our February
guidance; product costs of 24%; operating expenses of 45%; and an operating margin of 31%, 100 basis points
better than our February guidance.

Our non-GAAP tax rate is expected to be 24%.



In [None]:
def preprocess_target(X):
    """Split a raw target summary into individual target (KPI) sentences.

    The text is cut at every newline that directly follows a dot. Each entry then
    has its leading whitespace removed and, if present, its leading list number
    ("1. ", "10. ", ...) stripped. Entries that end up empty are dropped.

    Args:
        X: Raw target text (str) of one earnings call.

    Returns:
        List of cleaned target sentences, in their original order.
    """
    # An entry ends at a newline preceded by a dot
    pattern = r'(?<=\.)\n'
    target_sentences = re.split(pattern, X)

    target_sentences_filt = []
    for sentence in target_sentences:
        # Entries that follow a blank line start with "\n\n"; without stripping it,
        # the '^'-anchored pattern below never sees the list number (e.g. "\n\n10. ...").
        sentence = sentence.lstrip()
        # Remove the leading list number if there is one
        sentence_without_listation = re.sub(r'^\d+\.\s+', '', sentence)
        # A trailing newline in X yields an empty last entry; skip such blanks
        if sentence_without_listation:
            target_sentences_filt.append(sentence_without_listation)

    return target_sentences_filt

In [None]:
# Split the target summary attached to row 80 into individual target sentences
target_paragraph = combined_df.loc[80,"Tentative_Target"]
target_sentences_filt = preprocess_target(target_paragraph)
# Number of target sentences, followed by the sentences themselves
print(len(target_sentences_filt))
target_sentences_filt

180


['OVERVIEW\nATVI reported 1Q15 GAAP revenues of $1.3b and 1Q15 GAAP EPS of $0.53. Expects 2015 GAAP revenues to\nbe $4.25b and GAAP EPS to be $0.98. Expected 2Q15 GAAP net revenues are $930m and GAAP EPS is $0.21.',
 'FINANCIAL DATA\n1. 1Q15 GAAP revenues = $1.3b.',
 '1Q15 non-GAAP revenues = $703m.',
 '1Q15 GAAP EPS = $0.53.',
 '1Q15 non-GAAP EPS = $0.16.',
 '1Q15 YoverY GAAP revenue growth = 15%.',
 '1Q15 GAAP operating margin = 43%.',
 '1Q15 non-GAAP operating margin = 29%.',
 '1Q15-end cash and investments = approx. $4.5b.',
 '1Q15-end total debt = $4.12b.',
 '\n\n10. 2015 GAAP revenue guidance = $4.25b.',
 '2015 non-GAAP revenue guidance = $4.425b.',
 '2Q15 GAAP net revenue guidance = $930m.',
 '2Q15 non-GAAP revenue guidance = $650m.',
 '2015 GAAP EPS guidance = $0.98.',
 '2015 non-GAAP EPS guidance = $1.20.',
 '2Q15 GAAP EPS guidance = $0.21.',
 '2Q15 non-GAAP EPS guidance = $0.07.',
 'PRESENTATION SUMMARY -\nOpening Remarks (B.K.)\n1. 1Q15:\n1. For first time, recognized by For

In [None]:
from sentence_transformers import SentenceTransformer, util
# Replaces the paraphrase-MiniLM model loaded earlier; the trailing comment lists the alternatives tried
model = SentenceTransformer('all-MiniLM-L6-v2') #'all-MiniLM-L6-v2' #paraphrase-MiniLM-L6-v2

#Sentences are encoded by calling model.encode()
emb1 = model.encode(paragraph_sentences)
emb2 = model.encode(target_sentences_filt)

# Pairwise cosine similarity: one row per paragraph sentence, one column per target sentence
cos_sim = util.cos_sim(emb1, emb2)
#print("Cosine-Similarity:", cos_sim)

In [None]:
# Shape is (number of paragraph sentences, number of target sentences)
cos_sim.shape

torch.Size([9, 187])

In [None]:
#cos_sim[4] #[1646]

In [None]:
# Encode the paragraph sentences and the target sentences
embeddings = model.encode(paragraph_sentences)
target_embedings = model.encode(target_sentences_filt)

# Cosine similarity between every paragraph sentence (rows) and every target sentence (columns)
cos_sim = util.cos_sim(embeddings, target_embedings)

# Collect every (paragraph sentence, target sentence) pair with its score.
# The matrix is rectangular and compares two DIFFERENT sentence lists, so all cells are
# distinct pairs. An upper-triangle (i < j) walk only fits a square self-similarity
# matrix and would skip most of the pairs here.
num_paragraph_sentences, num_target_sentences = cos_sim.shape
all_sentence_combinations = []
for i in range(num_paragraph_sentences):
    for j in range(num_target_sentences):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

# Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[0:5]:
    print("{} \t {} \t {:.4f}".format(paragraph_sentences[i], target_sentences_filt[j], cos_sim[i][j]))
    print()

Top-5 most similar pairs:
Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year. 	 1Q15 net income = $892m. 	 0.6062

Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year. 	 FINANCIAL DATA
1. 1Q15 net revenues = $7.9b. 	 0.6016

Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year. 	 1Q15 YoverY net revenue growth (US dollars) = 7%. 	 0.4978

Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year

**NOTE:** Cosine similarity is not really giving us a reliable way of matching the target KPI to the original paragraphs of the earnings call.

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
import torch
# Check whether PyTorch can see a CUDA device
torch.cuda.is_available()

True

In [None]:
# Name of the earnings call that row 80 belongs to
ea_name = combined_df.loc[80,"Earnings Call Name"]
ea_name

'Q1 2015 Activision Blizzard Inc Earnings Call - Final'

In [None]:
# All rows (macro paragraphs) that belong to the selected earnings call
combined_df[combined_df["Earnings Call Name"]==ea_name]#.shape[0]

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
68,Q1 2015 Activision Blizzard Inc Earnings Call ...,180,780,OPERATOR: Good day and welcome to the Activisi...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
69,Q1 2015 Activision Blizzard Inc Earnings Call ...,184,969,These are forward-looking statements that are ...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
70,Q1 2015 Activision Blizzard Inc Earnings Call ...,237,1020,I'd like to note that certain numbers we will ...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
71,Q1 2015 Activision Blizzard Inc Earnings Call ...,209,1037,"Usually, I end my remarks by thanking our incr...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
72,Q1 2015 Activision Blizzard Inc Earnings Call ...,229,1058,"In the last 12 months, we had over 150 million...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
73,Q1 2015 Activision Blizzard Inc Earnings Call ...,254,1077,Our greatest achievement continues to be our a...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
74,Q1 2015 Activision Blizzard Inc Earnings Call ...,235,983,"Also, the numbers I'll be quoting are compared...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
75,Q1 2015 Activision Blizzard Inc Earnings Call ...,192,985,Blizzard Entertainment had steady and strong p...,OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
76,Q1 2015 Activision Blizzard Inc Earnings Call ...,202,898,"And new franchises, like Hearthstone and Heroe...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,
77,Q1 2015 Activision Blizzard Inc Earnings Call ...,252,1037,"In terms of cash flow in Q1, we generated stro...",OVERVIEW\nATVI reported 1Q15 GAAP revenues of ...,


In [None]:
%%time
#now we´ll do it for a complete earning call of a company.. 
total_extracted_kpi = []
for i in range (68,113): #I know there are 12 rows in combined_df that belont to this company.. 
    # Iterate over each paragraph
    paragraph = combined_df.loc[i,"Macro Paragraph"] # one paragraph 
    print("------------------------------------------------------------")
    print("Macro paragraph: ", paragraph)
    
    # Tokenize the paragraph into sentences
    paragraph_sentences = nltk.sent_tokenize(paragraph)
    target_embedings = model.encode(target_sentences_filt)
   
    
    for sentence in paragraph_sentences:
        #initialize empty list of matched targets 
        matched_targets=[]
        
        # Encode the sentence in the paragraph
        embeddings = model.encode(sentence)
        
        # Compute cosine similarity between paragraph and target sentences
        cos_sim = util.cos_sim(embeddings, target_embedings)

        # Add all pairs to a list with their cosine similarity score
        all_sentence_combinations = []
        for j in range(len(cos_sim[0])):
            all_sentence_combinations.append([cos_sim[0][j], 0, j])

        # Sort list by the highest cosine similarity score
        all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)
        
        docs_paragraphs = nlp(paragraph)  
        # Print the top 5 most similar sentences for the current paragraph
        for score, i, j in all_sentence_combinations:
            docs_target = nlp(target_sentences_filt[j])
            # print([token.text for token in docs_target if token.like_num])
            # if a matched target sentence has numbers, make sure they are indeed in the paragraph
            if (any([token.text for token in docs_target if token.like_num])):
                numbers_target =  [token.text for token in docs_target if token.like_num]
                numbers_paragraph = [token.text for token in docs_paragraphs if token.like_num]
                if any(x in numbers_target for x in numbers_paragraph):
                    if score >0.6:
                        print("==================================")
                        print("TARGET like Num:", target_sentences_filt[j])
                        matched_targets.append(target_sentences_filt[j])
                        total_extracted_kpi.append(target_sentences_filt[j])
                        print("Similarity Score: {:.4f}".format(score))
            else:
                if score >0.6:
                        print("==================================")
                        print("TARGET not like Num:", target_sentences_filt[j])
                        matched_targets.append(target_sentences_filt[j])
                        total_extracted_kpi.append(target_sentences_filt[j])
                        print("Similarity Score: {:.4f}".format(score))
            

------------------------------------------------------------
Macro paragraph:  OPERATOR: Good day and welcome to the Activision Blizzard quarter one 2015 earnings conference call. Today's conference is being recorded. At this time, for opening remarks and introductions, I would like to turn today's call over to Amrita Ahuja. AMRITA AHUJA, SVP OF IR, ACTIVISION BLIZZARD INC: Good afternoon. Thank you for joining us today for
Activision Blizzard's first-quarter 2015 conference call. Speaking on this call today will be Bobby Kotick, CEO of Activision Blizzard; Dennis Durkin, CFO of Activision
Blizzard; Eric Hirshberg, CEO of Activision Publishing; Mike Morhaime, CEO of Blizzard Entertainment; and
Thomas Tippl, COO of Activision Blizzard. I would like to remind everyone that during this call, we will be making statements that are not historical facts.
------------------------------------------------------------
Macro paragraph:  These are forward-looking statements that are based on curren

In [None]:
# Targets that were never matched to any sentence of the earnings call
extracted_lookup = set(total_extracted_kpi)
not_matched_kpi = [kpi for kpi in target_sentences_filt if kpi not in extracted_lookup]
print(f"KPIs not matched = {len(not_matched_kpi)} out of {len(target_sentences_filt)}")
print("KPIs not matched: ", not_matched_kpi)


KPIs not matched = 47 out of 180
KPIs not matched:  ['1Q15-end cash and investments = approx. $4.5b.', '2Q15 GAAP EPS guidance = $0.21.', 'Better than expected results.', '1Q record, absolute.', 'All-time high, percentage.', 'Relatively lighter slate in 1Q15.', 'Numbers vs. 1Q14, unless otherwise noted.', 'EPS $0.16.', 'Blizzard Entertainment:\n1. Had steady and strong performance on Hearthstone with ongoing engagement on 4Q expansion, Goblins vs\nGnomes.', 'Key Metrics:\n1. All percentages based on revenues, except tax rate.', 'OpEx 37%.', 'OpEx 50%.', 'Adjusted EBITDA $223m.', 'Operating cash flow $209m.', 'Free cash flow $188m after CapEx.', 'Net cash $360m.', "2Q15 Outlook:\n1. Activision Publishing's slate includes additional downloadable content releases from Call of Duty and Destiny\nand continued live operations during open beta for Call of Duty Online in China.", 'Product costs 20%.', 'OpEx 52%.', 'EPS $0.21.', 'Product costs 19%.', 'OpEx 63%.', 'Tax rate approx. 26%.', 'Produ

In [None]:
# Alternative way of finding all numbers in a sentence: a plain regular expression
# instead of spaCy's token.like_num

# Regular expression pattern to match numbers: one or more digits, optionally
# followed by a decimal part (e.g. "30" or "4.5")
pattern = r"\d+(?:\.\d+)?"
# Find all matches of the pattern in one example target sentence (index 8)
a = re.findall(pattern, target_sentences_filt[8])
print(a)

['11', '30', '14', '4.5']


#### **Attention:** This regex-based number extraction could also be included in the function and would give us better results than the current ones.

## Now that we've seen how it works, let's create a function to match all KPIs to their origin

In [None]:
#pull all dependencies and start from here... 
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-j-6B")
from pandarallel import pandarallel

In [None]:
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') #paraphrase-MiniLM-L6-v2 or all-MiniLM-L6-v2

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# Import the data: the macro paragraphs previously saved to 'paragraph_div.csv'
combined_df = pd.read_csv('paragraph_div.csv')
combined_df.head()

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
def preprocess_target(X):
    """Split a raw target summary into individual target (KPI) sentences.

    The text is cut at every newline that directly follows a dot. Each entry then
    has its leading whitespace removed and, if present, its leading list number
    ("1. ", "10. ", ...) stripped. Entries that end up empty are dropped.

    Args:
        X: Raw target text (str) of one earnings call.

    Returns:
        List of cleaned target sentences, in their original order.
    """
    # An entry ends at a newline preceded by a dot
    pattern = r'(?<=\.)\n'
    target_sentences = re.split(pattern, X)

    target_sentences_filt = []
    for sentence in target_sentences:
        # Entries that follow a blank line start with "\n\n"; without stripping it,
        # the '^'-anchored pattern below never sees the list number (e.g. "\n\n10. ...").
        sentence = sentence.lstrip()
        # Remove the leading list number if there is one
        sentence_without_listation = re.sub(r'^\d+\.\s+', '', sentence)
        # A trailing newline in X yields an empty last entry; skip such blanks
        if sentence_without_listation:
            target_sentences_filt.append(sentence_without_listation)

    return target_sentences_filt

In [None]:
# now lets make it a function 
def match_kpis(row, num_threshold=0.55, text_threshold=0.65):
    """Find the target (KPI) sentences that match one macro paragraph.

    Every sentence of the macro paragraph is embedded and compared, by cosine
    similarity, with every target sentence of the same earnings call:

    * a target that contains number-like tokens is accepted when the paragraph
      contains at least one of those tokens and the score exceeds ``num_threshold``;
    * a target without number-like tokens is accepted when the score exceeds
      ``text_threshold``.

    Args:
        row: Mapping/Series with the keys 'Macro Paragraph' and 'Tentative_Target'.
        num_threshold: Minimum similarity for targets that contain numbers.
        text_threshold: Minimum similarity for targets without numbers.

    Returns:
        The accepted target sentences joined with ' --- ' (empty string if none).
        Matches are listed per paragraph sentence, highest score first; a target
        matched by several sentences appears once per matching sentence.
    """
    # get the data
    paragraph = row['Macro Paragraph']
    raw_target = row['Tentative_Target']

    # preprocess the target
    target_sentences_filt = preprocess_target(raw_target)

    # Tokenize the paragraph into sentences
    paragraph_sentences = nltk.sent_tokenize(paragraph)

    # Nothing to compare: avoid encoding an empty list
    if not paragraph_sentences or not target_sentences_filt:
        return ''

    target_embedings = model.encode(target_sentences_filt)

    # The spaCy parses do not depend on the sentence being scored, so they are done
    # once per row instead of once per (sentence, target) pair.
    numbers_paragraph = [token.text for token in nlp(paragraph) if token.like_num]

    # Per target: the threshold it has to beat, or None when it quotes numbers and
    # none of them occurs in the paragraph (such a target can never match).
    target_thresholds = []
    for target_sentence in target_sentences_filt:
        numbers_target = [token.text for token in nlp(target_sentence) if token.like_num]
        if not numbers_target:
            target_thresholds.append(text_threshold)
        elif any(x in numbers_target for x in numbers_paragraph):
            target_thresholds.append(num_threshold)
        else:
            target_thresholds.append(None)

    # initialize empty list of matched targets
    matched_targets = []

    for sentence in paragraph_sentences:
        # Encode the sentence and score it against every target sentence
        embeddings = model.encode(sentence)
        scores = util.cos_sim(embeddings, target_embedings)[0]

        # Target indices ordered by decreasing similarity (stable for ties)
        ranked_targets = sorted(range(len(scores)), key=lambda j: scores[j], reverse=True)

        for j in ranked_targets:
            threshold = target_thresholds[j]
            if threshold is not None and scores[j] > threshold:
                matched_targets.append(target_sentences_filt[j])

    # Join the list elements into a single string
    joined_matched_targets = ' --- '.join(matched_targets)
    return joined_matched_targets
    

In [None]:
# Create a separate 15-row DataFrame for quick experiments.
# .copy() makes it independent of combined_df: assigning into a plain slice triggers
# pandas' SettingWithCopyWarning and leaves it unclear which frame is modified.
another_df = combined_df[0:15].copy()
another_df.loc[:,"Matched_KPIs"] = ""
another_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
%%time 
#without parallelization
another_df["Matched_KPIs"] = another_df.apply(lambda row: match_kpis(row), axis=1)
another_df.head(15)

CPU times: user 3min 47s, sys: 2.88 s, total: 3min 50s
Wall time: 2min 34s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Strong growth in consulting and outsourcing. -...
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,New bookings $7.7b. --- Grew revenues 10% in l...
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Durable revenue growth:\n\n\n1. Expanded busin...
5,Q1 2015 Accenture PLC Earnings Call - Final,225,1024,"With respect to sustainable margin expansion, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Sustainable margin expansion:\n1. Expanded ope...
6,Q1 2015 Accenture PLC Earnings Call - Final,205,923,We're pleased with the composition of our new ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,"Pleased with composition of new bookings, spec..."
7,Q1 2015 Accenture PLC Earnings Call - Final,179,978,"Before I cover the operating groups, let me pr...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Operating Groups:\n1. 15% growth in Communicat...
8,Q1 2015 Accenture PLC Earnings Call - Final,189,977,"In H&PS, the 13% growth in the quarter was lea...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,13% growth in H&PS. --- Led by significant gro...
9,Q1 2015 Accenture PLC Earnings Call - Final,195,915,"Resources grew 2%, up from last quarter, as we...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,2% growth in Resources. --- Ongoing challenges...


Everything works up to this point.

In [None]:
# Set TOKENIZERS_PARALLELISM explicitly, as the huggingface/tokenizers
# fork warning asks for.
import os
os.environ.update({"TOKENIZERS_PARALLELISM": "True"})

In [None]:
# Select the 'spawn' start method through torch's multiprocessing module;
# force=True replaces a start method that has already been set.
import torch
torch.multiprocessing.set_start_method(method='spawn', force=True)

In [None]:
# Switch the multiprocessing start method to 'spawn'.
# The import lives in this cell so the cell also works on a fresh kernel and
# does not depend on another cell having been executed first.
from multiprocessing import set_start_method

try:
    set_start_method('spawn', force=True)
    print("spawned")
except RuntimeError as err:
    # Report the failure instead of silently swallowing it, but keep going so
    # the remaining cells can still be run.
    print("could not set start method to 'spawn': {}".format(err))

spawned


In [None]:
%%time 
# Initialize pandarallel

pandarallel.initialize(progress_bar = True)

another_df["Matched_KPIs"] = another_df.parallel_apply(lambda row: match_kpis(row), axis=1)
another_df.head(15)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

In [None]:
from multiprocessing import set_start_method


### While parallelization is not ready, I'll keep working on other things

In [None]:
# Display another_df.
# NOTE(review): this cell was labelled "Reset index and make it a column", but it
# contains no reset_index() call; confirm whether that step was removed on purpose.
another_df

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
5,Q1 2015 Accenture PLC Earnings Call - Final,225,1024,"With respect to sustainable margin expansion, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
6,Q1 2015 Accenture PLC Earnings Call - Final,205,923,We're pleased with the composition of our new ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
7,Q1 2015 Accenture PLC Earnings Call - Final,179,978,"Before I cover the operating groups, let me pr...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
8,Q1 2015 Accenture PLC Earnings Call - Final,189,977,"In H&PS, the 13% growth in the quarter was lea...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
9,Q1 2015 Accenture PLC Earnings Call - Final,195,915,"Resources grew 2%, up from last quarter, as we...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
# Preview the first rows of another_df.
another_df.head()

Unnamed: 0,index,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,


In [None]:
%%time 
# Initialize pandarallel
pandarallel.initialize(progress_bar = True)

another_df["Matched_KPIs"] = another_df.parallel_apply(lambda row: match_kpis(row), axis=1)
another_df.head(15)

------------------------------------------------------------
Macro paragraph:  OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instructions)
As a reminder, this conference is being recorded. I would now like to turn the conference over to our host, Head
of Investor Relations, Ms KC McClure. Please go ahead. KC MCCLURE, MANAGING DIRECTOR OF IR, ACCENTURE PLC: Thank you, Tom. Thanks everyone for
joining us today on our first-quarter FY15 earnings announcement. As Tom just mentioned, I'm KC McClure,
Managing Director, Head of Investor Relations. With me today are Pierre Nanterme, our Chairman and Chief
Executive Officer and David Rowland, our Chief Financial Officer. We hope you've had an opportunity to review
the news release we issued a short time ago.
------------------------------------------------------------
Macro paragraph:  Let me quickly outline the agenda for today's call. Pierre will begin with an overview of our results. David will take


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Strong growth in consulting and outsourcing. -...
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,New bookings $7.7b. --- Grew revenues 10% in l...
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Durable revenue growth:\n\n\n1. Expanded busin...
5,Q1 2015 Accenture PLC Earnings Call - Final,225,1024,"With respect to sustainable margin expansion, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Sustainable margin expansion:\n1. Expanded ope...
6,Q1 2015 Accenture PLC Earnings Call - Final,205,923,We're pleased with the composition of our new ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,"Pleased with composition of new bookings, spec..."
7,Q1 2015 Accenture PLC Earnings Call - Final,179,978,"Before I cover the operating groups, let me pr...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,Operating Groups:\n1. 15% growth in Communicat...
8,Q1 2015 Accenture PLC Earnings Call - Final,189,977,"In H&PS, the 13% growth in the quarter was lea...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,13% growth in H&PS. --- Led by significant gro...
9,Q1 2015 Accenture PLC Earnings Call - Final,195,915,"Resources grew 2%, up from last quarter, as we...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,2% growth in Resources. --- Ongoing challenges...


In [None]:
# Use the 'spawn' start method for multiprocessing.
# force=True keeps the cell re-runnable: without it, set_start_method raises
# RuntimeError once a start method has already been set in this kernel.
from multiprocessing import set_start_method
set_start_method('spawn', force=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Toy helper: add the values stored under 'A' and 'B' of a single row
def add_columns(row):
    """Return ``row['A'] + row['B']`` for one row (any mapping-like object)."""
    value_a = row['A']
    value_b = row['B']
    return value_a + value_b

In [None]:
# Small toy frame (three rows; columns A, B and F) for trying out row-wise apply
dff = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6], F=[1, 2, 3]))
dff

Unnamed: 0,A,B,F
0,1,4,1
1,2,5,2
2,3,6,3


In [None]:
# Build column C by calling add_columns once per row (axis=1)
dff['C'] = dff.apply(add_columns, axis=1)

# Show the frame including the new column
dff

Unnamed: 0,A,B,F,C
0,1,4,1,5
1,2,5,2,7
2,3,6,3,9


In [None]:
# Display combined_df (pandas abbreviates the rendering of long frames).
combined_df

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's ...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certa...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in l...,OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, ...",OVERVIEW\nCo. reported 1Q15 net revenues of $7...,
...,...,...,...,...,...,...
94366,Q4 2023 Salesforce Inc Earnings Call - Final,224,1077,I also want to call out the great progress we ...,OVERVIEW\nCo. reported FY23 revenues of $31.4b...,
94367,Q4 2023 Salesforce Inc Earnings Call - Final,174,748,I'm excited to announce that looking forward t...,OVERVIEW\nCo. reported FY23 revenues of $31.4b...,
94368,Q4 2023 Salesforce Inc Earnings Call - Final,201,953,We're also thrilled to welcome 3 new members t...,OVERVIEW\nCo. reported FY23 revenues of $31.4b...,
94369,Q4 2023 Salesforce Inc Earnings Call - Final,215,1019,"We know that we have the right team, the right...",OVERVIEW\nCo. reported FY23 revenues of $31.4b...,


In [None]:
# List the column labels of combined_df.
combined_df.columns

Index(['Earnings Call Name', 'Tokenized Size', 'Non-Tokenized Size',
       'Macro Paragraph', 'Tentative_Target', 'Matched_KPIs'],
      dtype='object')

In [None]:
# Apply match_kpis to the first 15 rows of combined_df only (a trial run, not
# yet the complete DataFrame).

# Split the target text at every newline that directly follows a period.
# (nltk.sent_tokenize did not work as expected on these targets.)
pattern = r'(?<=\.)\n'

for i in range(15):
    text = combined_df.loc[i, 'Macro Paragraph']
    raw_target = combined_df.loc[i, 'Tentative_Target']

    # Tokenize the target paragraph into sentences
    target_sentences = re.split(pattern, raw_target)

    # Strip a leading list enumerator such as "1. " from each sentence, if present
    target = [re.sub(r'^\d+\.\s+', '', sentence) for sentence in target_sentences]

    # .at addresses exactly one cell, so a list-valued result is stored as a
    # single object instead of being treated as an iterable to align, which
    # is what .loc may do with list values.
    combined_df.at[i, 'Matched_KPIs'] = match_kpis(text, target)

------------------------------------------------------------
Macro paragraph:  OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instructions)
As a reminder, this conference is being recorded. I would now like to turn the conference over to our host, Head
of Investor Relations, Ms KC McClure. Please go ahead. KC MCCLURE, MANAGING DIRECTOR OF IR, ACCENTURE PLC: Thank you, Tom. Thanks everyone for
joining us today on our first-quarter FY15 earnings announcement. As Tom just mentioned, I'm KC McClure,
Managing Director, Head of Investor Relations. With me today are Pierre Nanterme, our Chairman and Chief
Executive Officer and David Rowland, our Chief Financial Officer. We hope you've had an opportunity to review
the news release we issued a short time ago.


In [None]:
combined_df.head()

Unnamed: 0,Earnings Call Name,Tokenized Size,Non-Tokenized Size,Macro Paragraph,Tentative_Target,Matched_KPIs
0,Q1 2015 Accenture PLC Earnings Call - Final,170,722,OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call. (Operator Instruct...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]
1,Q1 2015 Accenture PLC Earnings Call - Final,219,1094,Let me quickly outline the agenda for today's call. Pierre will begin with an overview of our re...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]
2,Q1 2015 Accenture PLC Earnings Call - Final,249,1180,"During our call today, we will reference certain non-GAAP financial measures, which we believe p...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]
3,Q1 2015 Accenture PLC Earnings Call - Final,202,837,We delivered new bookings of $7.7 billion in line with our expectations. We grew revenues 10% in...,"OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]
4,Q1 2015 Accenture PLC Earnings Call - Final,230,1064,"DAVID ROWLAND, CFO, ACCENTURE PLC: Thank you, Pierre. Happy Holidays to all of you. Thank you fo...","OVERVIEW\nCo. reported 1Q15 net revenues of $7.9b, net income of $892m and diluted EPS of $1.29....",[]


In [None]:
# Try match_kpis on a single paragraph (row label 10) against target_sentences_filt
tryout_text = combined_df.loc[10, "Macro Paragraph"]
target = match_kpis(tryout_text, target_sentences_filt)

print(target)

------------------------------------------------------------
Macro paragraph:  Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year. This reflects a
12% year-over-year increase Turning to DSOs, our day services outstanding continue to be industry leading. There were 37 days up from 36
days last quarter Free cash flow in the quarter was $821 million resulting from cash generated by operating activities of $873
million net of property and equipment additions of $52 million. Cash flows in the quarter were positively impacted
by a shift in the timing of a portion of compensation payments, which were paid in quarter one in prior years and
beginning this year, will be paid in quarter two, with no impact to full year cash flow Moving to our level of cash. Our cash balance at November 30 was $4.5 billion compared with $4.9 billion at
August 31 a

In [None]:
# Now do it for one complete earnings call of a single company.
# Row labels 0-12 of combined_df (13 rows) carry the first call's name, hence range(13).
# NOTE(review): `model`, `util`, `nlp` and `target_sentences_filt` must already exist in
# the kernel; `nlp` is loaded by a cell further down in this notebook.

# Minimum cosine similarity for a target sentence to be reported
SIMILARITY_THRESHOLD = 0.450

# The target sentences are the same for every paragraph, so embed and parse them once
# instead of once per paragraph.
target_embedings = model.encode(target_sentences_filt)
target_numbers = [
    [token.text for token in nlp(target_sentence) if token.like_num]
    for target_sentence in target_sentences_filt
]

for i in range(13):
    paragraph = combined_df.loc[i, "Macro Paragraph"]  # one paragraph
    print("PARAGRAPH SECTION")

    embeddings = model.encode(paragraph)

    # Cosine similarity between the paragraph and every target sentence
    cos_sim = util.cos_sim(embeddings, target_embedings)

    # One [score, 0, target position] entry per target sentence, highest score first
    all_sentence_combinations = sorted(
        ([cos_sim[0][j], 0, j] for j in range(len(cos_sim[0]))),
        key=lambda x: x[0],
        reverse=True,
    )
    print("---------------------------------------------")
    print("Paragraph: ", paragraph)
    print("Top-5 most similar sentences:")

    # Number-like tokens of the paragraph; computed once per paragraph
    docs_paragraphs = nlp(paragraph)
    numbers_paragraph = [token.text for token in docs_paragraphs if token.like_num]

    # NOTE(review): despite the "Top-5" header no limit is applied; every target that
    # passes the checks below is printed.
    # The middle element is unpacked as `_` so the outer loop index `i` is not shadowed.
    for score, _, j in all_sentence_combinations:
        numbers_target = target_numbers[j]
        if not numbers_target:
            continue  # target contains no figures at all
        if not any(x in numbers_target for x in numbers_paragraph):
            continue  # no figure shared between paragraph and target
        if score > SIMILARITY_THRESHOLD:
            print("====================================")
            print("TARGET:", target_sentences_filt[j])
            print("Similarity Score: {:.4f}".format(score))


PARAGRAPH SECTION
---------------------------------------------
Paragraph:  OPERATOR: Welcome to Accenture's first-quarter FY14 earnings conference call (Operator Instructions)
As a reminder, this conference is being recorded. I would now like to turn the conference over to our host, Head
of Investor Relations, Ms KC McClure. Please go ahead KC MCCLURE, MANAGING DIRECTOR OF IR, ACCENTURE PLC: Thank you, Tom. Thanks everyone for
joining us today on our first-quarter FY15 earnings announcement. As Tom just mentioned, I'm KC McClure,
Managing Director, Head of Investor Relations. With me today are Pierre Nanterme, our Chairman and Chief
Executive Officer and David Rowland, our Chief Financial Officer. We hope you've had an opportunity to review
the news release we issued a short time ago
Top-5 most similar sentences:
131
133
146
PARAGRAPH SECTION
---------------------------------------------
Paragraph:  Let me quickly outline the agenda for today's call. Pierre will begin with an overview

**NOTE:** We have to figure out a way to include only those sentences whose figures/digits match; otherwise the matches are meaningless. I'll try entity recognition next.

In [None]:
# One-time setup, left commented out: install spaCy and the en_core_web_sm model.
#!pip install spacy
#!python -m spacy download en_core_web_sm


In [None]:
# Additional approach: a target sentence is matched to a paragraph sentence only when
# every keyword of the target appears verbatim in that sentence's text.
import spacy

nlp = spacy.load('en_core_web_sm')

# Keywords (nouns, proper nouns, adjectives and number-like tokens) depend only on the
# target sentence, so extract them once instead of re-parsing inside the nested loops.
target_keywords = [
    (
        sentence,
        [token.text for token in nlp(sentence)
         if token.pos_ in ['NOUN', 'PROPN', 'ADJ'] or token.like_num],
    )
    for sentence in target_sentences_filt
]

# Process the paragraph
for paragraph_sentence in paragraph_sentences:
    doc = nlp(paragraph_sentence)
    print("ORIGINAL:", doc)

    # Collect the target sentences matched by any sentence of this paragraph
    extracted_sentences = []
    for sent in doc.sents:
        sent_text = sent.text.strip()
        for sentence, keywords in target_keywords:
            # all() over an empty list is True, so a target without keywords would
            # match every sentence; require at least one keyword.
            if keywords and all(key in sent_text for key in keywords):
                extracted_sentences.append(sentence)

    # Print the extracted sentences
    for extracted_sentence in extracted_sentences:
        print("TARGET:", extracted_sentence)


ORIGINAL: Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year.
TARGET: Net income $892m
TARGET: Diluted EPS $1.29
ORIGINAL: This reflects a
12% year-over-year increase Turning to DSOs, our day services outstanding continue to be industry leading.
ORIGINAL: There were 37 days up from 36
days last quarter Free cash flow in the quarter was $821 million resulting from cash generated by operating activities of $873
million net of property and equipment additions of $52 million.
TARGET: Free cash flow $821m
TARGET: Free cash flow $821m
TARGET: Resulting from cash generated by operating activities of $873m, net of property and equipment additions of
$52m
ORIGINAL: Cash flows in the quarter were positively impacted
by a shift in the timing of a portion of compensation payments, which were paid in quarter one in prior years and
beginning this ye

**NOTE:** This is a better approach, but it still misses many matches.

From here on it is just trial and error. I believe we have to find a way to enhance cosine similarity, or pair it with entity recognition, to improve performance.

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Entity-based approach: a target sentence is attributed to the first paragraph
# sentence whose named entities include all entities of the target.

# For every spaCy sentence of every paragraph keep its text and its (text, label)
# entities; the two lists are index-aligned.
paragraph_texts = []
paragraph_entities = []

for paragraph_sentence in paragraph_sentences:
    # Process the document
    doc = nlp(paragraph_sentence)
    for sent in doc.sents:
        paragraph_texts.append(sent.text)
        paragraph_entities.append([(ent.text, ent.label_) for ent in sent.ents])
print(paragraph_entities)

# Match relevant sentences to their paragraphs
relevant_sentences = []
for target_sentence in target_sentences_filt:
    # The targets are plain strings, so they must be parsed before reading .ents
    sentence_entities = {(ent.text, ent.label_) for ent in nlp(target_sentence).ents}
    if not sentence_entities:
        # An empty set is a subset of everything; skip targets without entities
        continue

    for origin_text, entities in zip(paragraph_texts, paragraph_entities):
        if sentence_entities.issubset(entities):
            relevant_sentences.append((target_sentence, origin_text))
            break

# Print relevant sentences and their paragraph of origin
for target_sentence, origin_text in relevant_sentences:
    print("Relevant Sentence:", target_sentence)
    print("Paragraph of Origin:", origin_text)
    print()


[[('$892 million', 'MONEY'), ('the first quarter', 'DATE'), ('$812 million', 'MONEY'), ('the same quarter last year', 'DATE'), ('Diluted', 'ORG'), ('1.29', 'MONEY'), ('EPS', 'ORG'), ('1.15', 'MONEY'), ('the first quarter last year', 'DATE')], [('12%', 'PERCENT'), ('year-over-year', 'DATE')], [('37 days', 'DATE'), ('36\ndays', 'DATE'), ('last quarter', 'DATE'), ('the quarter', 'DATE'), ('$821 million', 'MONEY'), ('873', 'MONEY'), ('$52 million', 'MONEY')], [('the quarter', 'DATE'), ('quarter one', 'DATE'), ('prior years', 'DATE'), ('this year', 'DATE'), ('quarter two', 'DATE')], [('November 30', 'DATE'), ('$4.5 billion', 'MONEY'), ('$4.9 billion', 'MONEY'), ('August 31', 'DATE'), ('this quarter', 'DATE'), ('November', 'DATE')]]


In [None]:
# Different approach: rule-based token patterns with spaCy's Matcher
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

sentences = [
    "1Q15 net income = $892m",
    "1Q15 diluted EPS = $1.29",
    "1Q15 YoverY net revenue growth (US dollars) = 7%"
]

# Define patterns for structural analysis
patterns = [
    [{"LOWER": "1q15"}, {"LOWER": "net"}, {"LOWER": "income"}],
    [{"LOWER": "1q15"}, {"LOWER": "diluted"}, {"LOWER": "eps"}]
]

# The matcher, the patterns and the candidate sentences do not depend on the paragraph,
# so build the matcher and run the matching once instead of once per paragraph.
matcher = Matcher(nlp.vocab)
matcher.add("PatternMatch", patterns)

# Keep the candidate sentences in which at least one pattern matches
extracted_sentences = [sentence for sentence in sentences if matcher(nlp(sentence))]

# NOTE(review): the matching never looks at the paragraph, so the same sentences are
# printed under every "ORIGINAL" paragraph; `entities` is computed but not used yet.
for paragraph_sentence in paragraph_sentences:
    doc = nlp(paragraph_sentence)
    print("ORIGINAL", doc)
    print()
    # Perform entity recognition
    entities = [ent.text.lower() for ent in doc.ents]

    # Print the extracted sentences
    for extracted_sentence in extracted_sentences:
        print(extracted_sentence)


ORIGINAL Net income was $892 million for the first quarter compared with $812 million for the same quarter last year Diluted earnings per share were $1.29 compared with EPS of $1.15 in the first quarter last year.

1Q15 net income = $892m
1Q15 diluted EPS = $1.29
ORIGINAL This reflects a
12% year-over-year increase Turning to DSOs, our day services outstanding continue to be industry leading.

1Q15 net income = $892m
1Q15 diluted EPS = $1.29
ORIGINAL There were 37 days up from 36
days last quarter Free cash flow in the quarter was $821 million resulting from cash generated by operating activities of $873
million net of property and equipment additions of $52 million.

1Q15 net income = $892m
1Q15 diluted EPS = $1.29
ORIGINAL Cash flows in the quarter were positively impacted
by a shift in the timing of a portion of compensation payments, which were paid in quarter one in prior years and
beginning this year, will be paid in quarter two, with no impact to full year cash flow Moving to ou

In [None]:
# Check where the first earnings call ends. .loc slicing is label-based and includes
# the end label, so 0:13 returns the rows labelled 0 through 13.
combined_df.loc[0:13,"Earnings Call Name"]

0     Q1 2015 Accenture PLC Earnings Call - Final
1     Q1 2015 Accenture PLC Earnings Call - Final
2     Q1 2015 Accenture PLC Earnings Call - Final
3     Q1 2015 Accenture PLC Earnings Call - Final
4     Q1 2015 Accenture PLC Earnings Call - Final
5     Q1 2015 Accenture PLC Earnings Call - Final
6     Q1 2015 Accenture PLC Earnings Call - Final
7     Q1 2015 Accenture PLC Earnings Call - Final
8     Q1 2015 Accenture PLC Earnings Call - Final
9     Q1 2015 Accenture PLC Earnings Call - Final
10    Q1 2015 Accenture PLC Earnings Call - Final
11    Q1 2015 Accenture PLC Earnings Call - Final
12    Q1 2015 Accenture PLC Earnings Call - Final
13          Q1 2015 ACE Ltd Earnings Call - Final
Name: Earnings Call Name, dtype: object