# Extract Embeddings: Combine Generated Synthetic Data with Financial PhraseBank Data

In [1]:
import os
import sys

import pandas as pd

from tqdm import tqdm

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Resolve the parent directory to a normalized absolute path so the imports that follow
# can resolve modules located there.
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))
# Guarded append: re-running this cell must not stack duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# import log_files
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction

In [2]:
# Notebook-wide pandas display settings, addressed by their fully-qualified option names.
# (A bare 'max_colwidth' relies on pandas matching option names by pattern, which raises
# if the pattern ever matches more than one option.)
DISPLAY_OPTIONS = {
    'display.max_colwidth': 800,  # render up to 800 characters per cell
    'display.max_columns': None,  # None = no cap on rendered columns
    'display.max_rows': None,     # None = no cap on rendered rows
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

## Load Data

In [3]:
# Input locations, all expressed relative to the notebook's working directory.
combined_csv_relpath = 'combined_generated_fin_phrase_bank/combined_generated_fin_phrase_bank-v1.csv'
base_data_path = os.path.join(notebook_dir, '../data/')
combine_data_path = os.path.join(base_data_path, combined_csv_relpath)

In [4]:
# Load the combined dataset and discard the 'Unnamed: 0' column
# (seemingly an index that was written into the CSV — TODO confirm).
# The drop returns a new frame rather than mutating in place, so `df` is built in one step.
df = (
    DataProcessing.load_from_file(combine_data_path, 'csv')
    .drop(columns=['Unnamed: 0'])
)
# Preview only the first rows: display.max_rows is unlimited in this notebook,
# so a bare `df` would render every row.
df.head(10)

Unnamed: 0,Base Sentence,Sentence Label
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1
7,"On August 25, 2024, to September 25, 2025, Citigroup speculates the net profit at Johnson & Johnson will likely increase.",1
8,"Bank of America predicts on 2024-08-21, the operating income at Visa may rise.",1
9,"According to Goldman Sachs, the research and development expenses at Alphabet would fall in 2029 Q2.",1


## Shuffle Data

In [5]:
# Randomize the row order.
# NOTE(review): no seed is passed here, so whether the order is reproducible across runs
# depends on DataProcessing.shuffle_df — confirm.
shuffled_df = DataProcessing.shuffle_df(df)
# Shuffling should only reorder rows, never add or drop them.
assert len(shuffled_df) == len(df), "shuffle_df changed the number of rows"
# Preview a few rows instead of rendering the whole frame.
shuffled_df.head(10)

Unnamed: 0,Base Sentence,Sentence Label
0,"Precipitation levels in Denver should stay the same in 2029-07-04, according to Meteorologist Michael Brown.",1
1,"In 2024-08-21, Environmental scientist Lisa Nguyen envisions that the carbon emissions at ExxonMobil has some probability to remain stable.",1
2,"The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+_o , as saying .",0
3,"According to Coach James Davis, the scoring average at the Denver Broncos would fall in 2024 of Q3.",1
4,"Dr. David Kim, a weather expert, predicts on Q3 of 2026, the wind speed at San Diego may rise.",1
5,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",1
6,"Incap Contract Manufacturing Services Pvt Ltd , a subsidiary of Incap Corporation of Finland , plans to double its revenues by 2007-2008 .",1
7,"According to economist Emily Patel, the unemployment rate at the United States would fall in January 2029.",1
8,"STORA ENSO , NORSKE SKOG , M-REAL , UPM-KYMMENE Credit Suisse First Boston ( CFSB ) raised the fair value for shares in four of the largest Nordic forestry groups .",0
9,"On 2027/08/20, the Federal Reserve speculates that interest rates at major banks will likely increase.",1


## Extract Sentence Embeddings

In [6]:
# Build the feature extractor over the shuffled frame; 'Base Sentence' is presumably the
# name of the text column to embed — verify against SpacyFeatureExtraction.
# The object is not echoed: its repr is only a memory address, which differs on every run
# and adds noise to the saved notebook.
spacy_fe = SpacyFeatureExtraction(shuffled_df, 'Base Sentence')

<feature_extraction.SpacyFeatureExtraction at 0x345175dd0>

In [8]:
# Run sentence-level feature extraction. With attach_to_df=True the result is a frame that
# carries the input columns plus an 'Embedding' column (see this cell's rendered output).
spacy_sentence_embeddings_df = spacy_fe.sentence_feature_extraction(attach_to_df=True)
# Later cells rely on this column, so fail here rather than further down if it is missing.
assert 'Embedding' in spacy_sentence_embeddings_df.columns, "expected an 'Embedding' column"
# Preview a few rows instead of rendering the whole frame.
spacy_sentence_embeddings_df.head(10)

100%|██████████| 105/105 [00:00<00:00, 234.26it/s]


Unnamed: 0,Base Sentence,Sentence Label,Embedding
0,"Precipitation levels in Denver should stay the same in 2029-07-04, according to Meteorologist Michael Brown.",1,"[-0.12282728, 0.27111018, 0.035108242, 0.003031217, 0.064736284, 0.01719045, 0.0077772248, 0.056214016, -0.056285575, 1.524844, -0.37263608, 0.011970997, 0.022705896, -0.0017075287, -0.16509187, -0.0019095814, 0.035784286, 1.1393528, -0.027593814, -0.08317876, -0.01761704, 0.0014797008, -0.015901804, -0.10865327, 0.044696484, -0.031102858, -0.13074993, 0.004364047, 0.0087294355, -0.0013188692, -0.06773261, 0.09686218, 0.012640004, 0.029441377, 0.13594306, 0.011400068, 0.018702881, 0.03349828, 0.019613286, -0.08120921, 0.047159955, -0.024262289, 0.11456801, 0.019920256, 0.01265728, 0.056524806, -0.067116365, -0.059756096, 0.07825048, -0.012200771, 0.022573339, 0.088026, -0.078573905, -0.13752636, 0.00068475946, 0.024191901, -0.040313948, 0.0066592926, 0.00067934225, -0.15865351, -0.0585..."
1,"In 2024-08-21, Environmental scientist Lisa Nguyen envisions that the carbon emissions at ExxonMobil has some probability to remain stable.",1,"[-0.10615556, 0.2840127, -0.034138516, 0.048429795, -0.051459327, 0.10221126, -0.03155069, 0.04775752, -0.119019, 1.550164, -0.20750885, 0.037153725, -0.03891483, -0.0021015552, -0.14981, 0.08462622, -0.03353012, 1.0599174, -0.12298432, -0.053530406, 0.033334557, 0.06669029, 0.045342278, -0.019844405, 0.14324325, 0.09565377, -0.07173137, 0.016630044, 0.080680445, 0.08708542, -0.076131225, 0.053336438, 0.06115622, 0.005948001, 0.06305403, -0.021259127, 0.064786345, 0.05284349, 0.0088904025, -0.09685647, 0.09848902, -0.013660461, 0.15108989, 0.020437771, -0.026175521, 0.00019325197, -0.072186835, -0.16497515, -0.009737886, 0.0045437724, 0.160857, 0.14052878, -0.06872517, -0.101791374, 0.050171312, -0.0170684, 0.019054309, -0.07746705, 0.055305306, -0.0142838815, -0.059969477, -0.12987989..."
2,"The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+_o , as saying .",0,"[-0.09497906, 0.17504272, -0.04316473, -0.045111295, 0.17251992, -0.029244727, -0.01743192, -0.23873259, -0.0468274, 1.8599616, -0.20045447, -0.06076155, 0.08271708, -0.11028655, -0.10557187, -0.013530947, -0.0008588657, 1.1229628, -0.05330467, -0.05398465, 0.05328592, 0.052132785, 0.0042436747, -0.043303672, -0.06593899, 0.0018710982, -0.1600442, -0.03146998, 0.0860306, 0.019732358, 0.03050232, -0.03539315, 0.09711149, 0.1669377, 0.014413761, -0.019016104, -0.063479, 0.02729283, 0.03540679, -0.01586014, 0.0053353356, 0.09406666, 0.113477565, -0.108105645, -0.0701852, 0.028951146, -0.035146296, 0.020402262, 0.096902125, 0.024123807, -0.02779102, 0.009795973, -0.051292695, -0.00094839284, -0.07242499, 0.024145907, -0.003986896, -0.0931013, 0.056247376, -0.056873057, 0.064774156, -0.0242..."
3,"According to Coach James Davis, the scoring average at the Denver Broncos would fall in 2024 of Q3.",1,"[-0.05958075, 0.24581957, 0.0479352, -0.019386, 0.1786363, -0.03195793, 0.073931, 0.082503, 0.15843263, 1.7585299, -0.2944509, 0.0609033, 0.18507555, 0.020627853, -0.053433884, -0.08984251, 0.054032695, 0.8735086, -0.02547969, -0.06825661, 0.027962396, 0.041074, 0.06239441, -0.16176488, -0.092233256, 0.25184637, -0.19955423, 0.10843928, 0.015474093, 0.10281143, 0.0064961193, 0.0023387999, 0.010181002, 0.16286406, 0.07177703, -0.09885599, 0.080496036, 0.063669786, -0.02421406, 0.010286324, -0.029904753, 0.12700607, -0.008654, -0.020337636, 0.08888666, -0.12328309, -0.086011514, 0.0020621426, 0.03873605, 0.064717606, 0.016041847, -0.018467302, -0.0285772, -0.2491248, -0.0033292002, -0.04648746, -0.028126603, 0.07288599, -0.017137745, -0.15678395, -0.041576583, -0.032831706, -0.049786996,..."
4,"Dr. David Kim, a weather expert, predicts on Q3 of 2026, the wind speed at San Diego may rise.",1,"[-0.13067363, 0.35169357, 0.021784047, -0.053679965, 0.06852861, 0.019200074, 0.10771113, -0.034946084, 0.0024307533, 1.5551059, -0.22897987, -0.16808039, 0.09922205, -0.004558087, -0.2257077, -0.09444069, -0.018291697, 1.1787034, -0.10345774, -0.051168777, 0.0648703, 0.10754419, -0.019828359, -0.1872799, -0.029651377, 0.12786008, -0.09871342, 0.053600334, 0.21595986, 0.01142312, 0.06925498, -0.012103782, -0.060455274, 0.09179687, 0.06636937, -0.032129396, 0.030544207, 0.118631594, -0.05207153, 0.019260623, -0.020367397, 0.030811694, 0.06418965, 0.04296716, 0.01737682, 0.05781513, -0.117226504, -0.15621205, 0.11340021, 0.061519865, 0.03221283, 0.15113835, -0.060592227, -0.02652539, -0.10352721, 0.06544248, 0.026431229, 0.024830176, 0.023281036, -0.19475557, -0.07726128, -0.006189312, 0..."
5,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",1,"[0.027561586, 0.17978388, -0.12160311, -0.17506763, 0.09838764, -0.028442068, 0.0070442134, -0.10004863, -0.07778189, 2.3718472, -0.29632595, 0.04517076, 0.07327089, 0.03280646, -0.097564556, -0.13470462, -0.014623605, 1.1926669, -0.12083377, -0.02115372, 0.044567905, 0.025868911, -0.021114, -0.045893855, -0.018226549, 0.06699476, -0.09968322, 0.015154851, 0.012717558, -0.08473158, -0.026674055, 0.021809561, 0.06706588, 0.14596973, 0.14126469, 0.0006512243, 0.04734464, -0.0019340527, 0.031127436, -0.15893845, 0.02454067, 0.06885848, 0.101522245, -0.150205, -0.015362305, 0.0045821685, -0.061781213, -0.09760085, 0.039505415, 0.09562087, 0.010188397, -0.003922761, -0.068458244, 0.029470773, 0.08056933, -0.05994092, 0.019173078, -0.09146221, 0.03059926, -0.1715696, 0.060933698, 0.024337906..."
6,"Incap Contract Manufacturing Services Pvt Ltd , a subsidiary of Incap Corporation of Finland , plans to double its revenues by 2007-2008 .",1,"[-0.030612804, 0.13095923, -0.06612695, -0.22371274, 0.08800603, -0.12893744, 0.004493895, -0.12591381, -0.107524864, 1.5189983, -0.31901017, 0.06413592, -0.058893755, -0.113733254, 0.022839928, -0.06774164, -0.03214064, 1.1019056, -0.06510759, 0.09005547, 0.039896842, 0.116881296, -0.08431413, 0.014112558, 0.105258636, -0.026337378, -0.0354307, -0.064720914, 0.14625883, 0.083749525, 0.06404439, -0.08565395, -0.15040323, 0.25613773, -0.0209818, -0.07075304, -0.033868317, 0.084723085, 0.02966366, -0.07561489, 0.0137408, 0.05565487, 0.13154468, -0.029696837, -0.12019548, -0.06637102, -0.022961797, -0.073182836, 0.14342448, 0.19705144, 0.03946726, -0.020988055, -0.05809728, 0.019074714, -0.02919341, 0.08736504, 0.05118023, 0.047664613, 0.007691807, -0.0959507, -0.07485854, -0.09772203, -0..."
7,"According to economist Emily Patel, the unemployment rate at the United States would fall in January 2029.",1,"[-0.10875027, 0.2722385, 0.037852425, -0.0144299, -0.007869415, -0.08095598, 0.05320638, -0.025100464, 0.065632984, 1.9728091, -0.4008701, -0.017748946, 0.055823416, 0.012555205, -0.07332679, -0.045071844, -0.03895979, 0.83433324, -0.07616797, 0.028996473, 0.09256933, 0.061287295, 0.13524275, -0.16592506, -0.07123984, 0.05605516, -0.15823686, -0.0028169483, -0.035262287, 0.13769738, -0.03922892, 0.078402944, 0.056524683, 0.03252521, 0.05159879, -0.016281262, -0.08797445, 0.04322957, -0.13452667, 0.006801155, 0.06516784, 0.008816313, 0.0095608905, -0.1144742, 0.08917142, 0.07475661, -0.035721157, -0.19276722, 0.108253576, -0.09213104, -0.034746576, 0.12791488, -0.07823473, -0.21839593, 0.0041529476, -0.07970811, 0.048019793, -0.0195492, -0.0044494127, -0.2597514, -0.096701674, -0.091073..."
8,"STORA ENSO , NORSKE SKOG , M-REAL , UPM-KYMMENE Credit Suisse First Boston ( CFSB ) raised the fair value for shares in four of the largest Nordic forestry groups .",0,"[-0.005943053, 0.15729912, -0.101663664, -0.04747168, 0.17984045, 0.026353298, 0.040704496, -0.031741887, -0.0013212783, 1.2830931, -0.30156302, 0.04194991, 0.047622945, -0.11694745, -0.014253504, 0.0237644, -0.03577601, 0.83854586, -0.054321844, 0.0024243295, 0.0076728556, 0.048846744, 0.04741292, -0.0002675462, 0.06477594, 0.0076617124, -0.007715884, 0.06146602, 0.009252985, 0.12989178, 0.0676145, -0.0073582274, 0.032229114, 0.1501138, 0.13244116, -0.009811595, -0.013941116, 0.0010493121, -0.019024882, -0.00927352, -0.026203226, 0.078611776, 0.08331343, 0.026452126, -0.015870191, 0.009790953, 0.035399493, -0.01720727, 0.07744529, -0.037347812, -0.101326965, 0.07356499, 0.037135232, 0.04475773, 0.0045982115, -0.008885927, -0.0028634022, -0.090517275, 0.007497437, -0.098485835, -0.0290..."
9,"On 2027/08/20, the Federal Reserve speculates that interest rates at major banks will likely increase.",1,"[-0.1819624, 0.25178966, -0.06822676, -0.019926462, -0.089038126, -0.06996482, -0.115359366, -0.04326476, -0.07003129, 2.3920352, -0.25396737, 0.047992382, 0.18774918, 0.03728704, -0.06693172, -0.06665924, -0.14754266, 1.0390995, -0.17305812, -0.03133512, -0.019005114, 0.11596616, -0.011821462, -0.08653368, -0.01892288, 0.15589063, -0.10245601, 0.05629094, -0.0012247115, 0.053754214, -0.032278627, 0.09060652, 0.13777657, -0.003365064, 0.030745892, 0.056799743, -0.05828475, 0.110756606, -0.047652643, -0.03982369, -0.058110815, 0.030933293, 0.08605982, -0.06819051, 0.13336769, 0.04059946, -0.012528418, -0.08884684, -0.021977596, 0.097921185, 0.03803194, -0.01231165, -0.06740895, 0.06621093, 0.05954753, -0.045862116, 0.025274118, -0.17518374, -0.105998576, -0.058269948, 0.041813303, -0.03..."


In [None]:
# The embeddings were computed on the shuffled rows, so they are taken together with their
# sentences and labels from the extraction result. Assigning them onto the unshuffled `df`
# could pair sentences with the wrong vectors, because the row order may not line up.
# The 'Embedding' column is renamed to 'Embeddings', the name this cell exposes on `df`.
df = spacy_sentence_embeddings_df.rename(columns={'Embedding': 'Embeddings'})
# Preview a few rows instead of rendering the whole frame.
df.head(10)

## Split Data

In [None]:
# Train/test split of the embedded dataset, using the frame produced by the extraction cell.
# TODO(review): only the frame is passed here; split_data may need further arguments
# (e.g. feature/label columns, test size) — check its signature in data_processing.
X_train, X_test, y_train, y_test = DataProcessing.split_data(spacy_sentence_embeddings_df)