In [55]:
import os
import sys

import pandas as pd

from datasets import Dataset, DatasetDict

# Working directory of the notebook kernel; all data/module paths below are built from it.
notebook_dir = os.getcwd()
# Make the parent directory importable so the project-local modules can be found.
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
parent_dir = os.path.join(notebook_dir, '../')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# import log_files
from data_processing import DataProcessing
from text_generation_models import TextGenerationModelFactory

In [4]:
from datasets import load_dataset

# Load the `imdb` dataset by name and display its splits; the cell output
# shows the text/label feature layout of each split.
imdb = load_dataset(path="imdb")
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# Path of the `data/` directory, located one level above the notebook's working directory.
# Nothing is read here; later cells join sub-folder names onto this path.
data_path = os.path.join(notebook_dir, '../data/')

In [30]:
def load_data_to_df(base_path, file_name, file_type='csv'):
    """Load one data file through ``DataProcessing.load_from_file``.

    Args:
        base_path: Directory the file lives in.
        file_name: File name or path relative to ``base_path``. An absolute
            path is also accepted; ``os.path.join`` then ignores ``base_path``.
        file_type: Format identifier forwarded to the loader. Defaults to
            ``'csv'``, the value this helper previously hard-coded.

    Returns:
        Whatever ``DataProcessing.load_from_file`` returns for the resolved
        path (displayed as a DataFrame in this notebook).
    """
    file_path = os.path.join(base_path, file_name)
    # Echo the resolved path so the cell output records which file was read.
    print(file_path)
    return DataProcessing.load_from_file(file_path, file_type)

In [35]:
# Folder holding the prediction logs; `prediction_file_path` is a second name for the same folder.
predictions_path = os.path.join(data_path, 'prediction_logs/')
prediction_file_path = predictions_path

# Full path of each batch CSV (batch 1 feeds the train frame, batch 2 the test frame).
p_batch_1, p_batch_2 = (
    os.path.join(prediction_file_path, relative_csv)
    for relative_csv in (
        'batch_1-prediction/batch_1-from_df.csv',
        'batch_2-prediction/batch_2-from_df.csv',
    )
)

# Each call prints the resolved file path before loading it.
train_p_df = load_data_to_df(prediction_file_path, p_batch_1)
test_p_df = load_data_to_df(prediction_file_path, p_batch_2)
test_p_df

/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv


Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"Based on my analysis as a financial expert, I ...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Goldman Sachs speculates t...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,2
2,"JP Morgan Chase predicts that on January 1, 20...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,3
3,"According to a report by BlackRock, the operat...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,4
4,"In Q2 of 2025, a financial research advisor en...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,5
...,...,...,...,...,...,...,...
67,"On 08/21/2024, the miscellaneous expert at NAS...",1,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,2
68,The miscellaneous analyst at Goldman Sachs pre...,1,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,3
69,According to the miscellaneous top executive a...,1,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,4
70,"In 21/08/2024, the miscellaneous reporter at C...",1,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,5


In [36]:
# Folder holding the observation logs; `observation_file_path` is a second name for the same folder.
observations_path = os.path.join(data_path, 'observation_logs/')
observation_file_path = observations_path

# Full path of each batch CSV (batch 1 feeds the train frame, batch 2 the test frame).
o_batch_1, o_batch_2 = (
    os.path.join(observation_file_path, relative_csv)
    for relative_csv in (
        'batch_1-observation/batch_1-from_df.csv',
        'batch_2-observation/batch_2-from_df.csv',
    )
)

# Each call prints the resolved file path before loading it.
train_o_df = load_data_to_df(observation_file_path, o_batch_1)
test_o_df = load_data_to_df(observation_file_path, o_batch_2)
test_o_df

/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/observation_logs/batch_1-observation/batch_1-from_df.csv
/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/notebook_experiments/../data/observation_logs/batch_2-observation/batch_2-from_df.csv


Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,A financial analyst at JPMorgan observed that ...,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On 10/12/2025 to 10/12/2026, a research adviso...",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,A senior level executive at Microsoft predicts...,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to a financial expert at Bloomberg, ...",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 2025, a college student envisions that the ...",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
...,...,...,...,...,...,...,...
201,"On 07/15/2022, the marketing director at Nike ...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,2
202,The financial advisor at Morgan Stanley predic...,0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,3
203,"According to the CEO of Amazon, the customer s...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,4
204,"In 2023 of Q4, the research advisor at Harvard...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,5


In [37]:
# Combine the batch-1 prediction and observation frames into a single training frame.
# NOTE(review): row order / index handling depend on DataProcessing.concat_dfs,
# which is defined outside this notebook — confirm it keeps list order.
train_dfs = [train_p_df, train_o_df]
train_base_df = DataProcessing.concat_dfs(train_dfs)
train_base_df

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit a...,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates...",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operatin...",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Goldman Sachs, the research and d...",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 21 August 2024, Morgan Stanley envisions th...",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
...,...,...,...,...,...,...,...
268,"On 05/15/2022, the financial advisor at Morgan...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,2
269,The marketing director at Nike predicted on 11...,0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,3
270,"According to the CEO of Apple, the market shar...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,4
271,"In 2023, the research advisor at MIT envisione...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,5


In [38]:
# Combine the batch-2 prediction and observation frames into a single test frame.
# NOTE(review): row order / index handling depend on DataProcessing.concat_dfs,
# which is defined outside this notebook — confirm it keeps list order.
test_dfs = [test_p_df, test_o_df]
test_base_df = DataProcessing.concat_dfs(test_dfs)
test_base_df

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"Based on my analysis as a financial expert, I ...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Goldman Sachs speculates t...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,2
2,"JP Morgan Chase predicts that on January 1, 20...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,3
3,"According to a report by BlackRock, the operat...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,4
4,"In Q2 of 2025, a financial research advisor en...",1,finance,mixtral-8x7b-instruct,NAVI_GATOR,0,5
...,...,...,...,...,...,...,...
273,"On 07/15/2022, the marketing director at Nike ...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,2
274,The financial advisor at Morgan Stanley predic...,0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,3
275,"According to the CEO of Amazon, the customer s...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,4
276,"In 2023 of Q4, the research advisor at Harvard...",0,miscellaneous,mistral-small-3.1,NAVI_GATOR,0,5


In [51]:
# Rename the sentence/label columns to 'text' and 'label' on both frames, in place.
column_renames = {'Base Sentence': 'text', 'Sentence Label': 'label'}
for base_df in (train_base_df, test_base_df):
    base_df.rename(columns=column_renames, inplace=True)

In [52]:
# Columns to discard: everything except the model inputs 'text' and 'label'.
# Selecting by name (rather than slicing off the first two positions) stays
# correct if the CSV column order changes or extra columns are added.
keep_cols = ['text', 'label']
missing_cols = [col for col in keep_cols if col not in train_base_df.columns]
# Fail loudly if the rename cell has not been run, instead of dropping the wrong columns.
assert not missing_cols, f"expected columns missing from train_base_df: {missing_cols}"
drop_cols = [col for col in train_base_df.columns if col not in keep_cols]
drop_cols

['Domain', 'Model Name', 'API Name', 'Batch ID', 'Template Number']

In [53]:
# Remove the `drop_cols` columns from the train frame first, then the test frame.
train_sentence_label_df, test_sentence_label_df = (
    DataProcessing.drop_df_columns(base_df, drop_cols)
    for base_df in (train_base_df, test_base_df)
)
test_sentence_label_df

Unnamed: 0,text,label
0,"Based on my analysis as a financial expert, I ...",1
1,"On August 21, 2024, Goldman Sachs speculates t...",1
2,"JP Morgan Chase predicts that on January 1, 20...",1
3,"According to a report by BlackRock, the operat...",1
4,"In Q2 of 2025, a financial research advisor en...",1
...,...,...
273,"On 07/15/2022, the marketing director at Nike ...",0
274,The financial advisor at Morgan Stanley predic...,0
275,"According to the CEO of Amazon, the customer s...",0
276,"In 2023 of Q4, the research advisor at Harvard...",0


In [54]:
# Convert each pandas frame into a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the result, so the
# features stay exactly ['text', 'label'] even if an upstream step leaves a
# non-default index (which would otherwise show up as `__index_level_0__`).
train_dataset = Dataset.from_pandas(train_sentence_label_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_sentence_label_df, preserve_index=False)
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 278
})

In [57]:
# Bundle the two splits into one DatasetDict under the 'train' and 'test' keys.
dataset_dict = DatasetDict(dict(train=train_dataset, test=test_dataset))
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 273
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 278
    })
})

In [59]:
# Write the train/test DatasetDict to the `text_classification_tutorial/` folder under data_path.
train_test_p_o_path = os.path.join(data_path, 'text_classification_tutorial/')
dataset_dict.save_to_disk(train_test_p_o_path)

Saving the dataset (0/1 shards):   0%|          | 0/273 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/278 [00:00<?, ? examples/s]