# GPT Embeddings Regressions

### All the Function

In [1]:
# takes df and extracts labels for each companies
def return_label(df,end_year):
    return_dict={}
    companies=pd.DataFrame(df[str(end_year)].dropna())
    
    for cik, label in companies.iterrows():
        return_dict[cik]=label[str(end_year)]
        
    return return_dict

In [2]:
# extracts regressors and target
from tqdm import tqdm
def get_xy(corpus,labels):
    X=[]
    Y=[]
    for ix in tqdm(labels.keys()):
        if corpus.get(str(ix),""):
            X.append(corpus[str(ix)])
            Y.append(labels[ix])
    return X,Y

In [3]:
# function for training models
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, precision_score, recall_score
import numpy as np

def do_train_10f(corpus, year):
    labels = return_label(rev_df, year)
    X, Y = get_xy(corpus, labels)
    
    X = np.array(X)
    Y = np.array(Y)

    not_nan_map = ~np.isnan(X)
    map_ = np.array([False if np.isnan(i).any() else True for i in X])

    X = X[map_]
    Y = Y[map_]

    # Initialize lists to store performance metrics for each fold
    lr_train_accs, lr_test_accs, svc_train_accs, svc_test_accs = [], [], [], []
    lr_prs, svc_prs = [], []
    lr_recals, svc_recals = [], []
    lr_f1s, svc_f1s = [], []
    ovr_aurocs, ovo_aurocs = [], []

    # Create a 10-fold stratified cross-validation splitter
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    for train_indices, test_indices in skf.split(X, Y):
        x_train, x_test = X[train_indices], X[test_indices]
        y_train, y_test = Y[train_indices], Y[test_indices]
        
        for test_year in range(year, 2022):
            labels = return_label(rev_df, year)
            X, Y = get_xy(corpus, labels)

        # Train-test split within the fold (0.1 test size)
        x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train)

        # Logistic Regression
        lr = LogisticRegression(n_jobs=-1, max_iter=1000)
        lr.fit(x_train, y_train)

        lr_train_acc = lr.score(x_train, y_train)
        lr_test_acc = lr.score(x_test, y_test)

        lr_pr = precision_score(y_test, lr.predict(x_test), average="weighted", zero_division=0)
        lr_rec = recall_score(y_test, lr.predict(x_test), average="weighted", zero_division=0)
        lr_f1 = precision_recall_fscore_support(y_test, lr.predict(x_test), average="weighted", zero_division=0)

        ovr_auroc = roc_auc_score(y_test, lr.predict_proba(x_test), multi_class="ovr", average="weighted")
        ovo_auroc = roc_auc_score(y_test, lr.predict_proba(x_test), multi_class="ovo", average="weighted")

        # Linear SVC
        l_svc = LinearSVC(dual=False)
        l_svc.fit(x_train, y_train)

        svc_train_acc = l_svc.score(x_train, y_train)
        svc_test_acc = l_svc.score(x_test, y_test)

        svc_pr = precision_score(y_test, l_svc.predict(x_test), average="weighted", zero_division=0)
        svc_rec = recall_score(y_test, l_svc.predict(x_test), average="weighted", zero_division=0)
        svc_f1 = precision_recall_fscore_support(y_test, l_svc.predict(x_test), average="weighted", zero_division=0)

        # Append performance metrics to lists
        lr_train_accs.append(lr_train_acc)
        lr_test_accs.append(lr_test_acc)
        svc_train_accs.append(svc_train_acc)
        svc_test_accs.append(svc_test_acc)
        lr_prs.append(lr_pr)
        lr_recals.append(lr_rec)
        lr_f1s.append(lr_f1)
        ovr_aurocs.append(ovr_auroc)
        ovo_aurocs.append(ovo_auroc)
        svc_prs.append(svc_pr)
        svc_recals.append(svc_rec)
        svc_f1s.append(svc_f1)

    # Calculate and return the mean values of the performance metrics
    return lr_train_accs, lr_test_accs, svc_train_accs, svc_test_accs, lr_prs, lr_recals, lr_f1s, svc_prs, svc_recals, svc_f1s, ovr_aurocs, ovo_aurocs


### Training models

In [4]:
import pickle
import pandas as pd
main_dict = {}
for year in tqdm(range(2011, 2021)):
    # pickle file for k-10 filings
    base_corpus=pickle.load(open(f"GPT/future/{year}_future_documents_embd.pkl","rb"))

    # labels csv file
    rev_df=pd.read_csv(f"labels/{year}.csv",index_col=1)

    performance_dict={'columns': ["lr_train_acc", "lr_test_acc", "svc_train_acc", "svc_test_acc", "lr_precision", "lr_recall", "lr_f1", "svc_precision", "svc_recall", "svc_f1", "OVO_auroc", "OVR_auroc"]}
    for i in range(int(year)+1,2023):
        performance_dict[i]=do_train_10f(base_corpus,i)
    main_dict[f"base_{year}"] = performance_dict


  0%|          | 0/10 [00:00<?, ?it/s]
100%|██████████| 3650/3650 [00:00<00:00, 939445.85it/s]

100%|██████████| 3650/3650 [00:00<00:00, 924357.54it/s]

100%|██████████| 3650/3650 [00:00<00:00, 1008910.61it/s]

100%|██████████| 3650/3650 [00:00<00:00, 1021908.39it/s]

100%|██████████| 3650/3650 [00:00<00:00, 1018983.60it/s]

100%|██████████| 3650/3650 [00:00<00:00, 1015805.83it/s]

100%|██████████| 3650/3650 [00:00<00:00, 1017426.04it/s]

100%|██████████| 3650/3650 [00:00<00:00, 1011309.92it/s]

100%|██████████| 3650/3650 [00:00<00:00, 964177.45it/s]

100%|██████████| 3650/3650 [00:00<00:00, 1012045.32it/s]

100%|██████████| 3650/3650 [00:00<00:00, 1014930.36it/s]
  0%|          | 0/10 [00:07<?, ?it/s]


TypeError: only integer scalar arrays can be converted to a scalar index

### Saving the results into a pickle file

In [None]:
import pickle

# Create a dictionary
my_dict = main_dict

# Specify the filename for the pickle file
pickle_filename = 'gpt_future.pkl'

# Open the file in binary write mode and use pickle.dump to save the dictionary
with open(pickle_filename, 'wb') as pickle_file:
    pickle.dump(my_dict, pickle_file)

print(f'Dictionary has been pickled and saved to {pickle_filename}')

### Training future models

In [None]:
import pickle
import pandas as pd
main_dict = {}
for year in tqdm(range(2011, 2021)):
    # pickle file for k-10 filings
    base_corpus=pickle.load(open(f"GPT/base/{year}_base_documents_embd.pkl","rb"))

    # labels csv file
    rev_df=pd.read_csv(f"labels/{year}.csv",index_col=1)

    performance_dict={'columns': ["lr_train_acc", "lr_test_acc", "svc_train_acc", "svc_test_acc", "lr_precision", "lr_recall", "lr_f1", "svc_precision", "svc_recall", "svc_f1", "OVO_auroc", "OVR_auroc"]}
    for i in range(int(year)+1,2023):
        performance_dict[i]=do_train_10f(base_corpus,i)
    main_dict[f"base_{year}"] = performance_dict


### Saving the results into a pickle file

In [None]:
import pickle

# Create a dictionary
my_dict = main_dict

# Specify the filename for the pickle file
pickle_filename = 'gpt_base.pkl'

# Open the file in binary write mode and use pickle.dump to save the dictionary
with open(pickle_filename, 'wb') as pickle_file:
    pickle.dump(my_dict, pickle_file)

print(f'Dictionary has been pickled and saved to {pickle_filename}')