In [1]:
def return_label(df,end_year):
    return_dict={}
    companies=pd.DataFrame(df[str(end_year)].dropna())
    
    for cik, label in companies.iterrows():
        return_dict[cik]=label[str(end_year)]
        
    return return_dict

In [2]:
from tqdm import tqdm
def get_xy(corpus,labels):
    X=[]
    Y=[]
    for ix in tqdm(labels.keys()):
        if corpus.get(str(ix),""):
            X.append(corpus[str(ix)])
            Y.append(labels[ix])
    return X,Y

In [3]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, precision_score, recall_score
import numpy as np

def do_train_10f(corpus, year):
    labels = return_label(rev_df, year)
    X, Y = get_xy(corpus, labels)
    
    X = np.array(X)
    Y = np.array(Y)

    not_nan_map = ~np.isnan(X)
    map_ = np.array([False if np.isnan(i).any() else True for i in X])

    X = X[map_]
    Y = Y[map_]

    # Initialize lists to store performance metrics for each fold
    lr_train_accs, lr_test_accs, svc_train_accs, svc_test_accs = [], [], [], []
    lr_prs, svc_prs = [], []
    lr_recals, svc_recals = [], []
    lr_f1s, svc_f1s = [], []
    ovr_aurocs, ovo_aurocs = [], []

    # Create a 10-fold stratified cross-validation splitter
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    for train_indices, test_indices in skf.split(X, Y):
        x_train, x_test = X[train_indices], X[test_indices]
        y_train, y_test = Y[train_indices], Y[test_indices]
        
        # Train-test split within the fold (0.1 test size)
        x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train)

        # Logistic Regression
        lr = LogisticRegression(n_jobs=-1, max_iter=1000)
        lr.fit(x_train, y_train)

        lr_train_acc = lr.score(x_train, y_train)
        lr_test_acc = lr.score(x_test, y_test)

        lr_pr = precision_score(y_test, lr.predict(x_test), average="weighted", zero_division=0)
        lr_rec = recall_score(y_test, lr.predict(x_test), average="weighted", zero_division=0)
        lr_f1 = precision_recall_fscore_support(y_test, lr.predict(x_test), average="weighted", zero_division=0)

        ovr_auroc = roc_auc_score(y_test, lr.predict_proba(x_test), multi_class="ovr", average="weighted")
        ovo_auroc = roc_auc_score(y_test, lr.predict_proba(x_test), multi_class="ovo", average="weighted")

        # Linear SVC
        l_svc = LinearSVC(dual=False)
        l_svc.fit(x_train, y_train)

        svc_train_acc = l_svc.score(x_train, y_train)
        svc_test_acc = l_svc.score(x_test, y_test)

        svc_pr = precision_score(y_test, l_svc.predict(x_test), average="weighted", zero_division=0)
        svc_rec = recall_score(y_test, l_svc.predict(x_test), average="weighted", zero_division=0)
        svc_f1 = precision_recall_fscore_support(y_test, l_svc.predict(x_test), average="weighted", zero_division=0)

        # Append performance metrics to lists
        lr_train_accs.append(lr_train_acc)
        lr_test_accs.append(lr_test_acc)
        svc_train_accs.append(svc_train_acc)
        svc_test_accs.append(svc_test_acc)
        lr_prs.append(lr_pr)
        lr_recals.append(lr_rec)
        lr_f1s.append(lr_f1)
        ovr_aurocs.append(ovr_auroc)
        ovo_aurocs.append(ovo_auroc)
        svc_prs.append(svc_pr)
        svc_recals.append(svc_rec)
        svc_f1s.append(svc_f1)
    print("works till here")
    # Calculate and return the mean values of the performance metrics
    return lr_train_accs, lr_test_accs, svc_train_accs, svc_test_accs, lr_prs, lr_recals, lr_f1s, svc_prs, svc_recals, svc_f1s, ovr_aurocs, ovo_aurocs


In [4]:
import pickle
import pandas as pd
main_dict = {}
for year in tqdm(range(2012, 2021)):
    # pickle file for k-10 filings
    base_corpus=pickle.load(open(f"/scratch/pk2286/Backpropped/embeddings/{year}_backproped_embd.pkl","rb"))

    # labels csv file
    rev_df=pd.read_csv(f"/scratch/pk2286/labels/{2011}.csv",index_col=1)

    performance_dict={'columns': ["lr_train_acc", "lr_test_acc", "svc_train_acc", "svc_test_acc", "lr_precision", "lr_recall", "lr_f1", "svc_precision", "svc_recall", "svc_f1", "OVO_auroc", "OVR_auroc"]}
    
    performance_dict[year]=do_train_10f(base_corpus,year)
    main_dict[f"base_{year}"] = performance_dict


  0%|          | 0/9 [00:00<?, ?it/s]
100%|██████████| 3650/3650 [00:00<00:00, 878577.31it/s]
 11%|█         | 1/9 [11:16<1:30:15, 676.97s/it]

works till here



100%|██████████| 3673/3673 [00:00<00:00, 838358.65it/s]
 22%|██▏       | 2/9 [21:43<1:15:29, 647.09s/it]

works till here



100%|██████████| 3638/3638 [00:00<00:00, 807775.43it/s]
 33%|███▎      | 3/9 [31:58<1:03:16, 632.67s/it]

works till here



100%|██████████| 3664/3664 [00:00<00:00, 850843.20it/s]
 44%|████▍     | 4/9 [42:07<51:55, 623.16s/it]  

works till here



100%|██████████| 3682/3682 [00:00<00:00, 842613.89it/s]
 56%|█████▌    | 5/9 [52:23<41:23, 620.77s/it]

works till here



100%|██████████| 3675/3675 [00:00<00:00, 855861.59it/s]
 67%|██████▋   | 6/9 [1:02:56<31:14, 624.93s/it]

works till here



100%|██████████| 3653/3653 [00:00<00:00, 832344.23it/s]
 78%|███████▊  | 7/9 [1:13:50<21:08, 634.26s/it]

works till here



100%|██████████| 3701/3701 [00:00<00:00, 856400.70it/s]
 89%|████████▉ | 8/9 [1:24:20<10:32, 632.94s/it]

works till here



100%|██████████| 3610/3610 [00:00<00:00, 822546.58it/s]
100%|██████████| 9/9 [1:34:14<00:00, 628.28s/it]

works till here





In [None]:
import os

file_path = "/scratch/pk2286/Backpropped/embeddings/2012_backproped_embd.pkl"

if os.path.exists(file_path):
    print(f"The file '{file_path}' exists.")
else:
    print(f"The file '{file_path}' does not exist.")


In [11]:
list(main_dict.keys())

['base_2012',
 'base_2013',
 'base_2014',
 'base_2015',
 'base_2016',
 'base_2017',
 'base_2018',
 'base_2019',
 'base_2020']

In [30]:
columns = ['year', 'lr_train_acc',
   'lr_test_acc',
   'svc_train_acc',
   'svc_test_acc']
data = []
for key in main_dict:
    year = int(key.split("_")[-1])
    metrics = main_dict[key][year]
    list_ = [sum(i)/len(i) for i in  metrics[:4]]
    list_.insert(0, year)  
    data.append(list_)
    list_ = []
df = pd.DataFrame(data, columns=columns)

In [31]:
df

Unnamed: 0,year,lr_train_acc,lr_test_acc,svc_train_acc,svc_test_acc
0,2012,0.992174,0.462931,0.998944,0.450862
1,2013,0.994294,0.457328,0.998801,0.449569
2,2014,0.996156,0.486463,0.99893,0.456769
3,2015,0.996572,0.469264,0.99971,0.450649
4,2016,0.995627,0.4625,0.998847,0.446552
5,2017,0.993671,0.45,0.998753,0.44569
6,2018,0.995437,0.456034,0.99928,0.449138
7,2019,0.997036,0.447639,0.999713,0.439056
8,2020,0.998229,0.473451,0.999262,0.464602


In [25]:
[sum(i)/len(i) for i in metrics[:4]]

[0.9982292179045744,
 0.47345132743362833,
 0.999262174126906,
 0.46460176991150437]

In [28]:
df