In [94]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import gzip
import shutil
import gensim
import nlpaug.augmenter.word as naw
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer

In [95]:
random_state = 42
np.random.seed(random_state)

In [96]:
gn_vec_zip_path = "GoogleNews-vectors-negative300.bin.gz"
gn_vec_path = "GoogleNews-vectors-negative300.bin"
with gzip.open(gn_vec_zip_path, 'rb') as f_in:
    with open(gn_vec_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [97]:
aug = naw.WordEmbsAug(model_type='word2vec', model_path=gn_vec_path, action="substitute")

In [98]:
data_path = "KECSACQIProject_DATA_2022-03-31_full dataset for CS.csv"
data = pd.read_csv(data_path, header=0, names=["study_id", "label", "text"], encoding='unicode_escape')
data.head()

Unnamed: 0,study_id,label,text
0,2,0,Reason for Exam: CHRONIC LOWER BACK PAIN. GET...
1,3,0,Narrative & Impression MR LUMBAR SPINE Re...
2,4,0,MR LUMBAR SPINE Reason for Exam: PROGRES...
3,5,0,MR CERVICAL SPINE Reason for Exam: HAS HX...
4,6,0,MRI lumbar spine Comparison: No prior ...


In [99]:
def augment_text(df,samples=300,pr=0.2):
    aug.aug_p=pr
    new_text=[]

    df_n=df[df.label==1].reset_index(drop=True)
    ## data augmentation loop
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        text = df_n.iloc[i]['text']
        augmented_text = aug.augment(text)
        new_text.append(augmented_text)
    
    ## dataframe
    new_data=shuffle(pd.DataFrame({'text':new_text,'label':1}), random_state=random_state)
    return new_data

In [102]:
def remove_stop_words(X_train):
    print(X_train)
    vectorizer = CountVectorizer(lowercase=True, max_df=0.9, min_df=2)
    X = vectorizer.fit_transform(X_train)
    vocab = vectorizer.get_feature_names_out()
    X_train = X_train.apply(lambda x: gensim.utils.simple_preprocess(x))
    texts = []
    for text in X_train:
        new_text = []
        for word in text:
            if word in vocab:
                new_text.append(word)
        texts.append(' '.join(new_text))
    return pd.Series(texts)

In [106]:
k = 5
folds = KFold(n_splits=k, random_state=random_state, shuffle=True)
fold = 0
for train_index, test_index in folds.split(data['text'], data['label']):
    fold += 1
    X_train, Y_train, study_id_train = data['text'][train_index], data['label'][train_index], data['study_id'][train_index]
    X_test, Y_test, study_id_test = data['text'][test_index], data['label'][test_index], data['study_id'][test_index]
    
    # X_train = remove_stop_words(X_train)
    # X_test = remove_stop_words(X_test)
    X_Y_train = pd.DataFrame({'study_id':study_id_train, 'text':X_train,'label':Y_train})
    X_Y_test = pd.DataFrame({'study_id':study_id_test, 'text':X_test,'label':Y_test})
    
    X_Y_augmented = augment_text(X_Y_train, samples=2)
    X_Y_augmented_train =shuffle(X_Y_train.append(X_Y_augmented).reset_index(drop=True), random_state=random_state)

    X_Y_train.to_csv(f"fold_{fold}_train.csv", index=False)
    X_Y_augmented_train.to_csv(f"fold_{fold}_augmented_train.csv", index=False)
    X_Y_test.to_csv(f"fold_{fold}_test.csv", index=False)
    
    

100%|██████████| 2/2 [00:03<00:00,  1.62s/it]
  X_Y_augmented_train =shuffle(X_Y_train.append(X_Y_augmented).reset_index(drop=True), random_state=random_state)
100%|██████████| 2/2 [00:01<00:00,  1.12it/s]
  X_Y_augmented_train =shuffle(X_Y_train.append(X_Y_augmented).reset_index(drop=True), random_state=random_state)
100%|██████████| 2/2 [00:03<00:00,  1.87s/it]
  X_Y_augmented_train =shuffle(X_Y_train.append(X_Y_augmented).reset_index(drop=True), random_state=random_state)
100%|██████████| 2/2 [00:03<00:00,  1.82s/it]
  X_Y_augmented_train =shuffle(X_Y_train.append(X_Y_augmented).reset_index(drop=True), random_state=random_state)
100%|██████████| 2/2 [00:02<00:00,  1.37s/it]
  X_Y_augmented_train =shuffle(X_Y_train.append(X_Y_augmented).reset_index(drop=True), random_state=random_state)
