## produce label-noised training sets

In [None]:
# OPTIONAL: Load the "autoreload" extension so that edited modules are picked up without restarting the kernel
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import os
import sys

In [None]:
def _sample_index_in_steps(candidates_df, fraction_per_step, steps):
    """Draw row index labels from ``candidates_df`` in ``steps`` rounds.

    Each round samples ``int(len(candidates_df) * fraction_per_step)`` rows
    (the size is fixed from the initial frame) with ``random_state=42`` and
    removes them from the pool before the next round, so no row is drawn twice.

    Raises:
        ValueError: if the fraction is negative or the rounds together ask for
            more rows than ``candidates_df`` contains.
    """
    if fraction_per_step < 0:
        raise ValueError(
            'noise fraction per step must be non-negative, got {}'.format(fraction_per_step))

    per_step = int(len(candidates_df) * fraction_per_step)
    if steps > 0 and per_step * steps > len(candidates_df):
        # Fail before doing any work instead of part-way through the loop.
        raise ValueError(
            'cannot draw {} steps of {} rows from only {} rows'.format(
                steps, per_step, len(candidates_df)))

    remaining_df = candidates_df
    selected_index = []
    for _ in range(steps):
        step_index = remaining_df.sample(n=per_step, random_state=42).index
        selected_index.extend(step_index.tolist())
        # Shrink the pool so the next round cannot pick the same rows again.
        remaining_df = remaining_df.drop(index=step_index)
    return selected_index


def noise_training_set(file, noise_per_step_positives, noise_per_step_negatives, pos_steps, neg_steps,
                       out_path='../../../data/interim/wdc-lspc/training-sets-noised/'):
    """Write a copy of a training set with part of its labels flipped.

    The input is read as gzip-compressed JSON lines and must have a ``label``
    column. Rows with label 1 are sampled and set to 0, rows with label 0 are
    sampled and set to 1; rows with any other label are left untouched.

    Sampling happens in rounds with a fixed seed (42), removing the drawn rows
    after every round. For a given file and per-step fraction, the rows flipped
    with ``k`` steps are therefore a superset of those flipped with ``k - 1``
    steps.

    Args:
        file: path to the gzip-compressed JSON-lines training set.
        noise_per_step_positives: fraction of the positive rows flipped per step.
        noise_per_step_negatives: fraction of the negative rows flipped per step.
        pos_steps: number of sampling rounds over the positive rows.
        neg_steps: number of sampling rounds over the negative rows.
        out_path: directory the noised file is written to (created if missing).

    The output file name is the input name with its ``.json.gz`` ending replaced
    by ``_<pos>_posnoise_<neg>_negnoise.json.gz``, where ``<pos>``/``<neg>`` are
    ``steps * fraction`` formatted with two decimals. If the input name has no
    ``.json.gz`` ending the suffix is appended, so different noise levels never
    overwrite each other.

    Raises:
        ValueError: if a fraction is negative or more rows are requested than
            the respective class contains.
    """
    train_df = pd.read_json(file, compression='gzip', lines=True)

    # Both index lists are drawn from the original labels before anything is
    # flipped, so a row can never be flipped twice.
    pos_noise_index = _sample_index_in_steps(
        train_df[train_df['label'] == 1], noise_per_step_positives, pos_steps)
    neg_noise_index = _sample_index_in_steps(
        train_df[train_df['label'] == 0], noise_per_step_negatives, neg_steps)

    train_df.loc[pos_noise_index, 'label'] = 0
    train_df.loc[neg_noise_index, 'label'] = 1

    extension = '.json.gz'
    noise_suffix = '_{:0.2f}_posnoise_{:0.2f}_negnoise.json.gz'.format(
        pos_steps*noise_per_step_positives, neg_steps*noise_per_step_negatives)

    file_name = os.path.basename(file)
    if file_name.endswith(extension):
        new_file_name = file_name[:-len(extension)] + noise_suffix
    else:
        new_file_name = file_name + noise_suffix

    os.makedirs(out_path, exist_ok=True)
    train_df.to_json(os.path.join(out_path, new_file_name), compression='gzip', lines=True, orient='records')

In [None]:
# Training set from which every noised variant below is derived.
_train_set = '../../../data/raw/wdc-lspc/training-sets/computers_train_xlarge.json.gz'

# (noise fraction per step, number of steps); the same setting is used for the
# positive and the negative class in each run.
_noise_settings = [
    (0.01, 1),
    (0.02, 1),
    (0.03, 1),
    (0.04, 1),
    (0.05, 1),
    (0.1, 1),
    (0.1, 2),
    (0.1, 3),
    (0.1, 4),
    (0.1, 5),
]

for _fraction, _steps in _noise_settings:
    noise_training_set(_train_set, _fraction, _fraction, _steps, _steps)