In [3]:
import os
from io import StringIO
import pandas as pd
import wget
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/utils.py')
import numpy as np
#from utils import *
import torch
import seaborn as sns
import csv

def produce_NA(X, p_miss, mecha="MCAR", opt=None, p_obs=None, q=None):
    """
    Generate missing values for specifics missing-data mechanism and proportion of missing values. 
    
    Parameters
    ----------
    X : torch.DoubleTensor or np.ndarray, shape (n, d)
        Data for which missing values will be simulated.
        If a numpy array is provided, it will be converted to a pytorch tensor.
    p_miss : float
        Proportion of missing values to generate for variables which will have missing values.
    mecha : str, 
            Indicates the missing-data mechanism to be used. "MCAR" by default, "MAR", "MNAR" or "MNARsmask"
    opt: str, 
         For mecha = "MNAR", it indicates how the missing-data mechanism is generated: using a logistic regression ("logistic"), quantile censorship ("quantile") or logistic regression for generating a self-masked MNAR mechanism ("selfmasked").
    p_obs : float
            If mecha = "MAR", or mecha = "MNAR" with opt = "logistic" or "quanti", proportion of variables with *no* missing values that will be used for the logistic masking model.
    q : float
        If mecha = "MNAR" and opt = "quanti", quantile level at which the cuts should occur.
    
    Returns
    ----------
    A dictionnary containing:
    'X_init': the initial data matrix.
    'X_incomp': the data with the generated missing values.
    'mask': a matrix indexing the generated missing values.s
    """
    
    to_torch = torch.is_tensor(X) ## output a pytorch tensor, or a numpy array
    if not to_torch:
        X = X.astype(np.float32)
        X = torch.from_numpy(X)
    
    if mecha == "MAR":
        mask = MAR_mask(X, p_miss, p_obs).double()
    elif mecha == "MNAR" and opt == "logistic":
        mask = MNAR_mask_logistic(X, p_miss, p_obs).double()
    elif mecha == "MNAR" and opt == "quantile":
        mask = MNAR_mask_quantiles(X, p_miss, q, 1-p_obs).double()
    elif mecha == "MNAR" and opt == "selfmasked":
        mask = MNAR_self_mask_logistic(X, p_miss).double()
    else:
        mask = (torch.rand(X.shape) < p_miss).double()
    
    X_nas = X.clone()
    X_nas[mask.bool()] = np.nan
    
    return {'X_init': X.double(), 'X_incomp': X_nas.double(), 'mask': mask}


def process_arff_file(file_path):
    # Read the file
    with open(file_path, 'r') as f:
        lines = f.readlines()
    
    # Find the index where '@data' appears
    data_index = next(i for i, line in enumerate(lines) if line.startswith('@data'))
    
    
    # Iterate over each line
    for i in range(0,data_index):
        if lines[i].startswith("@attribute class"):  # Check if line starts with "@attribute class"
            # If it does, replace it with "@attribute class1"
            lines[i] = lines[i].replace("@attribute class {0.0,1.0}", "@attribute class {0,1}")
            lines[i] = lines[i].replace("@attribute class {positive,negative}", "@attribute class {0,1}")
            lines[i] = lines[i].replace("@attribute class {class1,class2}", "@attribute class {0,1}")
            lines[i] = lines[i].replace("@attribute class {groupA,groupB}", "@attribute class {0,1}")
            
            
    
    #save first (n+1) rows
    lines_arff_header = lines[0:data_index+2]

    lines = lines[data_index+2:]
    lines = [line.rstrip(',') for line in lines]

    lines_csv_file_path = file_path.replace('.arff', '_temp.csv')
    with open(lines_csv_file_path, 'w') as lines_csv_file:
        lines_csv_file.writelines(lines)
        
    print(lines_csv_file_path)
    
    
    df = pd.read_csv(lines_csv_file_path, header=None)

    
    df.iloc[:, -1] = df.iloc[:, -1].astype(str)
    # Replace values in the last column
    df.iloc[:, -1].replace({
        '0.0': '0',
        '1.0': '1',
        'positive': '0',
        'negative': '1',
        'class1': '0',
        'class2': '1',
        'groupA': '0',
        'groupB': '1'
    }, inplace=True)

    os.remove(lines_csv_file_path)

    csv_file_path = file_path.replace('.arff', '_processing.csv')
    
    df.insert(0, 'New_Column', range(1, len(df) + 1))
    df.to_csv(csv_file_path, index=False,header=False)

    majority_data = df[df.iloc[:, -1] == '0']
    minority_data = df[df.iloc[:, -1] == '1']

    first_column = minority_data.iloc[:, 0]  # First column, contains sorting index
    first_column = first_column.reset_index(drop=True)

    actual_minority_data = minority_data.iloc[:, 1:]  # All other columns containing actual minority data

    
    
    perc_arr = [0.05,0.10,0.15,0.20,0.25,0.30]
    perc_arr_for_file = [5,10,15,20,25,30]
    for i in range (0,len(perc_arr)):
        perc = perc_arr[i]
        # Perform operations on the minority data to create missing values
        X_miss_mcar = produce_NA(actual_minority_data.iloc[:, :-1].values, p_miss=perc, mecha="MCAR")

        #print(X_miss_mcar)

        X_mcar = X_miss_mcar['X_incomp']
        R_mcar = X_miss_mcar['mask']
        t_np =X_mcar.numpy() #convert to Numpy array
        df_temp = pd.DataFrame(t_np) #convert to a dataframe
        print(df_temp.shape)
        # Remove the first row
        #df_temp = df_temp.drop(df_temp.index[0])

        # Remove the first column
        #df_temp = df_temp.drop(df_temp.columns[0], axis=1)
        minority_data = pd.concat([first_column, df_temp.reset_index(drop=True)], axis=1)
        minority_data[minority_data.columns[-1] + 1] = 1 #create class label
        minority_data = minority_data.reset_index(drop=True)
        majority_data = majority_data.reset_index(drop=True)
        all_data = pd.concat([majority_data, minority_data], axis=0)
        all_data = all_data.sort_values(by=all_data.columns[0])

        all_data = all_data.iloc[:, 1:]  # Remove the sorting index column
        
        csv_file_path = file_path.replace('.arff', '_'+str(perc_arr_for_file[i])+'perc.csv')
        all_data.to_csv(csv_file_path,header=False,index=False) #save to file
        # Convert the DataFrame to a comma-separated string
        df_csv_string = all_data.to_csv(index=False,header=False)

        # Concatenate the lines from the ARFF file and the DataFrame CSV string
        combined_string = ''.join(lines_arff_header) + '\n' + df_csv_string

        # Write the combined string to a file
        arff_file_path = file_path.replace('.arff', '_'+str(perc_arr_for_file[i])+'perc.arff')
        with open(arff_file_path, 'w') as f:
            f.write(combined_string)
    
     
    
    
    
    #sys.exit()
    
    
    
    
    #read the CSV file again
    
def process_folder(folder_path):
    # Get a list of all files in the folder
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    
    # Process each file
    for file_name in files:
        if file_name.endswith('.arff'):
            file_path = os.path.join(folder_path, file_name)
            process_arff_file(file_path)

# Replace 'folder_path' with the path to your folder containing .arff files
folder_path = ''
process_folder(folder_path)
print("done")


100% [..............................................................................] 14406 / 14406C:\data\self\phd\new_laptop\v4\v1-ESWA\streams-arff-to-missing-realworld\noaa_temp.csv
(5698, 8)
(5698, 8)
(5698, 8)
(5698, 8)
(5698, 8)
(5698, 8)
done
