# Debug various functions used the variance.ipynb, baseline_quant.ipynb, masking.ipynb, and sanity_checks.ipynb

In [93]:
## import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
import wandb
import statistics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import random
from scipy.stats import sem

## Gradient

In [94]:
data = torch.from_numpy(np.array([[0, 0, 1, 0]])).float()

In [95]:
class NN(nn.Module):
    def __init__(self):
        super().__init__()
        input_dim = 4
        n_unit = 3
        output_dim = 4
        self.model = nn.Sequential(
            nn.Linear(input_dim, n_unit),
            nn.Sigmoid(),
            nn.Linear(n_unit, output_dim)
        )
    
    def forward(self, x):
        return self.model(x)

In [96]:
model = NN()

In [97]:
data[0].shape

torch.Size([4])

In [98]:
output = model(data[0])

In [99]:
target = copy.deepcopy(data[0])
# target = torch.from_numpy(np.hstack((output.detach().numpy()[:2], target.detach().numpy()[2:]))).float()

In [100]:
# take in two torch tensors, return MSE for non-missing values only
class custom_MSE(nn.Module):
    def __init__(self) -> None:
        super(custom_MSE, self).__init__()
    
    def missing(self, arr : np.array):
        assert(arr.shape == (2,))
        x1 = arr[0]
        x2 = arr[1]
        return x1 == x2 and x1 != 0.5
    
    def forward(self, y_pred : torch.Tensor, target : torch.Tensor):
        assert(target.shape == torch.Size([4])) ## hardcoded for now
        r = target.shape[0]
        y_pred_vals = y_pred.detach().numpy() ## different name to keep gradient
        target_vals = target.detach().numpy()
        new_target = np.zeros((r,))
        missing_count = 0

        for i in range(0, r, 2):
            pair_target = target_vals[i:i+2]
            pair_y_pred = y_pred_vals[i:i+2]
            if self.missing(pair_target):
                missing_count += 1
                new_target[i:i+2] = pair_y_pred
            else:
                new_target[i:i+2] = pair_target
        
        new_target = torch.from_numpy(new_target).float()

        loss = torch.divide(torch.sum(torch.pow(torch.subtract(y_pred, new_target), 2)), missing_count)
        print(loss)
        return loss

In [101]:
loss_fn = custom_MSE()

In [102]:
output, target

(tensor([ 0.1954,  0.1833, -0.2231,  0.7691], grad_fn=<ViewBackward0>),
 tensor([0., 0., 1., 0.]))

In [103]:
loss = loss_fn(output, target)

tensor(2.0876, grad_fn=<DivBackward0>)


In [104]:
loss

tensor(2.0876, grad_fn=<DivBackward0>)

In [105]:
optimizer = torch.optim.Adam(model.parameters())

In [106]:
optimizer.zero_grad()

In [107]:
loss.backward()

In [108]:
optimizer.step()

In [109]:
for param in model.parameters():
    print(param.grad, param.grad.shape)

tensor([[ 0.0000,  0.0000,  0.0763,  0.0000],
        [ 0.0000,  0.0000,  0.0114,  0.0000],
        [ 0.0000,  0.0000, -0.0746,  0.0000]]) torch.Size([3, 4])
tensor([ 0.0763,  0.0114, -0.0746]) torch.Size([3])
tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000],
        [-1.1711, -1.5150, -1.3266],
        [ 0.7364,  0.9526,  0.8342]]) torch.Size([4, 3])
tensor([ 0.0000,  0.0000, -2.4463,  1.5382]) torch.Size([4])


In [110]:
final = model(data[0])

In [111]:
final

tensor([ 0.1955,  0.1836, -0.2205,  0.7663], grad_fn=<ViewBackward0>)

Bad pipe message: %s [b'\xe7\x9d\x8bQR5^Z\x80a\xcc\xda\xd7\x875i\x00\xbe\x00\x01|\x00\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x00\x06\x00\x07\x00\x08\x00\t\x00\n\x00\x0b\x00\x0c\x00\r\x00\x0e\x00\x0f\x00\x10\x00\x11\x00\x12\x00\x13\x00\x14\x00\x15\x00\x16\x00\x17\x00\x18\x00\x19\x00\x1a\x00\x1b\x00/\x000\x001\x002\x003\x004\x005\x006\x007\x008\x009\x00:\x00;\x00<\x00=\x00>\x00?\x00@\x00A\x00B\x00C\x00D\x00E\x00F\x00g\x00h\x00i\x00j\x00k\x00l\x00m\x00\x84\x00\x85\x00\x86\x00\x87\x00\x88\x00\x89\x00\x96\x00\x97\x00\x98\x00\x99\x00\x9a\x00\x9b\x00\x9c\x00\x9d\x00\x9e\x00\x9f\x00\xa0\x00\xa1\x00\xa2\x00\xa3\x00\xa4\x00\xa5\x00\xa6\x00\xa7\x00\xba\x00\xbb\x00\xbc\x00\xbd\x00\xbe\x00', b"\xc0\x00\xc1\x00\xc2\x00\xc3\x00\xc4\x00\xc5\x13\x01\x13\x02\x13\x03\x13\x04\x13\x05\xc0\x01\xc0\x02\xc0\x03\xc0\x04\xc0\x05\xc0\x06\xc0\x07\xc0\x08\xc0\t\xc0\n\xc0\x0b\xc0\x0c\xc0\r\xc0\x0e\xc0\x0f\xc0\x10\xc0\x11\xc0\x12\xc0\x13\xc0\x14\xc0\x15\xc0\x16\xc0\x17\xc0\x18\xc0\x19\xc0#\xc0$\xc0%\xc0&\xc0'\x

## Imputation
- imputation with data average function
- imputation with known domain average

In [54]:
## create dummy data so we can control everything

# 3 x 14
data_ = [[4, 6, 2, 6, 5, 2, 6, 7, 8, 2, 5, 4, 3, 1],
         [4, 5, 2, 7, 4, 7, 2, 9, 6, 2, 1, 4, 6, 8],
         [1, 3, 7, 2, 7, 9, 2, 7, 3, 5, 7, 1 ,4 ,7]]
data_narray = np.array(data_)
# data_narray = np.random.rand(10, 14)
data_narray

array([[4, 6, 2, 6, 5, 2, 6, 7, 8, 2, 5, 4, 3, 1],
       [4, 5, 2, 7, 4, 7, 2, 9, 6, 2, 1, 4, 6, 8],
       [1, 3, 7, 2, 7, 9, 2, 7, 3, 5, 7, 1, 4, 7]])

the functions

In [55]:
## data average
def impute_with_column_mean(data : pd.DataFrame):
    """
    Imputes missing values (represented as 0) in a DataFrame with the mean of the non-zero values in the same column.

    Parameters:
    - data (pd.DataFrame): The DataFrame to impute.

    Returns:
    - pd.DataFrame: The DataFrame with missing values imputed using column means.
    """
    # Replace '0' with NaN to handle them as missing values
    imputed_data = data.replace(0, np.nan)
    
    # Calculate the mean of each column, ignoring NaNs
    column_means = imputed_data.mean()
    # replace the nans with 0 since that means there's no data at all for that column
    column_means = column_means.replace(np.nan, 0)

    
    # Replace NaN values in each column with the mean of that column
    imputed_data.fillna(column_means, inplace=True)

    return imputed_data

In [56]:
## known domain average
def impute_with_row_average(data : pd.DataFrame):
    """
    Imputes missing values (NaN) in a DataFrame with the average of the non-missing values in the same row.

    Parameters:
    - data (pd.DataFrame): The DataFrame to impute.

    Returns:
    - pd.DataFrame: The DataFrame with missing values imputed.
    """
    # Create a copy of the DataFrame to avoid modifying the original data
    imputed_data = data.copy()
    # Replace '0' with NaN to handle them as missing values
    imputed_data = data.replace(0, np.nan)

    # Iterate over each row
    for index, row in imputed_data.iterrows():
        # Calculate the mean of the non-NaN values in the row
        mean_value = row.mean()
        if np.isnan(mean_value):
            mean_value = 0

        # Replace NaN values in the row with the calculated mean
        imputed_data.loc[index] = row.fillna(mean_value)

    return imputed_data

Mask

In [57]:
def generate_random_array(rows, cols, N):
    if N > cols:
        raise ValueError("N cannot be greater than the number of columns.")
    
    # Create an array of ones with the desired shape
    array = np.ones((rows, cols), dtype=int)
    
    # Randomly assign N zeros to each row
    for i in range(rows):
        indices = np.random.choice(cols, N, replace=False)
        array[i, indices] = 0
    
    return array, array.shape[0] * N

def generate_mask(data, N=4):
    rows, cols = data.shape
    assert(cols == 14)
    mask, n_zeros = generate_random_array(rows, cols, N)
    return data * mask, n_zeros, mask

In [58]:
data_masked_narray, n_zeros, mask = generate_mask(data_narray)

In [59]:
data_masked_narray

array([[4, 6, 2, 6, 5, 2, 0, 7, 8, 0, 0, 0, 3, 1],
       [0, 0, 2, 7, 0, 7, 2, 9, 6, 2, 1, 0, 6, 8],
       [0, 3, 0, 2, 7, 9, 2, 0, 3, 5, 7, 1, 4, 0]])

In [60]:
data_narray

array([[4, 6, 2, 6, 5, 2, 6, 7, 8, 2, 5, 4, 3, 1],
       [4, 5, 2, 7, 4, 7, 2, 9, 6, 2, 1, 4, 6, 8],
       [1, 3, 7, 2, 7, 9, 2, 7, 3, 5, 7, 1, 4, 7]])

In [61]:
mask

array([[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1],
       [0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0]])

In [62]:
mask ^ 1

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0],
       [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]])

Debugging code for imputing with data average

In [63]:
# mean of each column (data average)
data_masked_narray

array([[4, 6, 2, 6, 5, 2, 0, 7, 8, 0, 0, 0, 3, 1],
       [0, 0, 2, 7, 0, 7, 2, 9, 6, 2, 1, 0, 6, 8],
       [0, 3, 0, 2, 7, 9, 2, 0, 3, 5, 7, 1, 4, 0]])

In [64]:
data_average_impute = impute_with_column_mean(pd.DataFrame(data_masked_narray))

'''
notes (9.23.24)
- yep, looks weird, not imputing by average and seeing some nan values 
'''

'\nnotes (9.23.24)\n- yep, looks weird, not imputing by average and seeing some nan values \n'

In [65]:
data_masked_narray

array([[4, 6, 2, 6, 5, 2, 0, 7, 8, 0, 0, 0, 3, 1],
       [0, 0, 2, 7, 0, 7, 2, 9, 6, 2, 1, 0, 6, 8],
       [0, 3, 0, 2, 7, 9, 2, 0, 3, 5, 7, 1, 4, 0]])

In [66]:
### take out code in function to test
data = pd.DataFrame(data_masked_narray)

imputed_data = data.replace(0, np.nan)

column_means = imputed_data.mean()
column_means = column_means.replace(np.nan, 0)

imputed_data.fillna(column_means, inplace=True)
imputed_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,4.0,6.0,2.0,6,5.0,2,2.0,7.0,8,3.5,4.0,1.0,3,1.0
1,4.0,4.5,2.0,7,6.0,7,2.0,9.0,6,2.0,1.0,1.0,6,8.0
2,4.0,3.0,2.0,2,7.0,9,2.0,8.0,3,5.0,7.0,1.0,4,4.5


In [67]:
data_average_impute

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,4.0,6.0,2.0,6,5.0,2,2.0,7.0,8,3.5,4.0,1.0,3,1.0
1,4.0,4.5,2.0,7,6.0,7,2.0,9.0,6,2.0,1.0,1.0,6,8.0
2,4.0,3.0,2.0,2,7.0,9,2.0,8.0,3,5.0,7.0,1.0,4,4.5


Debugging code for impute with known domain average

In [68]:
data_masked_narray

array([[4, 6, 2, 6, 5, 2, 0, 7, 8, 0, 0, 0, 3, 1],
       [0, 0, 2, 7, 0, 7, 2, 9, 6, 2, 1, 0, 6, 8],
       [0, 3, 0, 2, 7, 9, 2, 0, 3, 5, 7, 1, 4, 0]])

In [69]:
known_average_impute = impute_with_row_average(pd.DataFrame(data_masked_narray))
known_average_impute

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,4.0,6.0,2.0,6,5.0,2,4.4,7.0,8,4.4,4.4,4.4,3,1.0
1,5.0,5.0,2.0,7,5.0,7,2.0,9.0,6,2.0,1.0,5.0,6,8.0
2,4.3,3.0,4.3,2,7.0,9,2.0,4.3,3,5.0,7.0,1.0,4,4.3


In [70]:
imputed_data = data.replace(0, np.nan)
# Iterate over each row
for index, row in imputed_data.iterrows():
#     # Calculate the mean of the non-NaN values in the row
    mean_value = row.mean()
    if np.isnan(mean_value):
        mean_value = 0
    print(mean_value)
    # Replace NaN values in the row with the calculated mean
    imputed_data.loc[index] = row.fillna(mean_value)

4.4
5.0
4.3
