In [3]:
import numpy as np
from logistic_regression.GrdDscntQuant import grdescentquant
from logistic_regression.normal_logistic import normallogistic
from logistic_regression.GrdDscnt import grdescentnormal
from logistic_regression.quant_logistic import quant_logistic
from logistic_regression.GrdDscntUncoded import grdescentuncoded
from logistic_regression.uncoded_logistic import  uncoded_logistic
from quantization.quantize import quantize
from coded_computation.master import master
import matplotlib.pyplot as plt
import pandas as pd
from scipy.io import arff
from pad_and_clean import pad
from pad_and_clean import clean_and_scale
import time
from gen_data import gen_data, gen_nonlinear_data
from sklearn.model_selection import train_test_split

def get_loss(w,X,y):
    """Return the 0-1 misclassification rate of the linear classifier ``w``.

    Parameters
    ----------
    w : numpy.ndarray
        Weight vector; ``X @ w`` must be a valid matrix product.
    X : numpy.ndarray
        Feature matrix with one sample per row.
    y : array-like
        True labels. Predictions are emitted as ``1`` / ``-1``, so labels are
        expected to use the same coding.

    Returns
    -------
    float
        Fraction of samples whose predicted label differs from ``y``.
    """
    log_odds = X @ w

    # sigmoid(t) > 0.5 holds exactly when t > 0, so threshold the log-odds
    # directly. Evaluating np.exp(-t) overflows (RuntimeWarning) for large
    # negative t and is unnecessary for a hard decision.
    preds = np.where(log_odds > 0, 1, -1)

    # Align label shape with the predictions when they hold the same number of
    # entries: comparing (n, 1) predictions against (n,) labels would otherwise
    # broadcast to an (n, n) matrix and silently report a wrong error rate.
    y = np.asarray(y)
    if y.size == preds.size:
        y = y.reshape(preds.shape)

    test_loss = np.mean(preds != y)

    return test_loss
def plot_3d_bar(data, z='z'):
    """Render a 2-D array as a 3-D bar chart with one unit-square bar per cell.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of bar heights. Column indices are plotted along the x axis
        and row indices along the y axis.
    z : str, optional
        Text used for the z-axis label. Defaults to ``'z'``.

    Notes
    -----
    NOTE(review): ``run`` fills its grids as ``[w_lvl, grd_lvl]`` (rows are
    w levels). If those grids are passed here untransposed, the x axis
    (columns) actually carries grd levels while being labelled 'w_lvl' —
    confirm against the call sites.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Integer (column, row) coordinates of every cell, flattened row by row.
    col_idx, row_idx = np.meshgrid(np.arange(data.shape[1]), np.arange(data.shape[0]))
    x, y = col_idx.ravel(), row_idx.ravel()

    # Bars start at height 0. This is kept in its own variable: rebinding `z`
    # here would discard the caller's axis label and hand an array of zeros
    # to set_zlabel below.
    base = np.zeros_like(x)
    heights = data.ravel()

    # Plot 3D bars
    ax.bar3d(x, y, base, 1, 1, heights, shade=True)

    # Labels
    ax.set_xlabel('w_lvl')
    ax.set_ylabel('grd_lvl')
    ax.set_zlabel(z)

    plt.show()

def run(X, y, filename, repetitions=10, w_levels=range(4, 9), grd_levels=range(2, 6)):
    """Benchmark coded/quantized logistic regression against an uncoded variant.

    The data are split 80/20 into train and test parts, both parts are passed
    through ``pad(..., 7)``, an unquantized baseline is trained once, and then
    every ``(w_lvl, grd_lvl)`` combination is trained ``repetitions`` times with
    ``grdescentquant`` (coded master) and ``grdescentuncoded`` (uncoded master).
    All metrics are averaged over the repetitions and printed.

    Parameters
    ----------
    X, y : numpy.ndarray
        Features and labels; forwarded to ``train_test_split``.
    filename : str
        Forwarded unchanged to ``grdescentquant``.
        # presumably a CSV path for per-query measurements — confirm in GrdDscntQuant
    repetitions : int, optional
        Number of times the whole grid is re-run before averaging (default 10).
    w_levels : iterable of int, optional
        Values passed as ``w_lvl`` to ``quantize`` and the descent routines
        (default ``range(4, 9)``); one grid row per value.
    grd_levels : iterable of int, optional
        Values passed as ``grd_lvl`` to the descent routines
        (default ``range(2, 6)``); one grid column per value.

    Returns
    -------
    tuple of numpy.ndarray
        ``(test_loss, loss_grid, iters_grid, times_grid, test_loss_uncoded,
        uncoded_times)``, each of shape ``(len(w_levels), len(grd_levels))``
        and averaged over ``repetitions``.

    Raises
    ------
    ValueError
        If ``repetitions`` is smaller than 1 (the averages would be undefined).

    Notes
    -----
    NOTE(review): losses are evaluated on the arrays returned by ``pad``; whether
    padded rows influence the reported error depends on ``pad`` — confirm.
    """
    if repetitions < 1:
        raise ValueError("repetitions must be at least 1")
    w_levels = list(w_levels)
    grd_levels = list(grd_levels)

    func = quant_logistic
    # Eight +/-1 rows of length 7, transposed to shape (7, 8); handed to the
    # coded master below.
    G = np.array([
        [1, 1, 1, 1, 1, 1, 1],
        [-1, -1, -1, 1, 1, 1, 1],
        [-1, 1, 1, -1, -1, 1, 1],
        [1, -1, -1, -1, -1, 1, 1],
        [1, -1, 1, -1, 1, -1, 1],
        [-1, 1, -1, -1, 1, -1, 1],
        [-1, -1, 1, 1, -1, -1, 1],
        [1, 1, -1, 1, -1, -1, 1]
    ]).T

    stepsize = 0.1
    maxiter = 10000
    tolerance = 1e-02

    X, Xt, y, yt = train_test_split(X, y, test_size=0.2)
    X, y = pad(X, y, 7)
    Xt, yt = pad(Xt, yt, 7)

    Master_uncoded = master(X, None, 21)
    Master = master(X, G, 3)

    # One accumulator per metric, indexed [w level position, grd level position].
    grid_shape = (len(w_levels), len(grd_levels))
    times_grid = np.zeros(grid_shape)
    loss_grid = np.zeros(grid_shape)
    iters_grid = np.zeros(grid_shape)
    test_loss = np.zeros(grid_shape)
    test_loss_uncoded = np.zeros(grid_shape)
    uncoded_times = np.zeros(grid_shape)

    # Baseline: unquantized logistic regression.
    w0 = np.random.uniform(-1, 1, (X.shape[1], 1))
    start_time = time.perf_counter()
    w, num_iters = grdescentnormal(normallogistic, w0, stepsize, maxiter, Master_uncoded, y, X, tolerance=tolerance)
    # Stop the clock before scoring so the reported time covers training only.
    elapsed = time.perf_counter() - start_time
    normal_loss = get_loss(w, Xt, yt)
    print(f"loss from unquantized logistic regresison: {normal_loss} on {num_iters} iterations in {elapsed} seconds")

    for _ in range(repetitions):
        for row, w_lvl in enumerate(w_levels):
            for col, grd_lvl in enumerate(grd_levels):
                # Fresh random start, quantized to the current weight level;
                # the same start is reused for the coded and uncoded runs.
                w0 = np.random.uniform(-1, 1, (X.shape[1], 1))
                w0 = quantize(w0, w_lvl, "unif")

                start_time = time.perf_counter()
                w, num_iters = grdescentquant(func, w0, stepsize, maxiter, Master, w_lvl, grd_lvl, X, y, filename, tolerance)
                times_grid[row, col] += time.perf_counter() - start_time

                loss_grid[row, col] += get_loss(w, X, y)
                iters_grid[row, col] += num_iters
                test_loss[row, col] += get_loss(w, Xt, yt)

                start_time = time.perf_counter()
                w, num_iters = grdescentuncoded(uncoded_logistic, w0, stepsize, maxiter, Master_uncoded, w_lvl, grd_lvl, X, y, tolerance=tolerance)
                uncoded_times[row, col] += time.perf_counter() - start_time
                test_loss_uncoded[row, col] += get_loss(w, Xt, yt)

    # Turn the accumulated sums into per-repetition averages.
    test_loss = test_loss / repetitions
    loss_grid = loss_grid / repetitions
    iters_grid = iters_grid / repetitions
    times_grid = times_grid / repetitions
    test_loss_uncoded = test_loss_uncoded / repetitions
    uncoded_times = uncoded_times / repetitions

    print(f" training loss: \n{loss_grid}")
    print(f" avg iterations: \n{iters_grid}")
    print(f" test loss: \n{test_loss}")
    print(f"run time: \n{times_grid}")
    print(f" test loss uncoded: \n{test_loss_uncoded}")
    print(f" times uncoded: \n{uncoded_times}")
    return test_loss, loss_grid, iters_grid, times_grid, test_loss_uncoded, uncoded_times

(500, 45) (500, 1)
(400, 45) (400, 1)
(100, 45) (100, 1)


In [2]:
# Experiment 1: benchmark on the dataset produced by gen_nonlinear_data.
# The meaning of the arguments (500, 40, 1) is defined in gen_data
# (presumably sample count and feature count first — confirm there).
X, y = gen_nonlinear_data(500, 40, 1)

# run() returns six grids averaged over its repetitions; "nonlinear_data.csv"
# is forwarded as `filename` to grdescentquant.
test_loss, loss_grid, iters_grid, times_grid, test_loss_uncoded, uncoded_times = run(X,y, "nonlinear_data.csv")

loss from unquantized logistic regresison: 0.34285714285714286 on 451 iterations in 0.08730387687683105 seconds


  func = lambda x: 1 / (1 + np.exp(x))
  func = lambda x: 1 / (1 + np.exp(x))
  func = lambda x: 1 / (1 + np.exp(x))
  probs = 1 / (1 + np.exp(-log_odds))
  func = lambda x: 1 / (1 + np.exp(x))


 training loss: 
[[0.50172414 0.48916256 0.49162562 0.49975369]
 [0.48029557 0.48522167 0.48916256 0.48423645]
 [0.45073892 0.46280788 0.47044335 0.47389163]
 [0.41428571 0.41600985 0.46847291 0.44408867]
 [0.40369458 0.42389163 0.42463054 0.41945813]]
 avg iterations: 
[[153.2 169.7 158.9 156.5]
 [164.8 183.8 171.  166.8]
 [254.3 217.7 218.1 222. ]
 [319.2 317.4 268.7 310.1]
 [396.7 324.9 304.4 322. ]]
 test loss: 
[[0.5247619  0.52285714 0.53333333 0.52      ]
 [0.51714286 0.51333333 0.50571429 0.53047619]
 [0.52190476 0.51904762 0.51809524 0.52857143]
 [0.53809524 0.54380952 0.52857143 0.53142857]
 [0.52666667 0.53333333 0.5352381  0.52      ]]
run time: 
[[1.86738126 2.81303153 3.20039506 3.76434965]
 [2.13581932 3.05183694 3.43894372 3.93574274]
 [3.36313109 3.6574528  4.55741909 5.39001472]
 [4.59711294 5.74423852 5.80118799 8.08476105]
 [6.96178589 6.41896791 7.08599794 9.08447092]]
 test loss uncoded: 
[[0.05333333 0.05238095 0.05333333 0.05238095]
 [0.05333333 0.05142857 0.050

In [3]:
# Experiment 2: same benchmark on the dataset produced by gen_data.
# Results are stored under names suffixed with `1` so they do not overwrite
# the grids from the nonlinear experiment.
X, y = gen_data(500, 40, 1)
test_loss1, loss_grid1, iters_grid1, times_grid1, test_loss_uncoded1, uncoded_times1 = run(X,y, "linear_data.csv")

loss from unquantized logistic regresison: 0.09523809523809523 on 277 iterations in 0.06266069412231445 seconds
 training loss: 
[[0.39064039 0.43423645 0.37463054 0.43325123]
 [0.27660099 0.30320197 0.31034483 0.35640394]
 [0.20073892 0.20862069 0.28029557 0.2453202 ]
 [0.14827586 0.13300493 0.16477833 0.15615764]
 [0.11847291 0.09605911 0.14901478 0.16206897]]
 avg iterations: 
[[178.9 180.5 161.8 161. ]
 [187.9 186.  172.7 181.2]
 [183.1 199.5 189.9 184.3]
 [208.6 216.5 222.7 221. ]
 [212.2 245.7 230.9 240.7]]
 test loss: 
[[0.36095238 0.46       0.34952381 0.42857143]
 [0.3447619  0.30952381 0.31619048 0.39714286]
 [0.24285714 0.22761905 0.31428571 0.26666667]
 [0.20190476 0.17904762 0.21619048 0.19904762]
 [0.18380952 0.16095238 0.19809524 0.21142857]]
run time: 
[[2.01193604 2.64873683 2.98601706 3.55165174]
 [2.20239587 2.87648902 3.23814838 4.04705091]
 [2.40651972 3.21706297 3.68352432 4.32911398]
 [2.8330157  3.68087466 4.60713906 5.31738663]
 [3.23016615 4.54968331 5.1614278

In [None]:

# Experiment 3: benchmark on the speed-dating ARFF file, with "match" passed
# to clean_and_scale (presumably the target column — confirm in pad_and_clean).
# NOTE(review): the path below is an absolute, machine-specific location; the
# cell will not run elsewhere without editing it.
data = arff.loadarff('/Users/willem/Downloads/speeddating.arff')
df = pd.DataFrame(data[0])

# clean_and_scale returns train/test features and labels in this order.
# NOTE(review): the `hill_` prefix does not match the speed-dating file —
# presumably left over from an earlier dataset.
hill_train_x, hill_test_x, hill_train_y, hill_test_y = clean_and_scale(df, "match")
hill_train_x, hill_train_y = pad(hill_train_x, hill_train_y, 7)
hill_test_x, hill_test_y = pad(hill_test_x, hill_test_y, 7)

print(hill_train_x.shape, hill_train_y.shape, hill_test_x.shape, hill_test_y.shape)

# The two parts are stacked back together because run() performs its own
# train/test split.
# NOTE(review): run() also calls pad() after splitting, so rows added by the
# pad() calls above are re-shuffled into the data as ordinary samples —
# confirm this is intended.
X = np.vstack((hill_test_x,hill_train_x))
y = np.vstack((hill_test_y, hill_train_y))
test_loss_real, loss_grid_real, iters_grid_real, times_grid_real, test_loss_uncoded_real, uncoded_times_real  = run(X,y, "real_data.csv")


(840, 238) (840, 1) (210, 238) (210, 1)
loss from unquantized logistic regresison: 0.3952380952380952 on 233 iterations in 0.20769214630126953 seconds


In [4]:
# Experiment 4: benchmark on the dataset produced by gen_seperable_data.
# NOTE(review): this import sits mid-notebook; the other gen_data helpers are
# imported in the first cell.
from gen_data import gen_seperable_data
# The third return value `w` is not used in this cell.
X, y, w = gen_seperable_data(500, 40)

# NOTE(review): the recorded output of this cell ends in
# "ValueError: did not return arithmetic sequence", so the six result names
# below were not assigned by that execution.
test_loss, loss_grid, iters_grid, times_grid, test_loss_uncoded, uncoded_times = run(X,y, "seperable_data.csv")

loss from unquantized logistic regresison: 0.047619047619047616 on 94 iterations in 0.027769088745117188 seconds


  func = lambda x: 1 / (1 + np.exp(x))
  func = lambda x: 1 / (1 + np.exp(x))
  func = lambda x: 1 / (1 + np.exp(x))


ValueError:  did not return arithmetic sequence: [-0.08605444406989449, -0.06381547465695116, -0.04157650524400783, -0.019337535831064498, 0.0029014335818787276, 0.02514040299482216, 0.05849885711423698, 0.08073782652718031, 0.09185731123365215, 0.1140962806465955, 0.1363352500595388, 0.15857421947248213, 0.18081318888542547, 0.2030521582983688, 0.2252911277113121, 0.24753009712425544]

In [None]:
def plot_3d_bar(data):
    """Show a 2-D array as a 3-D bar chart, one unit-square bar per cell.

    Column indices of ``data`` run along the x axis (labelled 'w_lvl'), row
    indices along the y axis (labelled 'grd_lvl'); bar heights are the cell
    values and the z axis is labelled 'z'.

    NOTE(review): this redefines the two-argument ``plot_3d_bar(data, z)``
    from the first cell; whichever cell ran last wins.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111, projection='3d')

    # Row/column index grids shaped like `data`, flattened row by row.
    row_grid, col_grid = np.indices(data.shape)
    bar_x = col_grid.ravel()
    bar_y = row_grid.ravel()

    # Every bar starts at height 0 and rises to the corresponding cell value.
    bar_base = np.zeros_like(bar_x)
    bar_heights = data.ravel()

    axes.bar3d(bar_x, bar_y, bar_base, 1, 1, bar_heights, shade=True)

    axis_labels = (
        (axes.set_xlabel, 'w_lvl'),
        (axes.set_ylabel, 'grd_lvl'),
        (axes.set_zlabel, 'z'),
    )
    for set_label, text in axis_labels:
        set_label(text)

    plt.show()

In [None]:
def plot_time_per_query(df):
    """Plot the mean 'time' per (w-quantization, grd-quantization) pair as 3-D bars.

    Only the numeric columns of ``df`` are kept, rows are grouped by the two
    quantization columns, and the group mean of 'time' becomes the bar height.
    Both grouping columns therefore have to be numeric, otherwise they are
    dropped before the groupby.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'w-quantization', 'grd-quantization' and 'time'.
    """
    import numpy as np

    group_keys = ['w-quantization', 'grd-quantization']
    # Average every numeric column per setting, then turn the group keys back
    # into ordinary columns so they can be used as bar coordinates.
    per_setting = (
        df.select_dtypes(include=[np.number])
        .groupby(group_keys)
        .mean()
        .reset_index()
    )

    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D  # imported as in the original; the name itself is not referenced

    axes = plt.figure().add_subplot(111, projection='3d')

    mean_time = per_setting['time']
    bar_width = 0.5

    # Bars are anchored at the (w, grd) setting, start at height 0 and rise to
    # the mean time.
    axes.bar3d(
        per_setting['w-quantization'],
        per_setting['grd-quantization'],
        [0] * len(mean_time),
        bar_width,
        bar_width,
        mean_time,
        color='b',
    )

    axes.set_xlabel('w-quantization')
    axes.set_ylabel('grd-quantization')
    axes.set_zlabel('Average Time Per Query')

    plt.show()

In [None]:
# Load the measurement log. The CSV is read with header=None, so the file is
# expected to have no header row; the names below are assigned here.
file_path = 'access_measurements.csv'

# 'w-quantization', 'grd-quantization' and 'time' are the columns that
# plot_time_per_query reads.
column_names = [
    'w-quantization', 'grd-quantization', 'imputation', 'access',
    'query type', 'time', 'stop cond', 'iters'
]

# NOTE(review): the frame is named df_nonlinear although the file name does
# not mention the nonlinear experiment — confirm which run produced it.
df_nonlinear = pd.read_csv(file_path, header=None, names=column_names)

print(df_nonlinear.head())

In [7]:
# Values copied from the "did not return arithmetic sequence" ValueError
# recorded for the separable-data run.
bad_w = [
    -0.08605444406989449,
    -0.06381547465695116,
    -0.04157650524400783,
    -0.019337535831064498,
    0.0029014335818787276,
    0.02514040299482216,
    0.05849885711423698,
    0.08073782652718031,
    0.09185731123365215,
    0.1140962806465955,
    0.1363352500595388,
    0.15857421947248213,
    0.18081318888542547,
    0.2030521582983688,
    0.2252911277113121,
    0.24753009712425544,
]

# Print the gap between each pair of neighbouring values; an arithmetic
# sequence would show the same gap everywhere.
for previous, current in zip(bad_w, bad_w[1:]):
    print(f"\n {current - previous}")






 0.02223896941294333

 0.02223896941294333

 0.02223896941294333

 0.022238969412943226

 0.022238969412943434

 0.03335845411941482

 0.02223896941294333

 0.011119484706471838

 0.022238969412943344

 0.022238969412943316

 0.022238969412943316

 0.022238969412943344

 0.022238969412943344

 0.022238969412943288

 0.022238969412943344
