# Makes a big database for testing logistic regression fitting on the GPU

In [1]:
import os
import pandas as pd
import numpy as np
import scipy.stats as ss
from tqdm import tqdm

In [2]:
# let's make a 20 GB table, let's have 5 variables
# make some random number generators for each variable
x0_0 = ss.norm(loc=1.0, scale=0.1)
x0_1 = ss.norm(loc=0.5, scale=0.2)

x1_0 = ss.norm(loc=2.5, scale=1.5)
x1_1 = ss.norm(loc=-3.0, scale=1.5)

x2_0 = ss.norm(loc=10.0, scale=2.0)
x2_1 = ss.norm(loc=11.0, scale=2.1)

x3_0 = ss.norm(loc=4.5, scale=0.1)
x3_1 = ss.norm(loc=6.7, scale=0.2)

x4_0 = ss.norm(loc=-10, scale=3)
x4_1 = ss.norm(loc=-6, scale=2)

In [3]:
n_bytes_in_kilo = 1024
n_bytes_in_mega = n_bytes_in_kilo * 1024
n_bytes_in_giga = n_bytes_in_mega * 1024

n_bytes_total = 10.0 * n_bytes_in_giga

In [4]:
n_bytes_in_float = 4
n_bytes_per_row = 4 * 6
n_rows_in_database = int(n_bytes_total / n_bytes_per_row)

In [5]:
n_rows_in_database

447392426

In [6]:
# let's write to disk in sets of hundred thousand
n_rows_temp = int(1e5)
n_bytes_temp = n_rows_temp * n_bytes_per_row
n_bytes_temp

2400000

In [26]:
%%time
n_bytes_written = 0
n_gigs_written = n_bytes_written / n_bytes_in_giga
n_gigs_total = n_bytes_total / n_bytes_in_giga

df_file = os.path.join('..', 'data', 'test_db.hdf5')
if os.path.isfile(df_file):
    os.remove(df_file)

    
# bar = tqdm(total=n_gigs_total, desc='generating data and writing to file: {f}'.format(f=df_file))

count = 0
while n_gigs_written < n_gigs_total:
    # generate data
    length = int(1e5)/2
    start_idx = count*int(1e5)
    half_point = (int(1e5) / 2) + count*int(1e5) - 1
    end_idx = (count+1) * int(1e5)
    temp_df = pd.DataFrame(index=range(start_idx, end_idx), 
                           columns=['x0', 'x1', 'x2', 'x3', 'x4', 'y'], 
                           dtype=np.float32)
    
    temp_df.loc[start_idx:half_point, 'x0'] = x0_0.rvs(size=(length, )).astype(np.float32)
    temp_df.loc[half_point+1:end_idx, 'x0'] = x0_1.rvs(size=(length, )).astype(np.float32)
    
    temp_df.loc[start_idx:half_point, 'x1'] = x1_0.rvs(size=(length, )).astype(np.float32)
    temp_df.loc[half_point+1:end_idx, 'x1'] = x1_1.rvs(size=(length, )).astype(np.float32)
    
    temp_df.loc[start_idx:half_point, 'x2'] = x2_0.rvs(size=(length, )).astype(np.float32)
    temp_df.loc[half_point+1:end_idx, 'x2'] = x2_1.rvs(size=(length, )).astype(np.float32)
    
    temp_df.loc[start_idx:half_point, 'x3'] = x3_0.rvs(size=(length, )).astype(np.float32)
    temp_df.loc[half_point+1:end_idx, 'x3'] = x3_1.rvs(size=(length, )).astype(np.float32)
    
    temp_df.loc[start_idx:half_point, 'x4'] = x4_0.rvs(size=(length, )).astype(np.float32)
    temp_df.loc[half_point+1:end_idx, 'x4'] = x4_1.rvs(size=(length, )).astype(np.float32)
    
    temp_df.loc[start_idx:half_point, 'y'] = np.zeros(shape=(length, ), dtype=np.float32)
    temp_df.loc[half_point+1:end_idx, 'y'] = np.ones(shape=(length, ), dtype=np.float32)
    count += 1
    # shuffle
    temp_index_shuffled = np.random.permutation(temp_df.index)
    
    temp_df = temp_df.loc[temp_index_shuffled, :]
    
    # write it to the hdf file
    
    temp_df.to_hdf(df_file, 'table', mode='a', append=True)
    n_bytes_written += temp_df.values.nbytes
    n_gigs_written = n_bytes_written / n_bytes_in_giga
#     bar.update(n_gigs_written)
    if (n_gigs_written % 1.0) < 0.01:
        print(n_gigs_written)
    
    if n_bytes_written >= n_bytes_total:
        break
    
# bar.close()
    

  return self._random_state.standard_normal(self._size)
  a = empty(shape, dtype, order)


0.0022351741790771484
0.004470348358154297
0.006705522537231445
0.008940696716308594
1.0013580322265625
1.0035932064056396
1.0058283805847168
1.008063554763794
2.000480890274048
2.002716064453125
2.004951238632202
2.0071864128112793
2.0094215869903564
3.0018389225006104
3.0040740966796875
3.0063092708587646
3.008544445037842
4.000961780548096
4.003196954727173
4.00543212890625
4.007667303085327
4.009902477264404
5.000084638595581
5.002319812774658
5.004554986953735
5.0067901611328125
5.00902533531189
6.0014426708221436
6.003677845001221
6.005913019180298
6.008148193359375
7.000565528869629
7.002800703048706
7.005035877227783
7.00727105140686
7.0095062255859375
8.001923561096191
8.004158735275269
8.006393909454346
8.008629083633423
9.001046419143677
9.003281593322754
9.005516767501831
9.007751941680908
9.009987115859985
10.000169277191162
CPU times: user 6min 25s, sys: 38.3 s, total: 7min 3s
Wall time: 7min 11s


In [27]:
bar.close()

NameError: name 'bar' is not defined

In [None]:
n_gigs_written

In [None]:
n_gigs_total

In [None]:
np.random.normal()