## LHS로 lr, lc, rc, ld, rd, lnnz, rnnz 생성하기

In [1]:
from scipy.stats.distributions import norm
from pyDOE import *
import numpy as np
import pandas as pd
import random

In [2]:
# Number of sample points to generate (4,000,000).
sample = 4 * 10**6

In [3]:
# Generate lr, lc and rc with Latin hypercube sampling (LHS).
#
# Seed NumPy's global RNG first so that a fresh "Restart & Run All" reproduces
# the same dataset. pyDOE's lhs() is expected to draw from the global np.random
# state (NOTE(review): confirm for the installed pyDOE version); later unseeded
# np.random draws in this notebook use that same state.
np.random.seed(0)

# Each lhs(1, samples=sample) call yields one column of `sample` points
# (presumably in [0, 1] -- TODO confirm), which is then scaled by the column's
# upper bound.
lr = lhs(1, samples=sample) * 150000  # presumably rows of the left matrix

lc = lhs(1, samples=sample) * 100000  # presumably cols of the left matrix

rc = lhs(1, samples=sample) * 50000   # presumably cols of the right matrix

In [4]:
# Density values that ld may take (30 candidates, laid out six per row).
ld_list = [
    0.00108175, 0.00082282, 0.00056263, 0.00034241, 0.00015297, 0.00002088,
    0.00163948, 0.00078778, 0.00041097, 0.00019487, 0.00008429, 0.00001651,
    0.02533638, 0.00952101, 0.00296184, 0.00082185, 0.00018467, 0.00000464,
    0.01084252, 0.00860544, 0.00491597, 0.00160539, 0.00047003, 0.00002483,
    0.00370564, 0.00182521, 0.00082487, 0.00031941, 0.00013363, 0.00000434,
]

In [5]:
# Density values that rd may take (16 candidates, ascending from 0.0005 to 0.3).
rd_list = [
    0.0005, 0.001, 0.005, 0.01,
    0.03, 0.05, 0.07, 0.1,
    0.13, 0.15, 0.17, 0.2,
    0.23, 0.25, 0.27, 0.3,
]

In [6]:
# Pick ld and rd uniformly at random from their candidate lists, one value per
# sample point, each as a (sample, 1) column.
ld = np.random.choice(ld_list, size=(sample, 1))
rd = np.random.choice(rd_list, size=(sample, 1))

# Two-column array: ld in column 0, rd in column 1.
ld_rd = np.hstack((ld, rd))

In [7]:
# Stack lr, lc, rc, ld, rd side by side: one row per sample point, with the
# columns in exactly that order (indices 0..4).
lr_to_rd = np.concatenate((lr, lc, rc, ld, rd), axis=1)

# lnnz = lr * lc * ld, reshaped to a single column.
lnnz = (lr_to_rd[:, 0] * lr_to_rd[:, 1] * lr_to_rd[:, 3]).reshape(-1, 1)

# rnnz = lc * rc * rd, reshaped to a single column.
rnnz = (lr_to_rd[:, 1] * lr_to_rd[:, 2] * lr_to_rd[:, 4]).reshape(-1, 1)

# Full table: lr, lc, rc, ld, rd, lnnz, rnnz.
lr_to_rnnz = np.concatenate((lr_to_rd, lnnz, rnnz), axis=1)

lr_to_rnnz_df = pd.DataFrame(
    lr_to_rnnz,
    columns=['lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz'],
)

# Convert the dimension and nnz columns to integers (fractions are truncated).
# Use an explicit 64-bit type: plain 'int' follows the platform C long on
# NumPy < 2 (32-bit on Windows), and the filtering cell later multiplies these
# columns (lc * rc up to 5e9, lr * rc up to 7.5e9), which would silently wrap
# around in 32-bit arithmetic.
lr_to_rnnz_df = lr_to_rnnz_df.astype(
    {'lr': 'int64', 'lc': 'int64', 'rc': 'int64', 'lnnz': 'int64', 'rnnz': 'int64'}
)

In [8]:
# Split the rows on rd at 0.1 (0.1 itself goes to lt) and report both sizes.
lt = lr_to_rnnz_df.loc[lr_to_rnnz_df['rd'].le(0.1)]
gt = lr_to_rnnz_df.loc[lr_to_rnnz_df['rd'].gt(0.1)]

print("lt 0.1 rd : {}".format(lt.shape[0]))
print("gt 0.1 rd : {}".format(gt.shape[0]))

lt 0.1 rd : 1998748
gt 0.1 rd : 2001252


### 데이터 전처리

In [9]:
# To display floats with 8 decimals, enable:
#   pd.options.display.float_format = '{:.8f}'.format

# Largest signed 32-bit integer; every size checked below must stay under it.
intmaxvalue = 2**31 - 1

# Left sparse matrix: set aside (temp1) rows whose lnnz reaches intmaxvalue,
# keep the rest.
temp1 = lr_to_rnnz_df.loc[lr_to_rnnz_df['lnnz'].ge(intmaxvalue)]
lr_to_rnnz_df = lr_to_rnnz_df.loc[lr_to_rnnz_df['lnnz'].lt(intmaxvalue)]

# Right dense matrix: same rule applied to lc * rc.
temp2 = lr_to_rnnz_df.loc[lr_to_rnnz_df['lc'].mul(lr_to_rnnz_df['rc']).ge(intmaxvalue)]
lr_to_rnnz_df = lr_to_rnnz_df.loc[lr_to_rnnz_df['lc'].mul(lr_to_rnnz_df['rc']).lt(intmaxvalue)]

# Result dense matrix: same rule applied to lr * rc.
temp3 = lr_to_rnnz_df.loc[lr_to_rnnz_df['lr'].mul(lr_to_rnnz_df['rc']).ge(intmaxvalue)]
lr_to_rnnz_df = lr_to_rnnz_df.loc[lr_to_rnnz_df['lr'].mul(lr_to_rnnz_df['rc']).lt(intmaxvalue)]

# (A lower-bound filter on lnnz / rnnz was tried at this point and is disabled.)

# Keep only rows where both lnnz and rnnz are below 70,000,000; temp5 holds the
# rows removed by this step (either value at or above the limit).
temp5 = lr_to_rnnz_df.loc[lr_to_rnnz_df[['lnnz', 'rnnz']].ge(70000000).any(axis=1)]
lr_to_rnnz_df = lr_to_rnnz_df.loc[lr_to_rnnz_df[['lnnz', 'rnnz']].lt(70000000).all(axis=1)]
lr_to_rnnz_df

Unnamed: 0,lr,lc,rc,ld,rd,lnnz,rnnz
0,30613,25153,14741,0.000319,0.1000,245953,37079555
2,116468,16399,6978,0.025336,0.1000,48393069,11443666
5,118761,7097,16922,0.000822,0.2700,692712,32427438
7,3427,15588,38565,0.000134,0.0005,7139,300586
17,17854,2835,49034,0.000195,0.0300,9864,4170638
...,...,...,...,...,...,...,...
3999982,148151,2769,3955,0.009521,0.0500,3906579,547728
3999985,146811,7813,10044,0.000319,0.0700,366400,5494059
3999986,27495,94978,2651,0.000342,0.0500,894206,12593068
3999992,97467,58505,5503,0.000134,0.0010,762006,321958


In [10]:
# Redo the rd split (0.1 itself goes to lt) on the filtered rows and report
# both sizes.
lt = lr_to_rnnz_df.loc[lr_to_rnnz_df['rd'].le(0.1)]
gt = lr_to_rnnz_df.loc[lr_to_rnnz_df['rd'].gt(0.1)]

print("lt 0.1 rd : {}".format(lt.shape[0]))
print("gt 0.1 rd : {}".format(gt.shape[0]))

lt 0.1 rd : 992605
gt 0.1 rd : 430062


In [11]:
# density 분포가 균등하게 만들기
lt = lt.sample(frac=0.45, random_state=1)
#lt = lt.sample(frac=1, random_state=1)
#gt = lt.sample(frac=0.7, random_state=1)
print("rd lt 0.1 : {}".format(len(lt)))
print("rd gt 0.1 : {}".format(len(gt)))

rd lt 0.1 : 446672
rd gt 0.1 : 430062


In [12]:
# 균등하게 나눈 dataframe 합치기
lr_to_rnnz_df = pd.concat([lt,gt])
print(len(lr_to_rnnz_df))

876734


In [13]:
# dataframe의 행을 무작위로 섞기
lr_to_rnnz_df = lr_to_rnnz_df.sample(frac=1).reset_index(drop=True)

In [14]:
# 남은 데이터에 D-optimal를 적용하기 위해 csv로 전환
lr_to_rnnz_df.to_csv('spmm-nonsquare-lr-150000-lc-100000-rc-50000-higdrd-v5.csv',index=False)