## LHS로 lr, lc, rc, ld, rd, lnnz, rnnz 생성하기

In [1]:
from scipy.stats.distributions import norm
from pyDOE import *
import numpy as np
import pandas as pd
import random

In [2]:
# Number of points drawn for every sampled dimension (four million).
sample = 4 * 10 ** 6

In [3]:
# Generate lr, lc and rc with lhs (Latin hypercube sampling).
# Each call draws `sample` one-dimensional points, which are then scaled
# by the factor of that dimension: 150000 for lr, 100000 for lc and
# 50000 for rc.
lr = 150000 * lhs(1, samples=sample)

lc = 100000 * lhs(1, samples=sample)

rc = 50000 * lhs(1, samples=sample)

In [4]:
# Density values that may be used as ld.
# They were obtained with lhs using mean 0.000016 and standard deviation
# 0.000022; negative values were then removed and the rest multiplied by 10.
ld_list = [
    0.00108175, 0.00082282, 0.00056263, 0.00034241, 0.00015297, 0.00002088,
    0.00163948, 0.00078778, 0.00041097, 0.00019487, 0.00008429, 0.00001651,
    0.02533638, 0.00952101, 0.00296184, 0.00082185, 0.00018467, 0.00000464,
    0.01084252, 0.00860544, 0.00491597, 0.00160539, 0.00047003, 0.00002483,
    0.00370564, 0.00182521, 0.00082487, 0.00031941, 0.00013363, 0.00000434,
]

In [5]:
# Density values that may be used as rd.
# The first row holds the eight values <= 0.1, the second row the eight
# values > 0.1.
rd_list = [
    0.0005, 0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1,
    0.13, 0.15, 0.17, 0.2, 0.23, 0.25, 0.27, 0.3,
]

In [6]:
# Draw ld and rd as (sample, 1) column vectors; every candidate value in
# ld_list / rd_list is equally likely to be picked.
ld = np.random.choice(ld_list, size=(sample, 1))
rd = np.random.choice(rd_list, size=(sample, 1))

# Put ld and rd side by side as the two columns of one array.
# NOTE(review): ld_rd is not read again in the visible part of the notebook.
ld_rd = np.concatenate([ld, rd], axis=1)

In [7]:
# Join lr, lc, rc, ld, rd column-wise.
# Resulting column order: 0=lr, 1=lc, 2=rc, 3=ld, 4=rd.
lr_to_rd = np.concatenate((lr, lc, rc, ld, rd), axis=1)

# lnnz (nnz of the left matrix) = lr * lc * ld
lnnz = (lr_to_rd[:, 0] * lr_to_rd[:, 1] * lr_to_rd[:, 3]).reshape(-1, 1)

# rnnz (nnz of the right matrix) = lc * rc * rd
rnnz = (lr_to_rd[:, 1] * lr_to_rd[:, 2] * lr_to_rd[:, 4]).reshape(-1, 1)

# Join lr, lc, rc, ld, rd, lnnz, rnnz into a single array
lr_to_rnnz = np.concatenate((lr_to_rd, lnnz, rnnz), axis=1)

# Build the DataFrame
lr_to_rnnz_df = pd.DataFrame(
    lr_to_rnnz,
    columns=['lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz'],
)

# Convert lr, lc, rc, lnnz, rnnz to integers (fractional parts are dropped).
# The width is spelled out as int64 on purpose: a plain 'int' is only 32 bits
# wide on some platforms (e.g. Windows with NumPy < 2), and later products of
# these columns such as lc * rc (up to 100000 * 50000 = 5e9) would then wrap
# around silently instead of being compared against the 32-bit limit.
lr_to_rnnz_df = lr_to_rnnz_df.astype({
    'lr': 'int64',
    'lc': 'int64',
    'rc': 'int64',
    'lnnz': 'int64',
    'rnnz': 'int64',
})

In [8]:
# Check that the rows with rd <= 0.1 and the rows with rd > 0.1 are
# roughly equal in number.
lt = lr_to_rnnz_df.loc[lr_to_rnnz_df['rd'] <= 0.1]
gt = lr_to_rnnz_df.loc[lr_to_rnnz_df['rd'] > 0.1]
print("lt 0.1 rd : {}".format(lt.shape[0]))
print("gt 0.1 rd : {}".format(gt.shape[0]))

lt 0.1 rd : 2001853
gt 0.1 rd : 1998147


### 데이터 전처리 (생성 가능한 Matrix 조합을 가지도록)

In [9]:
# Size limit applied below: the largest signed 32-bit integer (2**31 - 1).
intmaxvalue = 2147483647

# The filters are applied one after another, so each step only sees the rows
# that survived the previous one. temp1/temp2/temp3/temp5 hold the rows
# rejected by each step (presumably kept for inspection; they are not read
# again in the visible part of the notebook).

# Exclude rows where the nnz of the left sparse matrix reaches intmaxvalue
lnnz_too_large = lr_to_rnnz_df['lnnz'] >= intmaxvalue
temp1 = lr_to_rnnz_df[lnnz_too_large]
lr_to_rnnz_df = lr_to_rnnz_df[~lnnz_too_large]

# Exclude rows where the size of the right dense matrix (lc * rc) reaches
# intmaxvalue. The product is computed in int64: lc * rc can be as large as
# 100000 * 50000 = 5e9, which would wrap around if the columns were stored as
# 32-bit integers and would then slip through this filter.
right_dense_size = (
    lr_to_rnnz_df['lc'].astype('int64') * lr_to_rnnz_df['rc'].astype('int64')
)
right_too_large = right_dense_size >= intmaxvalue
temp2 = lr_to_rnnz_df[right_too_large]
lr_to_rnnz_df = lr_to_rnnz_df[~right_too_large]

# Exclude rows where the size of the result dense matrix (lr * rc) reaches
# intmaxvalue (same int64 precaution; lr * rc can be up to 7.5e9).
result_dense_size = (
    lr_to_rnnz_df['lr'].astype('int64') * lr_to_rnnz_df['rc'].astype('int64')
)
result_too_large = result_dense_size >= intmaxvalue
temp3 = lr_to_rnnz_df[result_too_large]
lr_to_rnnz_df = lr_to_rnnz_df[~result_too_large]

# Keep only rows whose lnnz and rnnz are both strictly below 70,000,000
nnz_too_large = (
    (lr_to_rnnz_df['lnnz'] >= 70000000) | (lr_to_rnnz_df['rnnz'] >= 70000000)
)
temp5 = lr_to_rnnz_df[nnz_too_large]
lr_to_rnnz_df = lr_to_rnnz_df[~nnz_too_large]
lr_to_rnnz_df

Unnamed: 0,lr,lc,rc,ld,rd,lnnz,rnnz
0,25443,48813,24347,0.000134,0.0005,165966,594228
3,139234,51094,13476,0.000185,0.0005,1313767,344287
6,107786,18744,5644,0.000342,0.0010,691801,105807
12,27324,4907,14028,0.000788,0.1300,105634,8949176
14,37560,35080,21394,0.000788,0.0005,1038011,375266
...,...,...,...,...,...,...,...
3999989,91710,30385,8397,0.001825,0.2500,5086235,63788371
3999991,47729,26240,7778,0.003706,0.2700,4641112,55111894
3999995,52414,59132,9267,0.000185,0.0010,572363,548011
3999998,109950,5228,1624,0.025336,0.2700,14565085,2293365


In [10]:
# Re-check the balance between rows with rd <= 0.1 and rows with rd > 0.1
# on the current contents of lr_to_rnnz_df.
lt = lr_to_rnnz_df.loc[lr_to_rnnz_df['rd'] <= 0.1]
gt = lr_to_rnnz_df.loc[lr_to_rnnz_df['rd'] > 0.1]

print("lt 0.1 rd : {}".format(len(lt.index)))
print("gt 0.1 rd : {}".format(len(gt.index)))

lt 0.1 rd : 995357
gt 0.1 rd : 429379


In [11]:
# Even out the rd distribution by keeping a random 45% of lt.
# NOTE(review): frac=0.45 is a fixed constant; it is close to the gt/lt
# ratio of the counts printed above (429379 / 995357 ~= 0.43), so it would
# need revisiting if those counts change.
lt = lt.sample(random_state=1, frac=0.45)
print("rd lt 0.1 : {}".format(lt.shape[0]))
print("rd gt 0.1 : {}".format(gt.shape[0]))

rd lt 0.1 : 447911
rd gt 0.1 : 429379


In [12]:
# Stack the down-sampled lt on top of gt to form one DataFrame again,
# then print the total row count.
lr_to_rnnz_df = pd.concat((lt, gt))
print(lr_to_rnnz_df.shape[0])

877290


In [13]:
# Shuffle the rows randomly (frac=1 samples every row once), then replace
# the old index with a fresh 0..n-1 index.
lr_to_rnnz_df = (
    lr_to_rnnz_df
    .sample(frac=1)
    .reset_index(drop=True)
)

In [14]:
# Write the lhs-generated combinations to a csv file (without the index
# column) so that D-optimal can be applied to them afterwards.
lr_to_rnnz_df.to_csv(
    path_or_buf='./csv-after-lhs/lhs-lr-150000-lc-100000-rc-50000-v1.csv',
    index=False,
)