<a href="https://colab.research.google.com/github/yds725/nuclear_plant_proj/blob/code_development/data_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive so the project files stored there can be
# read and written by the cells below (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np



def add_id_column(df, file_id):
    '''
    Return a new DataFrame with a leading `id` column filled with file_id.

    Parameters:

    df: [pd.DataFrame] rows read from a single csv file
    file_id: value written into every row of the new `id` column

    Return:

    A new DataFrame whose first column is `id`, followed by the columns of df.
    df itself is not modified.
    '''

    # Build the id column on df's own index. pd.concat(axis=1) aligns rows by
    # index label, so an id column carrying a fresh 0..n-1 index would produce
    # NaN-padded extra rows whenever df does not have the default RangeIndex.
    id_column = pd.Series(file_id, index=df.index, name='id')

    return pd.concat([id_column, df], axis=1)

def data_loader(path, train, nrows, **kwargs):

    '''
    Read one csv file and tag its rows with the file id (plus a label when train).

    Parameters:

    path: [str] '/'-separated path of a single train or test csv file; the
          file name up to its first '.' must be an integer and becomes the id
    train: [boolean] True when loading a train file, otherwise False
    nrows: [int] number of leading rows to read from the csv file
    lookup_table: [pd.DataFrame] contents of train_label.csv, with `id` and
                  `label` columns (required when train is True)
    event_time: [int] time at which state B occurs; rows at positions before
                it are labelled `normal` (required when train is True)
    normal: [int] label of state A (required when train is True)

    Return:

    data: train or test data with a leading `id` column; train data also
          carries a `label` column

    Raises:

    KeyError: train is True and lookup_table, event_time or normal is missing
    ValueError: the file name is not an integer, or lookup_table does not hold
                exactly one row for the file id
    '''

    # 1. File id = file name without folders or extension ('.../12.csv' -> 12)
    file_id = int(path.split('/')[-1].split('.')[0])

    # 2. Train only: resolve the label inputs before reading the (large) file
    #    so that a bad call fails fast.
    if train:
        lookup_table = kwargs['lookup_table']
        event_time = kwargs['event_time']
        normal = kwargs['normal']

        matches = lookup_table.loc[lookup_table['id'] == file_id, 'label']
        if len(matches) != 1:
            raise ValueError(
                'lookup_table must contain exactly one row for id {}, found {}'
                .format(file_id, len(matches))
            )
        # int(<Series>) is deprecated in pandas; extract the scalar explicitly.
        file_label = int(matches.iloc[0])

    # 3. Read the file and prepend the id column (identical for train and test)
    data = add_id_column(pd.read_csv(path, nrows=nrows), file_id)

    # 4. Train only: rows before event_time get `normal`, the rest the file's
    #    label. A positional mask (instead of two np.repeat pieces) also works
    #    when the file has fewer rows than event_time.
    if train:
        before_event = np.arange(data.shape[0]) < event_time
        data['label'] = np.where(before_event, normal, file_label)

    return data


In [10]:
# List the .py files in the project folder on Drive.
!ls /content/drive/My\ Drive/Nuclear_SemiProj/*.py

'/content/drive/My Drive/Nuclear_SemiProj/data_loader.py'


In [0]:
import sys
# Add the Drive project folder to the module search path so that modules
# stored there can be imported.
sys.path.append('/content/drive/My Drive/Nuclear_SemiProj')

In [0]:
import os
import pandas as pd 
import numpy as np
from multiprocessing import Pool 
import multiprocessing
from data_loader import data_loader #data_loader.py 파일을 다운 받아 주셔야 합니다. 
from tqdm import tqdm
from functools import partial

In [0]:
from pprint import pprint

In [0]:
def data_loader_all(func, path, train, nrows, **kwargs):
    '''
    Read every csv file of a folder in parallel and merge them into one DataFrame.

    Parameters:

    func: function that reads a single csv file; it is called with the file
          path as its only positional argument plus the keyword arguments
          nrows, train and (for train) lookup_table, event_time, normal
    path: [str] folder that holds the train or test csv files
    train: [boolean] True when loading train files, otherwise False
    nrows: [int] number of leading rows to read from each csv file
    lookup_table: [pd.DataFrame] contents of train_label.csv (required when train is True)
    event_time: [int] time at which state B occurs (required when train is True)
    normal: [int] label of state A (required when train is True)

    Return:

    combined_df: merged train or test data

    Raises:

    ValueError: path contains no csv files
    KeyError: train is True and lookup_table, event_time or normal is missing
    '''

    # Keep regular *.csv files only: os.listdir also returns sub-folders and
    # any other file that happens to live in the folder. Paths are joined with
    # '/' (not os.path.join) to match the original path format. Sorting makes
    # the row order of the result reproducible between runs.
    files_path = sorted(
        path + '/' + name
        for name in os.listdir(path)
        if name.lower().endswith('.csv') and os.path.isfile(path + '/' + name)
    )

    if not files_path:
        raise ValueError('no csv files found in {!r}'.format(path))

    if train:
        func_fixed = partial(
            func,
            nrows=nrows,
            train=True,
            lookup_table=kwargs['lookup_table'],
            event_time=kwargs['event_time'],
            normal=kwargs['normal'],
        )
    else:
        func_fixed = partial(func, nrows=nrows, train=False)

    # Read the files on all available cores. There is deliberately no
    # `if __name__ == '__main__'` test here: inside the function it skipped the
    # read and left df_list unbound whenever this code was imported from a
    # module. Platforms that start workers by re-importing the main module
    # need that guard around the *call* to this function instead.
    # The context manager shuts the workers down even when the read is
    # interrupted or a worker raises.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        df_list = list(tqdm(pool.imap(func_fixed, files_path), total=len(files_path)))

    # Merge the per-file frames
    combined_df = pd.concat(df_list, ignore_index=True)

    return combined_df
    

In [0]:
## Load data
# Folders with the per-file train / test csv files, and the label lookup table.
train_path = '/content/drive/My Drive/Nuclear_SemiProj/train'
test_path = '/content/drive/My Drive/Nuclear_SemiProj/test'
label = pd.read_csv('/content/drive/My Drive/Nuclear_SemiProj/train_label.csv')
# Read the first 100 rows of every train file. Within each file the first 10
# rows get the placeholder label 999 (`normal`); the remaining rows get the
# file's label from the lookup table.
train = data_loader_all(data_loader, path = train_path, train = True, nrows = 100, normal = 999, event_time = 10, lookup_table = label)

  1%|          | 10/828 [00:05<07:06,  1.92it/s]

KeyboardInterrupt: ignored

In [0]:
# Save the merged train data to Drive without the index column.
# NOTE(review): the file name is spelled "entrire_train.csv"; the cell that
# reads it back uses the same spelling, so rename both together or not at all.
train.to_csv("/content/drive/My Drive/Nuclear_SemiProj/output/entrire_train.csv", index = False)

NameError: ignored

In [0]:
# Print the first 1000 rows of the merged train data.
print(train.head(1000) )

NameError: ignored

In [0]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(train)

In [0]:
# Read the first 60 rows of every test file (train = False, so no label inputs are passed).
test = data_loader_all(data_loader, path = test_path, train = False, nrows = 60)

In [0]:
# Read the saved train csv back from the Drive output folder.
train_data = pd.read_csv("/content/drive/My Drive/Nuclear_SemiProj/output/entrire_train.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
# Keep only the rows whose label is not 999, the placeholder value passed as
# `normal` when the train data was built.
train_data_1 = train_data.query('label != 999') 




    id  time      V0000     V0001  ...         V5118  V5119  V5120  label
10   0    10  30.474394  8.691177  ...  1.421620e-05   85.4    0.0    110
11   0    11  30.470463  8.736521  ... -6.114455e-06   85.4    0.0    110
12   0    12  30.465427  8.753559  ... -1.813291e-05   85.4    0.0    110
13   0    13  30.458532  8.715056  ... -5.745568e-07   85.4    0.0    110
14   0    14  30.475773  8.790241  ...  8.437883e-06   85.4    0.0    110
15   0    15  30.469574  8.722739  ... -2.988467e-06   85.4    0.0    110
16   0    16  30.471422  8.843733  ...  1.136327e-05   85.4    0.0    110
17   0    17  30.465795  8.639923  ...  1.157871e-06   85.4    0.0    110
18   0    18  30.451257  8.643156  ...  4.229167e-06   85.4    0.0    110
19   0    19  30.469449  8.786702  ...  1.503048e-06   85.4    0.0    110
20   0    20  30.497599  8.815079  ...  1.728125e-05   85.4    0.0    110
21   0    21  30.491865  8.708300  ...  1.601781e-05   85.4    0.0    110
22   0    22  30.469387  8.648828  ...

In [0]:
# Print the first 500 rows of the filtered train data.
print(train_data_1.head(500) )

      id  time      V0000     V0001  ...         V5118    V5119  V5120  label
10     0    10  30.474394  8.691177  ...  1.421620e-05     85.4    0.0    110
11     0    11  30.470463  8.736521  ... -6.114455e-06     85.4    0.0    110
12     0    12  30.465427  8.753559  ... -1.813291e-05     85.4    0.0    110
13     0    13  30.458532  8.715056  ... -5.745568e-07     85.4    0.0    110
14     0    14  30.475773  8.790241  ...  8.437883e-06     85.4    0.0    110
..   ...   ...        ...       ...  ...           ...      ...    ...    ...
555  102    55  30.492440  8.472538  ... -7.107972e-06  5.82342    0.0    173
556  102    56  30.502597  8.303426  ... -5.954663e-06  5.82342    0.0    173
557  102    57  30.484245  8.221903  ...  1.006442e-05  5.82342    0.0    173
558  102    58  30.476942  8.345245  ...  2.110309e-05  5.82342    0.0    173
559  102    59  30.472778  8.313038  ...  1.355394e-05  5.82342    0.0    173

[500 rows x 5124 columns]


In [0]:
# Save the filtered rows to Drive without the index column.
# NOTE(review): train_data_1 still contains the `label` column although the
# file is named B_x_train.csv — confirm the intended file name.
train_data_1.to_csv("/content/drive/My Drive/Nuclear_SemiProj/output/B_x_train.csv", index = False)

In [20]:
#b_train_data = pd.read_csv("/content/drive/My Drive/Nuclear_SemiProj/output/B_train.csv")

# Read the csv in chunks of 10000 rows and concatenate the chunks into one
# frame with a fresh 0..n-1 index.
# NOTE(review): no cell visible in this notebook writes B_train.csv — confirm
# where that file comes from.
chunksize = 10000
tfr = pd.read_csv("/content/drive/My Drive/Nuclear_SemiProj/output/B_train.csv", chunksize=chunksize, iterator=True)
b_train_data = pd.concat(tfr, ignore_index=True)




  exec(code_obj, self.user_global_ns, self.user_ns)


In [23]:
# Print the (rows, columns) shape of the loaded frame.
print(b_train_data.shape)

(74520, 5124)


In [0]:
def read_csv(filename):
    """Load the csv file at `filename` and return it as a pandas DataFrame."""
    frame = pd.read_csv(filename)
    return frame

In [0]:
#print(b_train_data.head(100))

# pd.read_csv

# wrap your csv importer in a function that can be mapped
# def read_csv(filename):
#     'converts a filename to a pandas dataframe'
#     return pd.read_csv(filename)


# def main():

#     # get a list of file names
#     files = os.listdir('.')
#     file_list = [filename for filename in files if filename.split('.')[1]=='csv']

#     # set up your pool
#     with Pool(processes=8) as pool: # or whatever your hardware can support

#         # have your pool map the file names to dataframes
#         df_list = pool.map(read_csv, file_list)

#         # reduce the list of dataframes to a single dataframe
#         combined_df = pd.concat(df_list, ignore_index=True)

# if __name__ == '__main__':
#     main()

# with Pool(processes = multiprocessing.cpu_count()) as pool:
#  b_train_data = pool.imap(read_csv, "/content/drive/My Drive/Nuclear_SemiProj/output/B_train.csv")

# print(b_train_data.head(100))

# Split features and target: every column except `label` goes to
# b_x_train_data; `label` alone is kept as a one-column DataFrame.
b_x_train_data = b_train_data.drop('label', axis=1)
b_y_train_data = b_train_data['label'].to_frame() 

# print(b_train_data.head(100))
# print(b_x_train_data.head(100))
# print(b_y_train_data.head(100))





In [25]:
# set up your pool
# with Pool(processes = multiprocessing.cpu_count()) as pool: # or whatever your hardware can support
#   pool.imap(write_csv, "/content/drive/My Drive/Nuclear_SemiProj/output/b_x_train.csv")
# pool = Pool(processes = multiprocessing.cpu_count()) 
# pool.imap(pandas.to_csv, files_path)

# Read b_x_train.csv from Drive in chunks of 10000 rows and concatenate them.
# This rebinds b_x_train_data, replacing the frame produced by the
# drop('label') cell with the copy stored on disk.
chunksize = 10000
tfr_x = pd.read_csv("/content/drive/My Drive/Nuclear_SemiProj/output/b_x_train.csv", chunksize=chunksize, iterator=True)
b_x_train_data = pd.concat(tfr_x, ignore_index=True)



# b_x_train_data.to_csv("/content/drive/My Drive/Nuclear_SemiProj/output/b_x_train.csv", index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
# Print the first 1000 rows of the feature frame.
print(b_x_train_data.head(1000))

      id  time      V0000     V0001  ...  V5117         V5118  V5119  V5120
0      0    10  30.474394  8.691177  ...    0.0  1.421620e-05   85.4    0.0
1      0    11  30.470463  8.736521  ...    0.0 -6.114455e-06   85.4    0.0
2      0    12  30.465427  8.753559  ...    0.0 -1.813291e-05   85.4    0.0
3      0    13  30.458532  8.715056  ...    0.0 -5.745568e-07   85.4    0.0
4      0    14  30.475773  8.790241  ...    0.0  8.437883e-06   85.4    0.0
..   ...   ...        ...       ...  ...    ...           ...    ...    ...
995  108    15  30.480877  8.807196  ...    0.0  2.017248e-06   85.4    0.0
996  108    16  30.486193  8.815195  ...    0.0  4.512549e-06   85.4    0.0
997  108    17  30.473104  8.658621  ...    0.0 -9.544943e-06   85.4    0.0
998  108    18  30.487434  8.737804  ...    0.0  4.466532e-06   85.4    0.0
999  108    19  30.455963  8.688224  ...    0.0  7.280729e-06   85.4    0.0

[1000 rows x 5123 columns]


In [0]:
# Save the label frame to Drive without the index column, writing 10000 rows at a time.
b_y_train_data.to_csv("/content/drive/My Drive/Nuclear_SemiProj/output/b_y_train.csv", index=False, chunksize=10000)