In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler

import warnings
# Ignore every FutureWarning for the rest of the session so deprecation
# notices do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Sub-dataset names understood by the raw loader.
_SUB_DATASETS = ('A123', 'A456', 'B456')
# WAFER_IDs removed from the raw training features as outliers.
_OUTLIER_WAFER_IDS = [1834206730, 1834206944, 1834206972, 2058207580]
# Row labels removed from the raw training label table
# (presumably the label rows of the outlier wafers -- TODO confirm).
_OUTLIER_LABEL_ROWS = [892, 894, 897, 1675]


def _read_csv_folder(folder):
    # Read every CSV in `folder` (in sorted file-name order) and stack them once.
    frames = [pd.read_csv(os.path.join(folder, name))
              for name in sorted(os.listdir(folder))]
    return pd.concat(frames)


def _select_features(x, dataset):
    # Keep the feature rows of one sub-dataset, sorted by WAFER_ID then CHAMBER.
    if dataset == 'B456':
        subset = x[x["STAGE"] == "B"]
    else:
        stage_a = x[x["STAGE"] == "A"]
        if dataset == 'A123':
            subset = stage_a[stage_a["CHAMBER"] <= 3]
        else:
            subset = stage_a[stage_a["CHAMBER"] >= 4]
    return subset.sort_values(["WAFER_ID", "CHAMBER"])


def _select_labels(y, dataset):
    # Keep the label rows of one sub-dataset, sorted by WAFER_ID.
    if dataset == 'B456':
        subset = y[y["STAGE"] == "B"]
    else:
        stage_a = y[y["STAGE"] == "A"]
        # The label table is split on the removal rate (threshold 100),
        # not on the chamber number.
        if dataset == 'A123':
            subset = stage_a[stage_a["AVG_REMOVAL_RATE"] >= 100]
        else:
            subset = stage_a[stage_a["AVG_REMOVAL_RATE"] < 100]
    return subset.sort_values(["WAFER_ID"])


def load_data(dataset, method):
    '''
    Load the train/test features and labels of one CMP sub-dataset.

    dataset = {'A123', 'A456', 'B456'}
    method = {'raw', 'scaled'}

    'scaled' reads the four files x_train_<dataset>.csv, x_test_<dataset>.csv,
    y_train_<dataset>.csv and y_test_<dataset>.csv from ./MyCMP unchanged.

    'raw' reads every file under the challenge folder's training/ and test/
    sub-folders plus the two removal-rate CSVs, drops the hard-coded outliers
    from the training split, and returns only the rows of `dataset`:
    features are selected by STAGE and CHAMBER, labels by STAGE and
    AVG_REMOVAL_RATE.

    Returns
    -------
    (x_train, x_test, y_train, y_test) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If `method` is not 'raw' or 'scaled', or if `dataset` is not a known
        sub-dataset when method == 'raw' (previously None was returned).
    '''

    # data with scaled wafer process time(316)
    if method == 'scaled':
        data_dir = os.path.join('.', 'MyCMP')
        return tuple(
            pd.read_csv(os.path.join(data_dir, f'{part}_{dataset}.csv'))
            for part in ('x_train', 'x_test', 'y_train', 'y_test')
        )

    if method != 'raw':
        raise ValueError(f"unknown method {method!r}; expected 'raw' or 'scaled'")
    # Validate before touching the disk so a typo fails fast.
    if dataset not in _SUB_DATASETS:
        raise ValueError(f"unknown dataset {dataset!r}; expected one of {_SUB_DATASETS}")

    # raw data
    raw_data_dir = os.path.join('.', '2016 PHM DATA CHALLENGE CMP DATA SET')

    x_train = _read_csv_folder(os.path.join(raw_data_dir, 'training'))
    y_train = pd.read_csv(os.path.join(raw_data_dir, 'CMP-training-removalrate.csv'))

    x_test = _read_csv_folder(os.path.join(raw_data_dir, 'test'))
    y_test = pd.read_csv(os.path.join(raw_data_dir, 'CMP-testing-removalrate.csv'))

    # drop outliers (training split only)
    x_train = x_train[~x_train["WAFER_ID"].isin(_OUTLIER_WAFER_IDS)]
    y_train = y_train.drop(_OUTLIER_LABEL_ROWS)

    return (
        _select_features(x_train, dataset),
        _select_features(x_test, dataset),
        _select_labels(y_train, dataset),
        _select_labels(y_test, dataset),
    )

In [3]:
# obtain the wafer order sorted by start time
def get_wafer_order(df):
    '''
    Build an ordered categorical dtype that ranks wafers by start time.

    A wafer's start time is the smallest TIMESTAMP among its rows. Wafers that
    share the same start time are ranked by ascending WAFER_ID so the result
    is deterministic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'WAFER_ID' and 'TIMESTAMP'. It is not modified.

    Returns
    -------
    pandas.api.types.CategoricalDtype
        Ordered dtype whose categories are the WAFER_IDs, earliest start first.
    '''
    # One pass over the frame instead of re-filtering it once per wafer;
    # groupby returns the wafers in ascending WAFER_ID order.
    start_times = df.groupby('WAFER_ID')['TIMESTAMP'].min()

    # A stable sort keeps the WAFER_ID order for wafers with equal start times.
    start_times = start_times.sort_values(kind='mergesort')

    return CategoricalDtype(start_times.index.to_numpy(), ordered=True)

In [4]:
def _order_by_wafer(frame, wafer_order):
    # Cast WAFER_ID to the ordered categorical and sort rows by it.
    # A stable sort is required: the default quicksort may shuffle the rows
    # that belong to the same wafer.
    frame = frame.assign(WAFER_ID=frame['WAFER_ID'].astype(wafer_order))
    return frame.sort_values(['WAFER_ID'], kind='mergesort').reset_index(drop=True)


# sort the data and labels
def get_sorted_data(dataset):
    '''
    Return the scaled features and labels of `dataset`, ordered by wafer start time.

    The wafer order is derived from the raw train+test features (earliest
    TIMESTAMP per wafer). The scaled train and test splits are then
    concatenated and sorted by that order; rows of the same wafer keep the
    relative order they had in the scaled files.

    Parameters
    ----------
    dataset : str
        Sub-dataset name, passed through to load_data.

    Returns
    -------
    (x_all, y_all) : tuple of pandas.DataFrame
        Features and labels with a fresh 0..n-1 index. 'WAFER_ID' is an
        ordered categorical column in both frames.
    '''
    x_train_raw, x_test_raw, _, _ = load_data(dataset=dataset, method='raw')
    wafer_order = get_wafer_order(pd.concat((x_train_raw, x_test_raw)))

    x_train, x_test, y_train, y_test = load_data(dataset=dataset, method='scaled')

    # NOTE(review): a WAFER_ID absent from the raw data becomes NaN in the cast
    # and is sorted to the end -- confirm the scaled and raw files always agree.
    x_all = _order_by_wafer(pd.concat((x_train, x_test)), wafer_order)
    y_all = _order_by_wafer(pd.concat((y_train, y_test)), wafer_order)

    return x_all, y_all

In [5]:
# min-max normalization
def norm(train, test):
    '''
    Min-max scale both splits with a scaler fitted on the training split only.

    Parameters
    ----------
    train, test : array-like
        Feature matrices with the same number of columns.

    Returns
    -------
    (scaled_train, scaled_test)
        The two transformed matrices; the training one spans the range [0, 1].
    '''
    scaler = MinMaxScaler(feature_range=(0, 1))

    # learn the per-column min/max from the training data and apply it
    scaled_train = scaler.fit_transform(train)
    # reuse the same fitted scaler for the test data
    scaled_test = scaler.transform(test)

    return scaled_train, scaled_test

# create three files(list.txt, train.csv, test.csv) for each sub-datasets
def get_data_files(dataset, steps_per_wafer=316, train_ratio=0.7):
    '''
    Write train.csv, test.csv and list.txt for one sub-dataset into ./<dataset>.

    The sorted features are left-joined with the labels on 'WAFER_ID', split
    into a leading training part and a trailing test part, imputed, min-max
    scaled (scaler fitted on the training part) and written together with the
    'MRR' label column. list.txt holds the feature column names, one per line.

    Parameters
    ----------
    dataset : str
        Sub-dataset name; also the name of the output folder.
    steps_per_wafer : int, default 316
        Block size used to align the split; presumably the number of rows each
        wafer occupies in the scaled data -- TODO confirm.
    train_ratio : float, default 0.7
        Fraction of the blocks that go to the training part.

    Raises
    ------
    ValueError
        If `steps_per_wafer` is smaller than 1 or `train_ratio` is outside [0, 1].
    '''
    if steps_per_wafer < 1:
        raise ValueError('steps_per_wafer must be a positive integer')
    if not 0 <= train_ratio <= 1:
        raise ValueError('train_ratio must be between 0 and 1')

    x_all, y_all = get_sorted_data(dataset)

    data = x_all.merge(y_all, how='left', on='WAFER_ID')

    # split the data on a multiple of steps_per_wafer so no block is cut in half
    n_train_rows = int((len(data) / steps_per_wafer) * train_ratio) * steps_per_wafer
    train = data[:n_train_rows]
    test = data[n_train_rows:]

    train_labels = train['MRR'].tolist()
    test_labels = test['MRR'].tolist()

    # retain only the SVID columns
    # (positional: skips the first two and the last two columns of the merged frame)
    xtrain = train.iloc[:, 2:-2]
    xtest = test.iloc[:, 2:-2]

    # fill gaps with the column mean; columns that are entirely NaN fall back to 0
    # NOTE(review): the test part is imputed with its own means rather than the
    # training means -- confirm this is intended.
    xtrain = xtrain.fillna(xtrain.mean()).fillna(0)
    xtest = xtest.fillna(xtest.mean()).fillna(0)

    # trim the column names
    xtrain = xtrain.rename(columns=lambda x: x.strip())
    xtest = xtest.rename(columns=lambda x: x.strip())

    x_train, x_test = norm(xtrain.values, xtest.values)

    train_df = pd.DataFrame(x_train, columns=xtrain.columns)
    test_df = pd.DataFrame(x_test, columns=xtest.columns)

    # attach the label column
    train_df['MRR'] = train_labels
    test_df['MRR'] = test_labels

    output_dir = f'./{dataset}'
    os.makedirs(output_dir, exist_ok=True)

    train_df.to_csv(f'{output_dir}/train.csv')
    test_df.to_csv(f'{output_dir}/test.csv')

    # the context manager closes the file even if a write fails
    with open(f'{output_dir}/list.txt', 'w') as f:
        f.writelines(col + '\n' for col in xtrain.columns)


In [6]:
# Write list.txt, train.csv and test.csv for every sub-dataset listed below;
# each one gets its own folder named after the sub-dataset.
# NOTE(review): 'A123' is a documented option of load_data but is not
# generated here -- confirm this is intentional.
for sub_dataset in ('A456', 'B456'):
    get_data_files(sub_dataset)