In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# 한글 폰트 설정
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.style.use('default')

print("=" * 80)
print("제약 제조 데이터 전처리 및 특성 엔지니어링 실습")
print("=" * 80)

제약 제조 데이터 전처리 및 특성 엔지니어링 실습


## 1. 데이터 수집 및 로딩

In [23]:
 # 실험실 데이터 (배치별 원료, 중간제품, 완제품 품질 데이터)
laboratory_df = pd.read_csv('../Laboratory.csv', sep=';')
print(f"✅ Laboratory 데이터 로딩 완료: {laboratory_df.shape}")

# 공정 데이터 (배치별 집계된 공정 센서 데이터)
process_df = pd.read_csv('../Process.csv', sep=';')
print(f"✅ Process 데이터 로딩 완료: {process_df.shape}")

# 정규화 계수 데이터
normalization_df = pd.read_csv('../Normalization.csv', sep=';')
print(f"✅ Normalization 데이터 로딩 완료: {normalization_df.shape}")


print(f"\n📈 데이터 개요:")
print(f"- 총 배치 수: {len(laboratory_df)}")
print(f"- Laboratory 변수 수: {len(laboratory_df.columns)}")
print(f"- Process 변수 수: {len(process_df.columns)}")

✅ Laboratory 데이터 로딩 완료: (1005, 55)
✅ Process 데이터 로딩 완료: (1005, 35)
✅ Normalization 데이터 로딩 완료: (25, 3)

📈 데이터 개요:
- 총 배치 수: 1005
- Laboratory 변수 수: 55
- Process 변수 수: 35


In [24]:
laboratory_df

Unnamed: 0,batch,code,strength,size,start,api_code,api_batch,smcc_batch,lactose_batch,starch_batch,...,tbl_tensile,fct_tensile,tbl_yield,batch_yield,dissolution_av,dissolution_min,resodual_solvent,impurities_total,impurity_o,impurity_l
0,1,25,5MG,240000,nov.18,5,2,1,2,1,...,1.412698,1.926183,95.785,94.697,93.83,86,0.06,0.33,0.05,0.16
1,2,25,5MG,240000,nov.18,5,2,1,2,1,...,1.412698,1.986377,98.467,97.348,99.67,92,0.04,0.34,0.06,0.16
2,3,25,5MG,240000,nov.18,5,2,1,2,1,...,1.412698,2.016473,98.496,99.242,97.33,92,0.03,0.28,0.05,0.16
3,4,25,5MG,240000,nov.18,5,2,1,2,1,...,1.474120,1.956280,97.736,98.106,94.50,89,0.03,0.30,0.05,0.18
4,5,25,5MG,240000,nov.18,5,2,1,2,1,...,1.443409,1.926183,98.106,98.106,92.00,88,0.04,0.31,0.05,0.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1001,17,20M,960000,apr.21,3,254,18,22,17,...,0.901424,1.100889,98.623,98.343,91.00,85,0.02,0.05,0.05,0.05
1001,1002,17,20M,960000,apr.21,3,254,18,22,17,...,0.939783,1.064192,98.626,98.532,89.17,84,0.02,0.05,0.05,0.05
1002,1003,17,20M,960000,apr.21,3,254,18,22,17,...,0.939783,1.108944,98.157,98.343,90.83,87,0.02,0.05,0.05,0.05
1003,1004,17,20M,960000,apr.21,3,254,18,22,17,...,1.035679,1.225385,99.336,99.242,92.67,90,0.02,0.05,0.05,0.05


In [25]:
process_df

Unnamed: 0,batch,code,tbl_speed_mean,tbl_speed_change,tbl_speed_0_duration,total_waste,startup_waste,weekend,fom_mean,fom_change,...,ejection_min,Startup_tbl_fill_maxDifference,Startup_main_CompForce_mean,Startup_tbl_fill_mean,Drug release average (%),Drug release min (%),Residual solvent,Total impurities,Impurity O,Impurity L
0,1,25,99.864656,5.416667,149.583333,2125.416667,5085,no,49.961446,12,...,196,0.38,4.587500,5.466667,93.83,86.0,0.06,0.33,0.05,0.16
1,2,25,99.936342,2.500000,128.333333,887.500000,2115,no,49.962040,5,...,194,0.18,4.390909,5.315455,99.67,92.0,0.04,0.34,0.06,0.16
2,3,25,99.985984,2.500000,83.333333,796.250000,1895,no,49.961176,6,...,184,0.12,4.430000,5.242000,97.33,92.0,0.03,0.28,0.05,0.16
3,4,25,99.976868,2.916667,76.250000,695.833333,1645,no,49.960900,9,...,197,0.24,4.500000,5.221250,94.50,89.0,0.03,0.30,0.05,0.18
4,5,25,99.968284,2.500000,121.250000,829.166667,1971,no,50.000000,5,...,205,0.19,3.960000,5.233000,92.00,88.0,0.04,0.31,0.05,0.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1001,17,119.973595,1.458333,43.125000,972.395833,3564,no,79.955846,8,...,138,0.20,6.333333,6.523333,,,,,,
1001,1002,17,119.941321,2.604167,43.125000,1055.625000,4090,no,80.000000,8,...,137,0.13,5.987500,6.438125,,,,,,
1002,1003,17,119.893113,3.958333,42.812500,1456.875000,6950,no,79.978230,9,...,120,0.38,6.134783,6.381739,,,,,,
1003,1004,17,120.000000,1.145833,36.041667,404.687500,1793,no,79.964059,9,...,187,0.22,5.887500,6.532500,,,,,,


In [26]:
normalization_df

Unnamed: 0,Product code,Batch Size (tablets),Normalisation factor
0,1,240000,2.4
1,2,1920000,19.2
2,3,960000,9.6
3,4,583000,5.83
4,5,2400000,24.0
5,6,2400000,24.0
6,7,1200000,12.0
7,8,1100000,11.0
8,9,240000,2.4
9,10,960000,9.6


## 2. 데이터 전처리
### 2-1. 데이터 통계 및 결측치 확인

In [27]:
print("🔍 Laboratory 데이터 기본 정보:")
print(laboratory_df.info())

print("\n🔍 Process 데이터 기본 정보:")
print(process_df.info())

🔍 Laboratory 데이터 기본 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1005 entries, 0 to 1004
Data columns (total 55 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   batch                 1005 non-null   int64  
 1   code                  1005 non-null   int64  
 2   strength              1005 non-null   object 
 3   size                  1005 non-null   int64  
 4   start                 1005 non-null   object 
 5   api_code              1005 non-null   int64  
 6   api_batch             1005 non-null   int64  
 7   smcc_batch            1005 non-null   int64  
 8   lactose_batch         1005 non-null   int64  
 9   starch_batch          1005 non-null   int64  
 10  api_water             1005 non-null   object 
 11  api_total_impurities  1000 non-null   object 
 12  api_l_impurity        996 non-null    object 
 13  api_content           1003 non-null   float64
 14  api_ps01              1005 non-null   object 
 1

In [28]:
# 결측값 확인
print("\n❗ Laboratory 데이터 결측값:")
missing_lab = laboratory_df.isnull().sum()
print(missing_lab[missing_lab > 0])

print("\n❗ Process 데이터 결측값:")
missing_process = process_df.isnull().sum()
print(missing_process[missing_process > 0])


❗ Laboratory 데이터 결측값:
api_total_impurities     5
api_l_impurity           9
api_content              2
tbl_min_weight          10
tbl_max_weight          10
dtype: int64

❗ Process 데이터 결측값:
Drug release average (%)    18
Drug release min (%)        18
Residual solvent            18
Total impurities            18
Impurity O                  18
Impurity L                  18
dtype: int64


In [29]:
# 기본 통계
print("\n📊 주요 품질 지표 통계:")
quality_cols = ['dissolution_av', 'dissolution_min', 'impurities_total']
if all(col in laboratory_df.columns for col in quality_cols):
    print(laboratory_df[quality_cols].describe())


📊 주요 품질 지표 통계:
       dissolution_av  dissolution_min  impurities_total
count     1005.000000      1005.000000       1005.000000
mean        90.649811        85.589055          0.138886
std          3.365709         4.234331          0.098889
min         82.500000        74.000000          0.050000
25%         88.330000        83.000000          0.050000
50%         90.330000        85.000000          0.090000
75%         92.830000        88.000000          0.230000
max        102.670000       100.000000          0.600000


### 2-2. 결측치 처리

In [33]:
# 데이터 복사
lab_clean = laboratory_df.copy()
process_clean = process_df.copy()

print("🧹 1) 결측값 처리")

# 수치형 컬럼 식별
numeric_cols_lab = lab_clean.select_dtypes(include=[np.number]).columns
numeric_cols_process = process_clean.select_dtypes(include=[np.number]).columns

# 결측값을 중앙값으로 대체
for col in numeric_cols_lab:
    if lab_clean[col].isnull().sum() > 0:
        missing_count = lab_clean[col].isnull().sum()
        median_val = lab_clean[col].median()
        lab_clean[col].fillna(median_val, inplace=True)
        print(f"   - {col}: 결측값 {missing_count}개를 중앙값 {median_val:.2f}로 대체")

for col in numeric_cols_process:
    if process_clean[col].isnull().sum() > 0:
        missing_count = process_clean[col].isnull().sum()
        median_val = process_clean[col].median()
        process_clean[col].fillna(median_val, inplace=True)
        print(f"   - {col}: 결측값 {missing_count}개를 중앙값 {median_val:.2f}로 대체")

print("\n🔗 2) 데이터 병합")

# batch 기준으로 데이터 병합
merged_df = pd.merge(lab_clean, process_clean, on='batch', how='inner')
print(f"   - 병합 후 데이터 크기: {merged_df.shape}")

# 정규화 계수 병합
if 'code_x' in merged_df.columns:
    merged_df = pd.merge(merged_df, normalization_df, left_on='code_x', right_on='Product code', how='left')

print("\n🚫 3) 이상값 제거")

🧹 1) 결측값 처리
   - api_content: 결측값 2개를 중앙값 94.40로 대체
   - tbl_min_weight: 결측값 10개를 중앙값 219.00로 대체
   - tbl_max_weight: 결측값 10개를 중앙값 228.00로 대체
   - Drug release average (%): 결측값 18개를 중앙값 90.33로 대체
   - Drug release min (%): 결측값 18개를 중앙값 85.00로 대체
   - Residual solvent: 결측값 18개를 중앙값 0.04로 대체
   - Total impurities: 결측값 18개를 중앙값 0.09로 대체
   - Impurity O: 결측값 18개를 중앙값 0.05로 대체
   - Impurity L: 결측값 18개를 중앙값 0.05로 대체

🔗 2) 데이터 병합
   - 병합 후 데이터 크기: (1005, 89)

🚫 3) 이상값 제거


### 2-3. 이상치 제거

In [34]:
# IQR 방법으로 이상값 제거 함수
def remove_outliers_iqr(df, columns):
    """
    IQR(Interquartile Range) 방법을 사용하여 이상값을 제거하는 함수
    
    Parameters:
    -----------
    df : pandas.DataFrame
        이상값을 제거할 데이터프레임
    columns : list
        이상값을 확인할 컬럼 리스트
    
    Returns:
    --------
    df_clean : pandas.DataFrame
        이상값이 제거된 데이터프레임
    outliers_removed : int
        제거된 이상값의 총 개수
    
    설명:
    ----
    - IQR = Q3 - Q1 (3분위수 - 1분위수)
    - 이상값 기준: Q1 - 1.5*IQR 미만 또는 Q3 + 1.5*IQR 초과인 값
    - 각 컬럼별로 순차적으로 이상값을 제거
    """
    df_clean = df.copy()
    outliers_removed = 0
    
    for col in columns:
        if col in df_clean.columns and df_clean[col].dtype in ['int64', 'float64']:
            # 1분위수(Q1)와 3분위수(Q3) 계산
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            
            # IQR(사분위수 범위) 계산
            IQR = Q3 - Q1
            
            # 이상값 기준선 설정 (1.5 * IQR 규칙)
            lower_bound = Q1 - 1.5 * IQR  # 하한선
            upper_bound = Q3 + 1.5 * IQR  # 상한선
            
            # 이상값 식별 및 제거
            outliers = df_clean[(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)]
            df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
            outliers_removed += len(outliers)
    
    return df_clean, outliers_removed

In [32]:
# 주요 품질 지표에서 이상값 제거
quality_indicators = ['dissolution_av', 'dissolution_min', 'impurities_total', 'tbl_speed_mean']
merged_clean, outliers_count = remove_outliers_iqr(merged_df, quality_indicators)
print(f"   - 제거된 이상값: {outliers_count}개")
print(f"   - 최종 데이터 크기: {merged_clean.shape}")

   - 제거된 이상값: 121개
   - 최종 데이터 크기: (884, 92)


## 3. 특성 엔지니어링

In [37]:
feature_df = merged_clean.copy()
    
print("🔧 1) 새로운 특성 생성")

# 데이터 타입 확인 및 정리
print("   🔍 데이터 타입 확인 중...")
for col in feature_df.columns:
    if feature_df[col].dtype == 'object':
        # 숫자로 변환 가능한지 확인
        try:
            feature_df[col] = pd.to_numeric(feature_df[col], errors='ignore')
        except:
            pass

# 공정 효율성 지표 - batch_yield 컬럼 확인 후 생성
if 'batch_yield' in feature_df.columns:
    # 총 낭비가 없다면 다른 방식으로 효율성 계산
    if 'total_waste' in feature_df.columns:
        feature_df['process_efficiency'] = feature_df['batch_yield'] / (feature_df['total_waste'] + 1)
        print("   - process_efficiency: 공정 효율성 지표 생성")
    else:
        # size 기반 효율성 지표
        feature_df['batch_efficiency'] = feature_df['batch_yield'] / feature_df['size'] * 100
        print("   - batch_efficiency: 배치 효율성 지표 생성")

# 품질 일관성 지표 - 실제 존재하는 컬럼으로 수정
if 'dissolution_av' in feature_df.columns and 'dissolution_min' in feature_df.columns:
    feature_df['quality_consistency'] = feature_df['dissolution_min'] / feature_df['dissolution_av']
    print("   - quality_consistency: 품질 일관성 지표 생성")

# 용출 품질 범위 지표
if 'dissolution_av' in feature_df.columns and 'dissolution_min' in feature_df.columns:
    feature_df['dissolution_range'] = feature_df['dissolution_av'] - feature_df['dissolution_min']
    print("   - dissolution_range: 용출 변동성 지표 생성")

# 압축력 안정성 지표 - 실제 컬럼명에 맞게 수정
compression_mean_cols = [col for col in feature_df.columns if 'CompForce' in col and 'mean' in col]
compression_sd_cols = [col for col in feature_df.columns if 'CompForce' in col and 'sd' in col]

if compression_mean_cols and compression_sd_cols:
    mean_col = compression_mean_cols[0]
    sd_col = compression_sd_cols[0]
    feature_df['compression_stability'] = feature_df[sd_col] / (feature_df[mean_col] + 1e-6)
    print(f"   - compression_stability: 압축력 안정성 지표 생성 ({mean_col}, {sd_col})")

# 태블릿 속도 안정성 지표
if 'tbl_speed_mean' in feature_df.columns and 'tbl_speed_change' in feature_df.columns:
    feature_df['speed_stability'] = feature_df['tbl_speed_change'] / (feature_df['tbl_speed_mean'] + 1e-6)
    print("   - speed_stability: 속도 안정성 지표 생성")

# API 관련 특성 생성 - 수치형 컬럼만 선택
api_cols = [col for col in feature_df.columns if 'api_' in col.lower() 
           and feature_df[col].dtype in ['int64', 'float64']]
if len(api_cols) > 0:
    feature_df['api_quality_score'] = feature_df[api_cols].mean(axis=1)
    print(f"   - api_quality_score: API 품질 종합 점수 생성 (사용된 컬럼: {len(api_cols)}개)")

# 불순물 비율 특성 - 수치형 컬럼만 처리
impurity_cols = [col for col in feature_df.columns 
                if ('impurity' in col.lower() or 'impurities' in col.lower()) 
                and feature_df[col].dtype in ['int64', 'float64']]

if len(impurity_cols) > 1:
    print(f"   🔍 불순물 관련 컬럼: {impurity_cols}")
    # 총 불순물 대비 개별 불순물 비율
    total_col = [col for col in impurity_cols if 'total' in col.lower()]
    if total_col:
        other_impurity_cols = [col for col in impurity_cols if 'total' not in col.lower()]
        for imp_col in other_impurity_cols:
            # 수치형 데이터인지 다시 한번 확인
            if (feature_df[imp_col].dtype in ['int64', 'float64'] and 
                feature_df[total_col[0]].dtype in ['int64', 'float64']):
                feature_df[f'{imp_col}_ratio'] = feature_df[imp_col] / (feature_df[total_col[0]] + 1e-6)
                print(f"   - {imp_col}_ratio: {imp_col} 비율 특성 생성")

# 배치 크기별 정규화 특성
if 'size' in feature_df.columns and feature_df['size'].dtype in ['int64', 'float64']:
    size_related_cols = ['tbl_speed_mean', 'dissolution_av', 'impurities_total']
    for col in size_related_cols:
        if (col in feature_df.columns and 
            feature_df[col].dtype in ['int64', 'float64']):
            feature_df[f'{col}_per_1000tablets'] = feature_df[col] * 1000 / feature_df['size']
            print(f"   - {col}_per_1000tablets: 배치 크기 정규화 특성 생성")

print("\n📊 2) 범주형 변수 인코딩")

# 범주형 변수 원-핫 인코딩
categorical_cols = feature_df.select_dtypes(include=['object']).columns
categorical_cols = [col for col in categorical_cols if col not in ['batch', 'start']]  # 식별자 제외

print(f"   📋 인코딩 대상 범주형 변수: {list(categorical_cols)}")

for col in categorical_cols:
    unique_count = feature_df[col].nunique()
    if unique_count < 20 and unique_count > 1:  # 유니크 값이 1개 초과 20개 미만인 경우만
        try:
            dummies = pd.get_dummies(feature_df[col], prefix=col, drop_first=True)
            feature_df = pd.concat([feature_df, dummies], axis=1)
            feature_df.drop(col, axis=1, inplace=True)
            print(f"   - {col}: 원-핫 인코딩 완료 (유니크 값: {unique_count}개)")
        except Exception as e:
            print(f"   - {col}: 인코딩 실패 - {str(e)}")
    else:
        print(f"   - {col}: 스킵됨 (유니크 값: {unique_count}개)")

print("\n🔢 3) 수치형 특성 추가 변환")

# 로그 변환이 필요한 특성들 (분포가 치우친 경우)
skewed_candidates = ['impurities_total', 'size']
for col in skewed_candidates:
    if (col in feature_df.columns and 
        feature_df[col].dtype in ['int64', 'float64']):
        # 0이 아닌 값들에 대해서만 로그 변환
        if (feature_df[col] > 0).all():
            feature_df[f'{col}_log'] = np.log1p(feature_df[col])
            print(f"   - {col}_log: 로그 변환 특성 생성")

# 제곱근 변환
sqrt_candidates = ['size', 'dissolution_range'] 
for col in sqrt_candidates:
    if (col in feature_df.columns and 
        feature_df[col].dtype in ['int64', 'float64'] and 
        (feature_df[col] >= 0).all()):
        feature_df[f'{col}_sqrt'] = np.sqrt(feature_df[col])
        print(f"   - {col}_sqrt: 제곱근 변환 특성 생성")

# 상호작용 특성 (중요한 변수들 간의 곱)
interaction_pairs = [
    ('dissolution_av', 'dissolution_min'),
    ('tbl_speed_mean', 'main_CompForce mean') if 'main_CompForce mean' in feature_df.columns else None
]

interaction_pairs = [pair for pair in interaction_pairs if pair is not None]

for col1, col2 in interaction_pairs:
    if (col1 in feature_df.columns and col2 in feature_df.columns and
        feature_df[col1].dtype in ['int64', 'float64'] and
        feature_df[col2].dtype in ['int64', 'float64']):
        feature_df[f'{col1}_x_{col2}'] = feature_df[col1] * feature_df[col2]
        print(f"   - {col1}_x_{col2}: 상호작용 특성 생성")

print(f"\n✅ 특성 엔지니어링 완료")
print(f"   📊 원본 특성 수: {len(merged_clean.columns)}")
print(f"   🆕 최종 특성 수: {len(feature_df.columns)}")
print(f"   ➕ 추가된 특성 수: {len(feature_df.columns) - len(merged_clean.columns)}")

# 생성된 특성들 확인
new_features = [col for col in feature_df.columns if col not in merged_clean.columns]
if new_features:
    print(f"\n🔍 새로 생성된 특성들:")
    for i, feature in enumerate(new_features, 1):
        print(f"   {i:2d}. {feature}")

# 최종 데이터 타입 확인
print(f"\n📋 최종 데이터 타입 요약:")
dtype_counts = feature_df.dtypes.value_counts()
for dtype, count in dtype_counts.items():
    print(f"   - {dtype}: {count}개 컬럼")

🔧 1) 새로운 특성 생성
   🔍 데이터 타입 확인 중...
   - process_efficiency: 공정 효율성 지표 생성
   - quality_consistency: 품질 일관성 지표 생성
   - dissolution_range: 용출 변동성 지표 생성
   - compression_stability: 압축력 안정성 지표 생성 (main_CompForce mean, main_CompForce_sd)
   - speed_stability: 속도 안정성 지표 생성
   - api_quality_score: API 품질 종합 점수 생성 (사용된 컬럼: 3개)
   🔍 불순물 관련 컬럼: ['impurities_total', 'impurity_o', 'impurity_l', 'Total impurities', 'Impurity O', 'Impurity L']
   - impurity_o_ratio: impurity_o 비율 특성 생성
   - impurity_l_ratio: impurity_l 비율 특성 생성
   - Impurity O_ratio: Impurity O 비율 특성 생성
   - Impurity L_ratio: Impurity L 비율 특성 생성
   - tbl_speed_mean_per_1000tablets: 배치 크기 정규화 특성 생성
   - dissolution_av_per_1000tablets: 배치 크기 정규화 특성 생성
   - impurities_total_per_1000tablets: 배치 크기 정규화 특성 생성

📊 2) 범주형 변수 인코딩
   📋 인코딩 대상 범주형 변수: ['strength', 'api_water', 'api_total_impurities', 'api_l_impurity', 'api_ps01', 'api_ps05', 'api_ps09', 'weekend']
   - strength: 원-핫 인코딩 완료 (유니크 값: 4개)
   - api_water: 스킵됨 (유니크 값: 111개)
   - api_t

In [39]:
# 전처리된 데이터를 엑셀 파일로 저장
merged_clean.to_excel('preprocessed_pharmaceutical_data.xlsx', index=False)
print("✅ 전처리된 데이터가 'preprocessed_pharmaceutical_data.xlsx' 파일로 저장되었습니다.")
print(f"📊 저장된 데이터 크기: {merged_clean.shape}")

✅ 전처리된 데이터가 'preprocessed_pharmaceutical_data.xlsx' 파일로 저장되었습니다.
📊 저장된 데이터 크기: (884, 92)


In [41]:
merged_clean

Unnamed: 0,batch,code_x,strength,size,start,api_code,api_batch,smcc_batch,lactose_batch,starch_batch,api_water,api_total_impurities,api_l_impurity,api_content,api_ps01,api_ps05,api_ps09,lactose_water,lactose_sieve0045,lactose_sieve015,lactose_sieve025,smcc_water,smcc_td,smcc_bd,smcc_ps01,smcc_ps05,smcc_ps09,starch_ph,starch_water,tbl_min_thickness,tbl_max_thickness,fct_min_thickness,fct_max_thickness,tbl_min_weight,tbl_max_weight,tbl_rsd_weight,fct_rsd_weight,tbl_min_hardness,tbl_max_hardness,tbl_av_hardness,fct_min_hardness,fct_max_hardness,fct_av_hardness,tbl_max_diameter,fct_max_diameter,tbl_tensile,fct_tensile,tbl_yield,batch_yield,dissolution_av,dissolution_min,resodual_solvent,impurities_total,impurity_o,impurity_l,code_y,tbl_speed_mean,tbl_speed_change,tbl_speed_0_duration,total_waste,startup_waste,weekend,fom_mean,fom_change,SREL_startup_mean,SREL_production_mean,SREL_production_max,main_CompForce mean,main_CompForce_sd,main_CompForce_median,pre_CompForce_mean,tbl_fill_mean,tbl_fill_sd,cyl_height_mean,stiffness_mean,stiffness_max,stiffness_min,ejection_mean,ejection_max,ejection_min,Startup_tbl_fill_maxDifference,Startup_main_CompForce_mean,Startup_tbl_fill_mean,Drug release average (%),Drug release min (%),Residual solvent,Total impurities,Impurity O,Impurity L,Product code,Batch Size (tablets),Normalisation factor
0,1,25,5MG,240000,nov.18,5,2,1,2,1,1.53,0.25,0.13,94.5,1.27,18.52,109.999,0.05,17,50,82,4.251,0.45,0.33,31.156,112.141,245.499,4.4,3.012,3.3,3.4,3.4,3.4,111.0,116.0,0.92,0.72,56.84,68.60,46,37.00,56.00,62.72,6.1,6.1,1.412698,1.926183,95.785,94.697,93.83,86,0.06,0.33,0.05,0.16,25,99.864656,5.416667,149.583333,2125.416667,5085,no,49.961446,12,4.392000,3.559876,7.1,4.255404,0.058473,4.3,0.100000,5.332248,0.095938,2.099466,91.016149,103,67,223.319255,248,196,0.38,4.587500,5.466667,93.83,86.0,0.06,0.33,0.05,0.16,25,240000,2.4
2,3,25,5MG,240000,nov.18,5,2,1,2,1,1.53,0.25,0.13,94.5,1.27,18.52,109.999,0.05,17,50,82,4.251,0.45,0.33,31.156,112.141,245.499,4.4,3.012,3.3,3.4,3.4,3.4,111.0,115.0,0.83,0.78,58.80,70.56,46,39.00,57.00,65.66,6.1,6.1,1.412698,2.016473,98.496,99.242,97.33,92,0.03,0.28,0.05,0.16,25,99.985984,2.500000,83.333333,796.250000,1895,no,49.961176,6,7.200000,3.392133,8.7,4.261263,0.054522,4.3,0.004768,5.311097,0.107814,2.113004,88.967819,111,72,212.530393,248,184,0.12,4.430000,5.242000,97.33,92.0,0.03,0.28,0.05,0.16,25,240000,2.4
3,4,25,5MG,240000,nov.18,5,2,1,2,1,1.53,0.25,0.13,94.5,1.27,18.52,109.999,0.05,17,50,82,4.251,0.45,0.33,31.156,112.141,245.499,4.4,3.012,3.3,3.4,3.4,3.4,110.0,117.0,0.53,0.88,58.80,72.52,48,57.00,40.00,63.70,6.1,6.1,1.474120,1.956280,97.736,98.106,94.50,89,0.03,0.30,0.05,0.18,25,99.976868,2.916667,76.250000,695.833333,1645,no,49.960900,9,7.122222,3.416048,9.0,4.357605,0.062705,4.4,0.000000,5.309988,0.115554,2.104527,101.431138,121,83,225.938922,262,197,0.24,4.500000,5.221250,94.50,89.0,0.03,0.30,0.05,0.18,25,240000,2.4
4,5,25,5MG,240000,nov.18,5,2,1,2,1,1.53,0.25,0.13,94.5,1.27,18.52,109.999,0.05,17,50,82,4.251,0.45,0.33,31.156,112.141,245.499,4.4,3.012,3.3,3.4,3.4,3.4,112.0,115.0,0.75,0.69,56.84,68.60,47,39.00,59.00,62.72,6.1,6.1,1.443409,1.926183,98.106,98.106,92.00,88,0.04,0.31,0.05,0.18,25,99.968284,2.500000,121.250000,829.166667,1971,no,50.000000,5,14.450000,3.460359,9.8,4.249461,0.056975,4.2,0.000000,5.319629,0.103194,2.126347,108.978443,132,91,237.305389,264,205,0.19,3.960000,5.233000,92.00,88.0,0.04,0.31,0.05,0.18,25,240000,2.4
5,6,25,5MG,240000,nov.18,5,2,1,2,1,1.53,0.25,0.13,94.5,1.27,18.52,109.999,0.05,17,50,82,4.251,0.45,0.33,31.156,112.141,245.499,4.4,3.012,3.3,3.4,3.4,3.4,112.0,116.0,0.57,0.72,58.80,70.56,49,43.00,55.00,65.66,6.1,6.1,1.504831,2.016473,98.099,97.727,97.67,94,0.07,0.33,0.05,0.20,25,99.976766,2.500000,97.916667,780.833333,1853,no,49.922342,7,8.458333,3.405455,9.2,4.456000,0.061265,4.5,0.000000,5.358582,0.107666,2.120000,108.473939,130,87,235.270303,264,206,0.23,4.530000,5.300000,97.67,94.0,0.07,0.33,0.05,0.20,25,240000,2.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1001,17,20M,960000,apr.21,3,254,18,22,17,1.5,0.32,...,94.4,3,36,145,0.08,17,50,82,4.445,0.45,0.33,30.363,119.557,262.707,4.5,2.463,4.1,4.1,4.1,4.2,222.0,234.0,1.31,1.87,35.00,61.00,47,48.02,75.46,58.80,8.1,8.1,0.901424,1.100889,98.623,98.343,91.00,85,0.02,0.05,0.05,0.05,17,119.973595,1.458333,43.125000,972.395833,3564,no,79.955846,8,9.192857,6.069116,8.1,6.245474,0.083523,6.2,0.000000,6.736649,0.055184,1.819130,44.031594,56,7,192.169329,212,138,0.20,6.333333,6.523333,90.33,85.0,0.04,0.09,0.05,0.05,17,960000,9.6
1001,1002,17,20M,960000,apr.21,3,254,18,22,17,1.5,0.32,...,94.4,3,36,145,0.08,17,50,82,4.445,0.45,0.33,30.363,119.557,262.707,4.5,2.463,4.0,4.1,4.1,4.2,219.0,227.0,1.25,0.97,38.00,70.00,49,46.06,68.60,56.84,8.1,8.1,0.939783,1.064192,98.626,98.532,89.17,84,0.02,0.05,0.05,0.05,17,119.941321,2.604167,43.125000,1055.625000,4090,no,80.000000,8,5.943750,5.409258,7.4,6.149187,0.089366,6.1,0.000000,6.718435,0.099598,1.825717,42.977385,57,6,194.642756,219,137,0.13,5.987500,6.438125,90.33,85.0,0.04,0.09,0.05,0.05,17,960000,9.6
1002,1003,17,20M,960000,apr.21,3,254,18,22,17,1.5,0.32,...,94.4,3,36,145,0.08,17,50,82,4.445,0.45,0.33,30.363,119.557,262.707,4.5,2.463,4.0,4.1,4.1,4.1,221.0,228.0,1.11,0.81,38.00,63.00,49,49.98,69.58,57.82,8.1,8.1,0.939783,1.108944,98.157,98.343,90.83,87,0.02,0.05,0.05,0.05,17,119.893113,3.958333,42.812500,1456.875000,6950,no,79.978230,9,6.000000,5.131303,7.0,6.349876,0.097192,6.3,0.000000,6.738101,0.067520,1.815551,47.306766,57,7,204.655685,243,120,0.38,6.134783,6.381739,90.33,85.0,0.04,0.09,0.05,0.05,17,960000,9.6
1003,1004,17,20M,960000,apr.21,3,254,18,22,17,1.5,0.32,...,94.4,3,36,145,0.08,17,50,82,4.445,0.45,0.33,30.363,119.557,262.707,4.5,2.463,4.0,4.1,4.1,4.1,221.0,228.0,1.02,0.99,38.00,68.00,54,56.84,72.52,64.68,8.1,8.2,1.035679,1.225385,99.336,99.242,92.67,90,0.02,0.05,0.05,0.05,17,120.000000,1.145833,36.041667,404.687500,1793,no,79.964059,9,9.970000,4.698833,7.4,6.557320,0.088333,6.6,0.000000,6.657546,0.064084,1.780000,44.155233,56,7,210.176096,238,187,0.22,5.887500,6.532500,90.33,85.0,0.04,0.09,0.05,0.05,17,960000,9.6
