In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer

In [47]:
# Load the raw dataset from the Excel workbook (path is relative to the working directory).
df=pd.read_excel('dataset_filledsupplier_currency_orderday.xlsx')

In [48]:
# Summarize column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24621 entries, 0 to 24620
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   청구서번호        24621 non-null  object 
 1   No.          24621 non-null  int64  
 2   Subject      24599 non-null  object 
 3   Machinery    24621 non-null  object 
 4   Assembly     24621 non-null  object 
 5   청구품목         24621 non-null  object 
 6   Unnamed: 6   0 non-null      float64
 7   Part No.1    24602 non-null  object 
 8   Part No.2    3592 non-null   object 
 9   청구량          24517 non-null  float64
 10  견적           24171 non-null  object 
 11  견적수량         24517 non-null  float64
 12  견적화폐         24621 non-null  object 
 13  견적단가         24621 non-null  float64
 14  발주번호         24621 non-null  object 
 15  발주처          24621 non-null  object 
 16  발주           24621 non-null  object 
 17  발주수량         24621 non-null  int64  
 18  발주금액         24621 non-null  float64
 19  D/T 

In [49]:
# Number of distinct values in the supplier ('발주처') column.
print(len(df['발주처'].unique()))

81


## 클리닝

In [50]:
# Rows that have an order date but no pending-period, warehouse-receipt or
# ship-receipt value are removed from the working frame.
has_order_date = df['발주'].notna()               # order date is present
no_pending_period = df['미입고 기간'].isna()       # pending period is empty
no_warehouse_receipt = df['창고입고'].isna()       # warehouse receipt is empty
no_ship_receipt = df['선박입고'].isna()            # ship receipt is empty

missing_conditions = df.loc[
    has_order_date & no_pending_period & no_warehouse_receipt & no_ship_receipt
]
n_missing = len(missing_conditions)

print(f"발주 일자는 있지만 미입고 기간, 창고 입고, 선박 입고도 없는 경우: {n_missing}개")
df = df.drop(index=missing_conditions.index)

print(f"삭제된 행의 개수: {n_missing}개")
print(f"남은 데이터프레임의 크기: {df.shape}")

발주 일자는 있지만 미입고 기간, 창고 입고, 선박 입고도 없는 경우: 1699개
삭제된 행의 개수: 1699개
남은 데이터프레임의 크기: (22922, 32)


In [51]:
# Rows with no warehouse-receipt date but an explicit pending period; these
# are to be handled through the pending period.
no_receipt_but_pending = df['창고입고'].isna() & df['미입고 기간'].notna()
missing_both = df.loc[no_receipt_but_pending]

print(f"창고 입고일은 없고 미입고 기간은 명시되어 있어 미입고 기간으로 분류해야 할 경우 : {len(missing_both)}개")

창고 입고일은 없고 미입고 기간은 명시되어 있어 미입고 기간으로 분류해야 할 경우 : 1620개


In [59]:
# Keep only rows without a pending period. The explicit copy makes `df` an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning or write to a temporary object.
df = df[df['미입고 기간'].isnull()].copy()

# Parse both date columns; unparseable values become NaT instead of raising.
df['발주'] = pd.to_datetime(df['발주'], errors='coerce')
df['창고입고'] = pd.to_datetime(df['창고입고'], errors='coerce')

# Lead time in whole days from order date to warehouse receipt.
lead_time_days = (df['창고입고'] - df['발주']).dt.days
# Same-day receipts (0 days) are counted as 1 day.
df['리드타임'] = lead_time_days.mask(lead_time_days == 0, 1)

# NOTE(review): the strict `> 1` bound drops every 1-day row, including the
# 0-day rows that were just mapped to 1 — confirm whether `>= 1` was intended.
# Rows whose lead time is missing (NaT dates) fail both comparisons and are
# removed here as well, which makes the dropna below a safety net only.
df = df[(df['리드타임'] > 1) & (df['리드타임'] < 200)]
df = df.dropna(subset=['창고입고']).copy()

In [61]:
# Preview the parsed order / warehouse-receipt dates and print how many receipt dates are still missing.
print(df[['발주', '창고입고']].head(), df['창고입고'].isnull().sum())

          발주       창고입고
0 2019-01-11 2019-05-03
1 2019-01-11 2019-04-18
2 2019-01-11 2019-05-03
5 2019-06-03 2019-06-15
6 2019-06-03 2019-06-15 0


In [41]:
# List the columns currently present in the frame.
print(df.columns)
df['리드타임'] = df['리드타임'].astype(float)  # cast the lead time to float (dtype conversion where needed)


Index(['청구서번호', 'No.', 'Subject', 'Machinery', 'Assembly', '청구품목',
       'Unnamed: 6', 'Part No.1', 'Part No.2', '청구량', '견적', '견적수량', '견적화폐',
       '견적단가', '발주번호', '발주처', '발주', '발주수량', '발주금액', 'D/T', '미입고 기간', '창고입고',
       '창고입고수량', 'Control No.', '입고창고', '창고출고', '창고출고수량', '출고선박', '출고운반선',
       '선박입고', '선박입고수량', '완료 여부', 'cleaned_machinery', 'cleaned_assembly',
       'cleaned_item', 'machinery_assembly', '리드타임', 'machinery_avg_leadtime',
       'assembly_avg_leadtime', '리드타임_supplier_avg', '리드타임_avg'],
      dtype='object')


In [66]:

# Public-holiday list for 2019 through 2022.
# NOTE(review): the list looks hand-maintained and may be incomplete (for
# example, it contains no April dates for any year) — verify it against an
# official holiday calendar before relying on the working-day counts.
holidays = [
    '2019-01-01', '2019-02-04', '2019-02-05', '2019-02-06', '2019-03-01', 
    '2019-05-05', '2019-05-06', '2019-06-06', '2019-08-15', '2019-09-12', 
    '2019-09-13', '2019-10-03', '2019-10-09', '2019-12-25',
    '2020-01-01', '2020-01-24', '2020-01-25', '2020-01-26', '2020-03-01', 
    '2020-05-05', '2020-06-06', '2020-08-15', '2020-10-01', '2020-10-02', 
    '2020-10-03', '2020-10-09', '2020-12-25',
    '2021-01-01', '2021-02-11', '2021-02-12', '2021-02-13', '2021-03-01',
    '2021-05-05', '2021-06-06', '2021-08-15', '2021-09-20', '2021-09-21',
    '2021-09-22', '2021-10-03', '2021-10-09', '2021-12-25',
    '2022-01-01', '2022-01-31', '2022-02-01', '2022-02-02', '2022-03-01',
    '2022-05-05', '2022-06-06', '2022-08-15', '2022-09-09', '2022-09-10',
    '2022-09-11', '2022-10-03', '2022-10-09', '2022-12-25'
]
# Parse the date strings into pandas datetimes.
holidays = pd.to_datetime(holidays)

def calculate_working_days(row, holidays):
    """Count business days between a row's order date and warehouse-receipt date.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide '발주' (order date) and '창고입고' (warehouse receipt)
        entries that expose a ``.date()`` method, such as ``pd.Timestamp``.
    holidays : array-like of dates
        Additional non-working days. Date strings, a ``pd.DatetimeIndex`` and
        ``datetime64`` arrays of any resolution are accepted.

    Returns
    -------
    int or float
        The ``np.busday_count`` result for the half-open interval
        ``[발주, 창고입고)``, or ``np.nan`` when either date is missing or is
        not a date-like object.

    Raises
    ------
    ValueError, TypeError
        If ``holidays`` cannot be converted to dates. This is a configuration
        error and is deliberately not turned into NaN.
    """
    if pd.isna(row['발주']) or pd.isna(row['창고입고']):
        return np.nan

    # np.busday_count needs day-resolution holidays. Finer-resolution input,
    # such as a pandas DatetimeIndex, is rejected; the former blanket
    # `except Exception` swallowed that error and returned NaN for every row.
    holidays = np.asarray(pd.to_datetime(holidays)).astype('datetime64[D]')

    try:
        start, end = row['발주'].date(), row['창고입고'].date()
    except AttributeError:
        # The value is present but not date-like (e.g. an unparsed string).
        return np.nan

    return np.busday_count(start, end, holidays=holidays)

# Compute the working-day count for every row.
df['working_days'] = df.apply(
    lambda order_row: calculate_working_days(order_row, holidays),
    axis=1,
)
print(df.shape)

(19330, 34)


In [71]:
# Extract the calendar month and the weekday from the warehouse-receipt date.
# NOTE(review): the receipt date is order date + lead time, so features built
# from it may leak the prediction target — confirm whether they should be
# derived from the order date ('발주') instead.
receipt_dates = df['창고입고']
df['month'] = receipt_dates.dt.month
df['day_of_week'] = receipt_dates.dt.dayofweek
# One-hot encode both columns into indicator features, dropping the first level of each.
calendar_columns = ['month', 'day_of_week']
df = pd.get_dummies(df, columns=calendar_columns, drop_first=True)
# Season lookup (Mar-May: 봄, Jun-Aug: 여름, Sep-Nov: 가을, Dec-Feb: 겨울)
def get_season(date):
    """Return the season label for a date.

    Parameters
    ----------
    date : object with a ``.month`` attribute (e.g. ``pd.Timestamp``)

    Returns
    -------
    str or float
        '봄', '여름', '가을' or '겨울'; ``np.nan`` when ``date`` is missing.
        A missing date is reported as missing because ``NaT.month`` is NaN,
        which previously fell through to the final branch and was labelled
        '겨울'.
    """
    if pd.isna(date):
        return np.nan

    month = date.month
    if month in (3, 4, 5):
        return '봄'
    if month in (6, 7, 8):
        return '여름'
    if month in (9, 10, 11):
        return '가을'
    return '겨울'

# Label each warehouse-receipt date with its season.
df['season'] = df['창고입고'].map(get_season)

# One-hot encode 'season' into indicator columns, dropping the first level.
season_columns = ['season']
df = pd.get_dummies(df, columns=season_columns, drop_first=True)

In [67]:
import re

def preprocess_text(text):
    """Normalize a free-text label so equal items map to the same key.

    Steps, in order: lowercase; drop parenthesised segments; drop every
    character that is not a word character, whitespace or one of
    ``* / - + . , # &``; drop the standalone tokens '사용금지' and '사';
    collapse whitespace runs to one space; strip the ends.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The normalized text (possibly empty).
    """
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^\w\s\*/\-\+.,#&]', '', text)
    text = re.sub(r'\b(사용금지|사)\b', '', text)
    # Collapse whitespace only after all removals: a token removed from the
    # middle of the text would otherwise leave a double space, and the result
    # is used as a grouping key downstream.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def clean_supplier_name(name):
    """Normalize a supplier name by removing legal-form suffixes and punctuation.

    Steps, in order: lowercase; fix known misspellings of "corporation"; drop
    the '(사용금지)' marker; rewrite 'u.s.a' as '_usa'; drop dots; drop
    legal-form tokens; drop characters other than word characters, whitespace
    and '-'; collapse whitespace and strip.

    Latin legal-form tokens are removed only when they form a whole word
    (runs such as 'coltd', left after dot removal, count as one word).
    Previously they were matched as bare substrings, so 'co' and 'inc' were
    cut out of the middle of names (e.g. "costco" became "st") and 'pte ltd'
    could never match because 'ltd' was tried first. The single-character
    Korean marker '주' is likewise removed only when it stands alone, as in
    '(주)'.

    Parameters
    ----------
    name : str

    Returns
    -------
    str
        The normalized supplier name (possibly empty).
    """
    name = name.lower()
    name = re.sub(r'coporation|coropration|coproration|corporration', 'corporation', name)
    name = re.sub(r'\(사용금지\)', '', name)
    name = re.sub(r'u\.s\.a', '_usa', name)
    name = re.sub(r'\.', '', name)

    # Whole-word Latin suffixes; the multi-word form is listed first so it wins.
    latin_suffixes = (
        r'\b(?:pte\s+ltd|corporation|incorporated|company|limited|'
        r'corp|gmbh|inc|ltd|llc|co)+\b'
    )
    name = re.sub(latin_suffixes, '', name)

    # Korean legal-form words are written attached to the name, so they stay
    # substring matches; only the bare '주' requires word boundaries.
    korean_suffixes = r'주식회사|엔지니어링|상사|공사|\b주\b'
    name = re.sub(korean_suffixes, '', name)

    name = re.sub(r'[^\w\s-]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name



In [68]:

# Raw text columns and the cleaned columns derived from them, in matching order.
text_columns = ['Machinery', 'Assembly', '청구품목']
cleaned_columns = ['cleaned_machinery', 'cleaned_assembly', 'cleaned_item']

# Cast every raw column to str first, then normalize each one with preprocess_text.
for col in text_columns:
    df[col] = df[col].astype(str)
for col, cleaned_col in zip(text_columns, cleaned_columns):
    df[cleaned_col] = df[col].apply(preprocess_text)

In [73]:
# Mean lead time per cleaned machinery name (lookup table kept for inspection).
machinery_avg_leadtime = (
    df.groupby('cleaned_machinery')['리드타임']
    .mean()
    .rename('machinery_avg_leadtime')
    .reset_index()
)

# Attach the group mean to every row. Assigning the column (instead of merging)
# keeps this cell idempotent: a re-run overwrites the column rather than
# creating 'machinery_avg_leadtime_x' / 'machinery_avg_leadtime_y' duplicates.
# NOTE(review): the mean covers all rows, including each row's own target and
# rows that later land in the test split, so the feature leaks the target —
# consider computing it from the training split only.
df = df.reset_index(drop=True)  # same RangeIndex the former left-merge produced
df['machinery_avg_leadtime'] = df.groupby('cleaned_machinery')['리드타임'].transform('mean')

# Mean lead time per assembly (currently disabled)
#assembly_avg_leadtime = df.groupby('cleaned_assembly')['리드타임'].mean().reset_index()
#assembly_avg_leadtime.rename(columns={'리드타임': 'assembly_avg_leadtime'}, inplace=True)

# Add the per-assembly mean to the frame (currently disabled)
#df = df.merge(assembly_avg_leadtime, on='cleaned_assembly', how='left')

In [74]:
# Composite key: cleaned machinery name joined with cleaned assembly name.
df['machinery_assembly'] = df['cleaned_machinery'] + " & " + df['cleaned_assembly']

# Mean lead time per machinery/assembly combination (lookup table kept for inspection).
assembly_machinery_avg_leadtime = df.groupby('machinery_assembly')['리드타임'].mean().reset_index()

# Attach the group mean as '리드타임_avg'. Assigning the column keeps the cell
# idempotent; re-running the former suffix-based merge collided with the
# '리드타임_avg' column that already existed.
# NOTE(review): the mean is computed over all rows before the train/test
# split, so it includes the target of held-out rows.
df = df.reset_index(drop=True)  # same RangeIndex the former left-merge produced
df['리드타임_avg'] = df.groupby('machinery_assembly')['리드타임'].transform('mean')

In [75]:
# Mean lead time per supplier (lookup table kept for inspection).
supplier_leadtime_avg = df.groupby('발주처')['리드타임'].mean().reset_index()

# Attach the supplier mean as '리드타임_supplier_avg'. Assigning the column keeps
# the cell idempotent, unlike the former suffix-based merge, which collided
# with the existing column when re-run.
# NOTE(review): the mean is computed over all rows before the train/test
# split, so it includes the target of held-out rows.
df = df.reset_index(drop=True)  # same RangeIndex the former left-merge produced
df['리드타임_supplier_avg'] = df.groupby('발주처')['리드타임'].transform('mean')

In [40]:
# Show the raw text, the lead time and the three averaged lead-time features side by side.
df[['Machinery','Assembly','리드타임', '발주처', 'machinery_avg_leadtime','리드타임_supplier_avg', '리드타임_avg']].head(20)

Unnamed: 0,Machinery,Assembly,리드타임,발주처,machinery_avg_leadtime,리드타임_supplier_avg,리드타임_avg
0,CARGO BOOM VANG BLOCK (STBD 하),BLOCK,112.0,MATSUI(U.S.A) COROPRATION,98.333333,90.609616,102.0
1,SPANISH BOOM VANG BLOCK (PORT 상),BLOCK,97.0,MATSUI(U.S.A) COROPRATION,72.0,90.609616,63.75
2,PURSE BLOCK,TOW BLOCK,112.0,MATSUI(U.S.A) COROPRATION,100.0,90.609616,91.333333
3,NET,H-EX,12.0,KTI,16.627119,18.484472,16.642857
4,NET,NYLON,12.0,KTI,16.627119,18.484472,12.190476
5,NET,NYLON,12.0,KTI,16.627119,18.484472,12.190476
6,NET,NYLON,12.0,KTI,16.627119,18.484472,12.190476
7,NET,NYLON,12.0,KTI,16.627119,18.484472,12.190476
8,NET,NYLON,12.0,KTI,16.627119,18.484472,12.190476
9,NET,H-EX,12.0,KTI,16.627119,18.484472,16.642857


In [76]:
# Print the first five rows of the enriched frame as plain text.
print(df.head())


                청구서번호  No.                                  Subject  \
0  COK-BS-DSP-1901004    1                     COK-F-DECK-190104-01   
1  COK-BS-DSP-1901004    2                     COK-F-DECK-190104-01   
2  COK-BS-DSP-1901004    3                     COK-F-DECK-190104-01   
3  COK-BS-DSP-1906004    1  COK-D-DECK-190527-02, NET 및 SAMSON ROPE   
4  COK-BS-DSP-1906004    2  COK-D-DECK-190527-02, NET 및 SAMSON ROPE   

                          Machinery   Assembly  \
0    CARGO BOOM VANG BLOCK (STBD 하)      BLOCK   
1  SPANISH BOOM VANG BLOCK (PORT 상)      BLOCK   
2                       PURSE BLOCK  TOW BLOCK   
3                               NET       H-EX   
4                               NET      NYLON   

                                               청구품목  Unnamed: 6  \
0  MCKISSICK CONSTRUCTION BLOCKS (WIRE SIZE : 5/8")         NaN   
1  MCKISSICK CONSTRUCTION BLOCKS (WIRE SIZE : 5/8")         NaN   
2                            WESTEC 20TON TOW BLOCK         NaN   
3   

### 리드타임 예측 (회귀)
1. 텍스트 칼럼(Machinery, Assembly) Word2Vec 임베딩 및 결합
2. ( 수치형 데이터(견적단가 및 발주량) Scaling )
3. 범주형 데이터(견적화폐) onehotEncoding
4. 모델의 입력: 1~3 데이터 결합 => 리드타임 예측

In [77]:
from gensim.models import Word2Vec

# Tokenize the cleaned text: every row becomes a list of whitespace-separated words.
machinery_sentences = list(map(str.split, df['cleaned_machinery']))
assembly_sentences = list(map(str.split, df['cleaned_assembly']))
#item_sentences = [text.split() for text in df['cleaned_item']]

# Train one Word2Vec model per text column with identical hyper-parameters.
# NOTE(review): with workers=4 the learned vectors are presumably not
# reproducible bit-for-bit between runs — confirm against the gensim docs.
word2vec_params = dict(vector_size=100, window=5, min_count=1, workers=4)
word2vec_machinery = Word2Vec(sentences=machinery_sentences, **word2vec_params)
word2vec_assembly = Word2Vec(sentences=assembly_sentences, **word2vec_params)
#word2vec_item = Word2Vec(sentences=item_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Sentence embedding: the mean of the word vectors of its known tokens.
def sentence_vector(sentence, model):
    """Average the word vectors of the tokens in ``sentence``.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to embed.
    model : object
        Must expose ``wv`` (supporting ``token in wv`` and ``wv[token]``)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when none is found.
    """
    known_vectors = [model.wv[token] for token in sentence if token in model.wv]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every tokenized row with the model trained on its own column.
machinery_vectors = np.array(
    [sentence_vector(tokens, word2vec_machinery) for tokens in machinery_sentences]
)
assembly_vectors = np.array(
    [sentence_vector(tokens, word2vec_assembly) for tokens in assembly_sentences]
)
#item_vectors = np.array([sentence_vector(sentence, word2vec_item) for sentence in item_sentences])

# Report the resulting matrix shapes.
print(f"Word2Vec Machinery Shape: {machinery_vectors.shape}")
print(f"Word2Vec Assembly Shape: {assembly_vectors.shape}")
#print(f"Word2Vec Item Shape: {item_vectors.shape}")

Word2Vec Machinery Shape: (19330, 100)
Word2Vec Assembly Shape: (19330, 100)


In [17]:
#!conda install conda-forge::category_encoders -y

In [78]:
from sklearn.preprocessing import MinMaxScaler

# 1. Scale the numeric features to the [0, 1] range.
scaler = MinMaxScaler()

# Averaged lead-time features. The machinery column is referenced by the name
# that a single run of the averaging cell produces. The former
# 'machinery_avg_leadtime_y' only exists after that merge cell has been run
# twice, so it raised KeyError on Restart & Run All.
numerical_columns = ['machinery_avg_leadtime', '리드타임_avg', '리드타임_supplier_avg']
numerical_features = df[numerical_columns].values
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# consider fitting it on the training rows only.
scaled_numerical_features = scaler.fit_transform(numerical_features)

In [79]:
# 2. Weekday and season indicator columns (already dummy-encoded upstream),
# selected by column-name substring.
day_of_week_columns = [name for name in df.columns if 'day_of_week_' in name]
season_columns = [name for name in df.columns if 'season_' in name]
day_of_week_features = df[day_of_week_columns].to_numpy()
season_features = df[season_columns].to_numpy()

In [80]:

# 3. One-hot encode the quote currency ('견적화폐') as a dense array.
currency_ohe = OneHotEncoder(sparse_output=False)
currency_column = df[['견적화폐']]
currency_encoded = currency_ohe.fit(currency_column).transform(currency_column)


### 데이터 분할


In [85]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
import numpy as np
from scipy.sparse import hstack

# Report the shapes of the two embedding matrices.
print(f"Shape of word2vec_machinery: {machinery_vectors.shape}")
print(f"Shape of word2vec_assembly: {assembly_vectors.shape}")

# Concatenate the two embedding matrices column-wise.
word2vec_combined = np.hstack((machinery_vectors, assembly_vectors))

print(f"Shape of word2vec_combined: {word2vec_combined.shape}")

# Feature blocks in their final column order
# (numeric + weekday/season one-hot + currency one-hot + Word2Vec embeddings).
feature_blocks = {
    'scaled_numerical_features': scaled_numerical_features,
    'day_of_week_features': day_of_week_features,
    'season_features': season_features,
    'currency_encoded': currency_encoded,
    'word2vec_combined': word2vec_combined,
}

# Fail loudly on a row-count mismatch. The former `if` without an `else`
# skipped the assignment silently, leaving `X` undefined or stale from an
# earlier execution of this cell.
row_counts = {name: block.shape[0] for name, block in feature_blocks.items()}
if len(set(row_counts.values())) != 1:
    raise ValueError(f"Feature blocks have mismatched row counts: {row_counts}")

X = np.hstack(tuple(feature_blocks.values()))

print(f"Shape of X: {X.shape}")
    

Shape of word2vec_machinery: (19330, 100)
Shape of word2vec_assembly: (19330, 100)
Shape of word2vec_combined: (19330, 200)
Shape of X: (19330, 217)


In [86]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
# Target vector: the lead time column.
y = df['리드타임'].to_numpy()

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [87]:
# Report the shapes of the train and test splits.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (15464, 217)
y_train shape: (15464,)
X_test shape: (3866, 217)
y_test shape: (3866,)


In [88]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random-forest regressor with default hyper-parameters and a fixed seed.
model_rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the lead time for the held-out rows.
predictions = model_rf.predict(X_test)

In [90]:
# 10. 모델 평가
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the held-out predictions with MSE, MAE and R^2.
# NOTE(review): several features are group means of the target computed over
# all rows (train and test), so these scores are likely optimistic.
mse, mae, r2 = (
    metric(y_test, predictions)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error (MSE): 442.5089678678581
Mean Absolute Error (MAE): 9.19210051895314
R^2 Score: 0.8343523055269169
