In [164]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from category_encoders import BinaryEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score


font_path = '../Font/GmarketSansTTFMedium.ttf'
font_name = fm.FontProperties(fname=font_path).get_name()
plt.rc('font', family=font_name)

plt.rcParams['axes.unicode_minus'] = False

In [165]:
seoul = pd.read_csv('../Data/preprocessing/seoul_real_transcation_price.csv')

In [166]:
seoul.head(5)

Unnamed: 0,시도명,시군구명,관리기관,회사명,공장구분,단지명,설립구분,입주형태,보유구분,등록구분,...,지식산업센터명,대표업종,업종명,업종코드,차수,법인주소,필지수,공장주소,공장주소_지번,공장관리번호
0,서울특별시,종로구,서울특별시 종로구,남일문화 (주),개별,,일반,해당없음,임대,신규등록,...,,18111,경 인쇄업 외 2 종,181111811318119,10,서울특별시 종로구 자하문로16길 8 (창성동),1,서울특별시 종로구 자하문로16길 8 (창성동),서울특별시 종로구 창성동 87-1번지,111000000000000.0
1,서울특별시,종로구,서울특별시 종로구,(주)에취.알.디,개별,,일반,해당없음,임대,등록변경,...,,33932,전시용 모형 제조업,33932,10,서울특별시 종로구 통의동 35-69번지,1,서울특별시 종로구 통의동 35-69번지,서울특별시 종로구 통의동 35-69번지,111000000000000.0
2,서울특별시,종로구,서울특별시 종로구,(주)코리아쉬핑가제트,개별,,일반,해당없음,자가,등록변경,...,,58113,일반 서적 출판업 외 3 종,58113181111811300000,10,서울특별시 도봉구 창동 181-17번지 쌍용아파트 115-1902,1,서울특별시 종로구 자하문로2길 13-3 (통의동),서울특별시 종로구 통의동 35-6번지,111000000000000.0
3,서울특별시,종로구,서울특별시 종로구,삼영DP,개별,,일반,해당없음,임대,신규등록,...,,18111,경 인쇄업 외 2 종,181111811318119,10,서울특별시 종로구 사직로 125 (적선동),1,서울특별시 종로구 사직로 125 (적선동),서울특별시 종로구 적선동 107-1번지,111000000000000.0
4,서울특별시,종로구,서울특별시 종로구,삼진기획,개별,,일반,해당없음,임대,등록변경,...,,18119,기타 인쇄업 외 1 종,1811918113,10,서울특별시 서대문구 홍제동 -번지 문화촌현대아파트 103-1101호,1,서울특별시 종로구 적선동 2번지,서울특별시 종로구 적선동 2번지,111000000000000.0


In [167]:
seoul.columns

Index(['시도명', '시군구명', '관리기관', '회사명', '공장구분', '단지명', '설립구분', '입주형태', '보유구분',
       '등록구분', '전화번호', '남자종업원', '여자종업원', '외국인남자종업원', '외국인여자종업원', '종업원합계',
       '생산품', '원자재', '공장규모', '용도지역', '지목', '용지면적', '제조시설면적', '부대시설면적', '건축면적',
       '지식산업센터명', '대표업종', '업종명', '업종코드', '차수', '법인주소', '필지수', '공장주소',
       '공장주소_지번', '공장관리번호'],
      dtype='object')

In [168]:
seoul = seoul.drop(columns=['시도명', '시군구명', '관리기관', '회사명', '단지명', '전화번호', '법인주소', '필지수', '공장주소', '공장관리번호']) #1차
seoul = seoul.drop(columns=['생산품', '원자재', '업종명', '업종코드', '차수']) #2차

### 공장주소_지번을 시, 구, 동으로 구분하기

In [169]:
address = seoul['공장주소_지번'].str.split(n=2, expand=True)
address[2] = address[2].str.split().str[0] #남은 부분에서 첫번째만 분리

print(address)

address.columns = ['시', '구', '동']
seoul = pd.concat([seoul, address[['시', '구', '동']]], axis=1)
seoul = seoul.drop(columns=['공장주소_지번', '시', '동'])


           0    1    2
0      서울특별시  종로구  창성동
1      서울특별시  종로구  통의동
2      서울특별시  종로구  통의동
3      서울특별시  종로구  적선동
4      서울특별시  종로구  적선동
...      ...  ...  ...
11628  서울특별시  강동구  천호동
11629  서울특별시  강동구  천호동
11630  서울특별시  강동구  천호동
11631  서울특별시  강동구  천호동
11632  서울특별시  강동구  천호동

[11633 rows x 3 columns]


In [170]:
object_columns = ['공장구분', '설립구분', '입주형태', '보유구분', '등록구분', '공장규모', '용도지역', '지목', '지식산업센터명', '대표업종', '구']
int_columns = ['남자종업원', '여자종업원', '외국인남자종업원', '외국인여자종업원', '종업원합계','용지면적', '제조시설면적', '부대시설면적', '건축면적']

In [171]:
# object 타입 변환
for column in object_columns:
    seoul[column] = seoul[column].astype('object')

for column in int_columns:
    seoul[column] = seoul[column].astype('int')

In [172]:
seoul['지목'] = seoul['지목'].str.strip() # 공백제거
seoul['지목'].replace('', '해당없음', inplace=True)
seoul['지목'].fillna('해당없음', inplace=True)

In [173]:
seoul['지식산업센터명'] = seoul['지식산업센터명'].str.strip() # 공백제거
seoul['지식산업센터명'].replace('', '해당없음', inplace=True)
seoul['지식산업센터명'].fillna('해당없음', inplace=True)

In [174]:
label_encoders = {}
for column in seoul.select_dtypes(include=['object']).columns:
    if column != '구':
        label_encoders[column] = LabelEncoder()
        seoul[column] = label_encoders[column].fit_transform(seoul[column])

# '구' 칼럼 라벨 인코딩
label_encoder_gu = LabelEncoder()
seoul['구'] = label_encoder_gu.fit_transform(seoul['구'])


In [175]:
X = seoul.drop(columns=['구'])
y = seoul['구']

In [176]:
print(X.shape, y.shape)

(11633, 19) (11633,)


In [177]:
X

Unnamed: 0,공장구분,설립구분,입주형태,보유구분,등록구분,남자종업원,여자종업원,외국인남자종업원,외국인여자종업원,종업원합계,공장규모,용도지역,지목,용지면적,제조시설면적,부대시설면적,건축면적,지식산업센터명,대표업종
0,0,1,9,0,2,0,0,0,0,0,1,57,2,238,57,69,126,223,105
1,0,1,9,0,0,5,2,0,0,7,1,78,2,157,79,78,157,223,396
2,0,1,9,1,0,12,12,0,0,24,1,78,2,160,92,67,160,223,422
3,0,1,9,0,2,0,0,0,0,0,1,59,2,280,35,88,123,223,105
4,0,1,9,0,0,7,2,0,0,9,1,34,2,122,159,18,177,223,108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11628,0,1,6,0,0,1,4,0,0,5,1,58,2,104,104,0,104,223,68
11629,0,1,9,1,0,0,0,0,0,0,1,78,2,164,110,53,164,223,384
11630,0,1,6,0,0,7,0,0,0,7,1,58,2,125,114,11,125,223,77
11631,0,1,6,0,2,2,3,0,0,5,1,74,2,104,104,0,104,223,42


In [178]:
y.value_counts()

7     3455
15    1357
6     1215
23    1190
19     845
3      642
17     428
24     341
10     265
0      228
14     208
5      174
22     166
9      155
1      129
18     108
8      105
16     103
12     102
21      94
2       89
20      77
4       65
13      52
11      40
Name: 구, dtype: int64

In [179]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)

(7910, 19) (1978, 19) (1745, 19) (7910,) (1978,) (1745,)


In [180]:
## label encoding
from sklearn.preprocessing import LabelEncoder
categorical_features = list(X_train.dtypes[X_train.dtypes == 'object'.index])

# 라벨 인코딩 호기화
le = LabelEncoder()

# Label encoding 적용
for feature in categorical_features:
    X_train[feature] = le.fit_transform(X_train[feature])
    X_val[feature] = le.fit_transform(X_val[feature])
    X_test[feature] = le.fit_transform(X_test[feature])

In [181]:
print(X_train.info(), X.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7910 entries, 2548 to 5416
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   공장구분      7910 non-null   int32
 1   설립구분      7910 non-null   int32
 2   입주형태      7910 non-null   int32
 3   보유구분      7910 non-null   int32
 4   등록구분      7910 non-null   int32
 5   남자종업원     7910 non-null   int32
 6   여자종업원     7910 non-null   int32
 7   외국인남자종업원  7910 non-null   int32
 8   외국인여자종업원  7910 non-null   int32
 9   종업원합계     7910 non-null   int32
 10  공장규모      7910 non-null   int32
 11  용도지역      7910 non-null   int32
 12  지목        7910 non-null   int32
 13  용지면적      7910 non-null   int32
 14  제조시설면적    7910 non-null   int32
 15  부대시설면적    7910 non-null   int32
 16  건축면적      7910 non-null   int32
 17  지식산업센터명   7910 non-null   int32
 18  대표업종      7910 non-null   int32
dtypes: int32(19)
memory usage: 648.9 KB
None 공장구분        0
설립구분        0
입주형태        0
보유구분        0
등록구분       

In [182]:
X_train

Unnamed: 0,공장구분,설립구분,입주형태,보유구분,등록구분,남자종업원,여자종업원,외국인남자종업원,외국인여자종업원,종업원합계,공장규모,용도지역,지목,용지면적,제조시설면적,부대시설면적,건축면적,지식산업센터명,대표업종
2548,0,1,9,1,2,10,1,2,0,13,1,9,2,495,245,54,299,223,165
5896,1,0,5,1,3,32,6,1,0,39,1,9,0,208,816,636,1452,96,250
2660,0,1,0,0,0,5,12,0,0,17,1,11,2,904,72,26,98,223,65
8386,1,3,5,1,3,2,9,0,0,11,1,9,0,66,271,175,446,172,62
3536,0,1,6,0,2,4,0,0,0,4,1,58,2,0,131,160,292,223,268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4166,0,1,6,0,0,1,2,0,0,3,1,1,2,170,22,148,170,223,39
9825,0,1,9,0,3,24,1,0,0,25,1,9,0,601,412,220,632,223,347
4383,0,1,9,1,3,11,2,0,0,13,1,9,2,359,0,110,110,223,229
1315,0,1,6,0,2,1,1,0,0,2,1,34,0,0,65,0,65,69,64


In [183]:
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
import os

In [184]:
# 최적 파라미터 설정
best_params = {
    'num_leaves': 97,
    'learning_rate': 0.06291686692582928,
    'n_estimators': 471,
    'subsample': 0.5148941824713373,
    'colsample_bytree': 0.6947463127096882,
    'min_split_gain': 0.33450025083007257,
    'min_child_samples': 69,
    'reg_alpha': 0.40420315890792746,
    'reg_lambda': 3.172929043100316
}

# LightGBM 모델 초기화
model = lgb.LGBMClassifier(
    num_leaves=best_params['num_leaves'],
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    min_split_gain=best_params['min_split_gain'],
    min_child_samples=best_params['min_child_samples'],
    reg_alpha=best_params['reg_alpha'],
    reg_lambda=best_params['reg_lambda']
)

# 모델 훈련
model.fit(X_train, y_train)


In [189]:
# 성능이 가장 좋은 모델 저장
import joblib

#모델저장
model_filename = 'best_lgbm_model.pkl'
joblib.dump(model, model_filename)

print('모델 저장 완료')

모델 저장 완료


In [185]:
#train, val 시각화 함수
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 결과 시각화 함수
def evaluate_model(model, X_train, y_train, X_val, y_val):
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)

    accuracy_train = accuracy_score(y_train, pred_train)
    accuracy_val = accuracy_score(y_val, pred_val)
    f1_train = f1_score(y_train, pred_train, average='macro')
    f1_val = f1_score(y_val, pred_val, average='macro')

    print("Model Performance Evaluation:\n")
    print(f"{'Metric'.ljust(15)}\tTrain\t\tValidation")
    print("-" * 40)
    print(f"{'Accuracy'.ljust(15)}\t{accuracy_train:.4f}\t\t{accuracy_val:.4f}")
    print(f"{'F1 Score'.ljust(15)}\t{f1_train:.4f}\t\t{f1_val:.4f}")

In [186]:
#모델 test평가
evaluate_model(model, X_train, y_train, X_val, y_val)

Model Performance Evaluation:

Metric         	Train		Validation
----------------------------------------
Accuracy       	0.9029		0.6881
F1 Score       	0.8131		0.3652


In [187]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 단일 모델 평가 함수
def evaluate_model(model, X_train, y_train, X_val, y_val):
    print("Model Performance Evaluation:\n")
    header = f"{'Metric'.ljust(15)}\tTrain\t\tVal"
    print(header)
    print("-" * len(header))

    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)

    accuracy_train = accuracy_score(y_train, pred_train)
    accuracy_val = accuracy_score(y_val, pred_val)
    f1_train = f1_score(y_train, pred_train, average='macro')
    f1_val = f1_score(y_val, pred_val, average='macro')

    print(f"{'Accuracy'.ljust(15)}\t{accuracy_train:.4f}\t\t{accuracy_val:.4f}")
    print(f"{'F1 Score'.ljust(15)}\t{f1_train:.4f}\t\t{f1_val:.4f}")

