In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import csv

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import RandomizedSearchCV

# 업샘플링에 사용되는 라이브러리
from imblearn.over_sampling import ADASYN

# Load the prepared dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    '사용할 데이터.csv',
    index_col=0,
)

In [2]:
# Work on a copy so the frame loaded from disk is left untouched.
dt = data.copy()

# Target column: multiplying '전세가율' by 0 gives 0 for every row while keeping
# that column's dtype and index. A missing '전세가율' stays NaN rather than 0.
dt['역전세 위험'] = dt['전세가율'].mul(0)

# A row is flagged (label 1) when '전세가율' is strictly above 70 AND
# '전세가 변화율' is strictly negative.
is_at_risk = dt['전세가율'].gt(70) & dt['전세가 변화율'].lt(0)
dt.loc[is_at_risk, '역전세 위험'] = 1

In [3]:
# Split every '건물' string on whitespace once and reuse the token lists for
# all four derived columns; the split is the expensive step on a large frame.
building_tokens = dt['건물'].str.split()

# Tokens are picked purely by position (0, 1, 5 and 7).
# NOTE(review): assumes every '건물' value has at least 8 whitespace-separated
# tokens in a fixed order; a shorter string yields NaN for the missing
# positions — confirm against the raw data.
dt['자치구'] = building_tokens.str[0]
dt['법정동'] = building_tokens.str[1]
dt['면적(㎡)'] = building_tokens.str[5]
dt['2층'] = building_tokens.str[7]

# The token lists are only needed inside this cell; release the memory.
del building_tokens

In [4]:
# Modelling frame: a new frame without the raw '건물' string and without the
# two columns the label was computed from. `dt` itself is not modified.
df = dt.drop(columns=['건물', '전세가율', '전세가 변화율'])

# Rotate the column order so the last four columns come first.
cols = list(df.columns)
cols = [*cols[-4:], *cols[:-4]]
df = df.loc[:, cols]

df

Unnamed: 0,자치구,법정동,면적(㎡),2층,계약월,월별평균 매매가,월별평균 전세가,매매가 변화율,역전세 위험
0,강남구,개포동,74㎡,이하,2017-07,55400.000000,25563.030208,0.000000,0.0
1,강남구,개포동,74㎡,이하,2017-08,55865.129446,25714.518156,0.000000,0.0
2,강남구,개포동,74㎡,이하,2017-09,55768.618437,25748.462678,0.000000,0.0
3,강남구,개포동,74㎡,이하,2017-10,55962.980886,25771.746937,0.000000,0.0
4,강남구,개포동,74㎡,이하,2017-11,56373.822889,25866.286638,0.000000,0.0
...,...,...,...,...,...,...,...,...,...
3735591,중랑구,중화동,96㎡,초과,2023-03,83658.567397,45000.000000,-0.067474,0.0
3735592,중랑구,중화동,96㎡,초과,2023-04,83239.667064,44676.854996,-0.075041,0.0
3735593,중랑구,중화동,96㎡,초과,2023-05,82999.388734,44480.292271,-0.079964,0.0
3735594,중랑구,중화동,96㎡,초과,2023-06,82954.053200,44426.263250,-0.085110,0.0


In [5]:
# Share of each target value among all rows (fractions, not counts).
df.loc[:, '역전세 위험'].value_counts(normalize=True)

역전세 위험
0.0    0.947655
1.0    0.052345
Name: proportion, dtype: float64

In [6]:
# Separate the target from the predictors.
y = df['역전세 위험']
X = df.drop(columns=['역전세 위험'])

# Column lists by dtype: object columns are treated as categorical,
# everything else as numeric.
cat_selector = selector(dtype_include=object)
num_selector = selector(dtype_exclude=object)

cat_col = cat_selector(X)
num_col = num_selector(X)

# Categories are mapped to integer codes; a category not seen during fit is
# encoded as -1 instead of raising. Numeric columns are min-max scaled.
cat_preprocessor = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
num_preprocessor = MinMaxScaler()

# Transformer order fixes the output column order: categorical first.
preprocessor = ColumnTransformer(
    transformers=[
        ("ordinal-encoder", cat_preprocessor, cat_col),
        ("min_max_scaler", num_preprocessor, num_col),
    ]
)

# 80/20 split, stratified on the target so both parts keep the class ratio.
# NOTE(review): rows are split at random; since each row is a monthly
# observation ('계약월'), neighbouring months of the same segment can fall on
# both sides of the split — consider whether a time-based split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Preprocessing followed by a class-weighted decision tree. Keeping both in
# one pipeline means the encoder/scaler are fitted on the training rows only.
pipe = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        (
            'dtc',
            DecisionTreeClassifier(
                criterion='entropy',
                max_depth=40,
                min_samples_split=3,
                min_samples_leaf=1,
                class_weight='balanced',
                random_state=42,
            ),
        ),
    ]
)

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98    708012
         1.0       0.71      0.73      0.72     39108

    accuracy                           0.97    747120
   macro avg       0.85      0.86      0.85    747120
weighted avg       0.97      0.97      0.97    747120



데이터 설명: 데이터는 타깃(역전세 위험)을 제외하고 총 8개의 항목으로 구성되어 있으며, 매매가 변화율은 해당 시기의 매매가가 2년 전 매매가와 비교했을 때 얼마만큼 변했는지를 나타냅니다.

역전세 위험은 매매가 변화율과 같은 방법으로 구한 전세가 변화율과, 전세가를 매매가로 나눈 전세가율을 이용하여 구했습니다. 전세가 변화율이 음수이고 전세가율이 70%를 초과하는 경우를 위험이 있다고 판단했습니다.

모델: 모델은 결정 트리(Decision Tree)를 사용했습니다. 랜덤 서치 CV(RandomizedSearchCV)의 결과가 너무 좋지 않게 나와서, 원본 모델에서 아주 조금만 수정했습니다. 제가 보기에 이는 모델의 문제라기보다는 데이터의 문제라고 생각하지만, 더 이상의 데이터 변경은 필요 없다고 판단하여 여기까지만 진행하겠습니다.