In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
from collections import Counter

In [5]:

# Load the parquet dataset and replace every missing value (in all columns) with 0.
# NOTE(review): this also turns missing category codes into 0 — confirm 0 is not
# a meaningful code in any of the categorical columns below.
df = pd.read_parquet('/home/kng/kng/MOUNTAIN2/trainDatas/MOUNTAIN_80.parquet').fillna(0)

# Names of the code-valued columns that are treated as categorical features.
# NOTE(review): 'SIAFLR_HYP' has no 'SIBFLR_HYP' counterpart in this list,
# unlike the other SIAFLR_*/SIBFLR_* pairs — confirm this is intentional.
code_categorical_columns = [
    'CMBLC_ID', 'SBLT_ID', 'SMAR_NO', 'SLTP_CD', 'STQLT_CD',
    'RHGLT_GGRP', 'WTHR_CD', 'TPGRP_TPCD', 'CLZN_CD', 'PRRCK_LARG',
    'SOIL_DRNGE', 'ALTTD_CD', 'ACCMA_FOR', 'WASH_CD', 'SLANT_TYP',
    'EIGHT_CD', 'ROCK_EXDGR', 'WIND_EXDGR', 'WTEFF_DGR',
    'SIAFLR_STP', 'SIBFLR_STP', 'SIAFLR_SLD', 'SIBFLR_SLD',
    'SIAFLR_ERC', 'SIBFLR_ERC', 'SIAFLR_ORM', 'SIBFLR_ORM',
    'SIAFLR_SCS', 'SIBFLR_SCS', 'SIAFLR_CBS', 'SIBFLR_CBS',
    'SIAFLR_STR', 'SIBFLR_STR', 'SIAFLR_HGD', 'SIBFLR_HGD',
    'SIAFLR_CNS', 'SIBFLR_CNS', 'SIAFLR_HYP',
    'SIAFLR_HER', 'SIBFLR_HER', 'SIAFLR_SMA', 'SIBFLR_SMA',
    'SIAFLR_MDD', 'SIBFLR_MDD', 'SIAFLR_LAR', 'SIBFLR_LAR',
    'KOFTR_CD',
]

# Positional index of each categorical column within df.columns
# (df still contains the target column at this point).
code_categorical_features = list(map(df.columns.get_loc, code_categorical_columns))

In [6]:
# Split the frame into the feature matrix and the target vector.
X = df.drop(columns=["is_landslided"])  # DataFrame holding only the feature columns
y = df["is_landslided"]  # target variable

# SMOTENC reads integer `categorical_features` as column positions in the matrix
# it is fitted on. Resolve them against X rather than df: df still contains the
# target column, so any categorical column located after "is_landslided" in df
# would be off by one once the target is dropped.
categorical_feature_positions = [
    X.columns.get_loc(column_name) for column_name in code_categorical_columns
]

# Oversample with SMOTENC; the fixed seed keeps the synthetic rows reproducible.
# NOTE(review): the input path suggests this is already the training split —
# confirm, since oversampling before a train/test split would leak synthetic
# neighbours of test rows into training.
smotenc = SMOTENC(categorical_features=categorical_feature_positions, random_state=42)
X_resampled, y_resampled = smotenc.fit_resample(X, y)

In [7]:
# Report the class distribution after SMOTENC resampling.
class_counts = Counter(y_resampled)
print('Resampled dataset shape %s' % class_counts)

# Rebuild a single frame: resampled features in their original column order,
# with the resampled target appended as the last column.
resampled_df = pd.DataFrame(X_resampled, columns=X.columns).assign(
    is_landslided=y_resampled
)

# Persist the resampled data as a parquet file (row index not written).
resampled_df.to_parquet('resampled_parquet_file.parquet', index=False)

Resampled dataset shape Counter({0.0: 456166, 1.0: 456166})


In [9]:
# Sanity-check the value range of a few ID/code columns after resampling.
range_check_columns = ['CMBLC_ID', 'SBLT_ID', 'SMAR_NO', 'SLTP_CD', 'STQLT_CD']
range_check_frame = resampled_df[range_check_columns]

# Column-wise extremes, each returned as a Series indexed by column name.
min_values = range_check_frame.min()
max_values = range_check_frame.max()

# Each label is followed by its Series on the next line.
print("Minimum values:", min_values, sep="\n")
print("\nMaximum values:", max_values, sep="\n")

Minimum values:
CMBLC_ID    0.0
SBLT_ID     0.0
SMAR_NO     1.0
SLTP_CD     1.0
STQLT_CD    1.0
dtype: float64

Maximum values:
CMBLC_ID    121.0
SBLT_ID       0.0
SMAR_NO     287.0
SLTP_CD      28.0
STQLT_CD      3.0
dtype: float64


Unnamed: 0,avg_temp,lowest_temp,highest_temp,rainfalling_hr_asos,1hr_rainfall_mm_asos,daily_rainfall_asos,max_windspeed_asos,avg_windspeed_asos,avg_dewpoint_temp,min_rhum_per,...,TRHGH_AVRG,TOT_FRAG,AVRG_FRAG,LARCH_STIN,KRPN_STIND,GNGN_LCLT,CNDST_PINE,ACTSM_STIN,JBLPN_STIN,is_landslided
0,14.7,12.4,17.6,0.0,0.0,0.0,3.5,1.7,12.8,78.0,...,16.799999,233.0,17.0,14.0,14.0,12.0,0.0,17.0,11.0,0.0
1,14.7,12.4,17.6,0.0,0.0,0.0,3.5,1.7,12.8,78.0,...,15.300000,191.0,15.0,13.0,14.0,9.0,9.0,16.0,11.0,0.0
2,14.7,12.4,17.6,0.0,0.0,0.0,3.5,1.7,12.8,78.0,...,14.000000,199.0,14.0,12.0,12.0,7.0,7.0,14.0,9.0,0.0
3,14.7,12.4,17.6,0.0,0.0,0.0,3.5,1.7,12.8,78.0,...,17.100000,241.0,17.0,13.0,15.0,10.0,9.0,17.0,12.0,0.0
4,14.7,12.4,17.6,0.0,0.0,0.0,3.5,1.7,12.8,78.0,...,0.000000,0.0,0.0,13.0,14.0,10.0,10.0,17.0,12.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912327,26.3,21.8,33.0,0.0,0.5,0.5,4.3,0.9,22.4,48.0,...,15.000000,139.0,15.0,12.0,12.0,7.0,7.0,14.0,10.0,1.0
912328,22.8,21.2,24.1,0.0,13.1,24.3,1.6,0.3,22.2,93.0,...,0.000000,0.0,0.0,15.0,15.0,12.0,12.0,19.0,12.0,1.0
912329,23.0,20.0,26.8,0.0,0.5,0.9,1.9,0.4,22.5,86.0,...,12.600000,164.0,13.0,13.0,13.0,0.0,8.0,12.0,0.0,1.0
912330,21.8,19.8,24.0,0.0,31.2,170.1,2.7,0.9,21.3,91.0,...,15.700000,166.0,16.0,14.0,13.0,11.0,8.0,14.0,10.0,1.0
