In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [29]:
#자연발생 클러스터링(민원 단계 컬럼 생성)
from jenkspy import JenksNaturalBreaks

parks_df = pd.read_csv("src/RF_parksdf_week.csv")
#parks_df = pd.read_csv("src/RF_parksdf_sat.csv")
#parks_df = pd.read_csv("src/RF_parksdf_holi.csv")

# 1) IQR 계산
Q1 = parks_df["complaints_r300"].quantile(0.25)
Q3 = parks_df["complaints_r300"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# 2) 이상치 제거
outliers = parks_df[
    (parks_df["complaints_r300"] < lower_bound) |
    (parks_df["complaints_r300"] > upper_bound)
].copy()

parks_df = parks_df.drop(outliers.index).copy()

# 3) 필터된 데이터로 Jenks 적용
i = 3
breaks = JenksNaturalBreaks(n_classes=i)
breaks.fit(parks_df["complaints_r300"])
print("경계값:", breaks.breaks_)

# 4) 레이블 부여
parks_df["complaint_level"] = pd.cut(
    parks_df["complaints_r300"],
    bins=breaks.breaks_,
    labels=range(i),
    include_lowest=True
)
parks_df["complaint_level"].value_counts()


경계값: [0, 913, 2107, 4099]


complaint_level
0    800
1    459
2    273
Name: count, dtype: int64

In [30]:
print(parks_df.columns)

Index(['address', 'parking_type', 'operation_type', 'total_parking_spaces',
       'base_parking_fee', 'base_parking_time', 'additional_unit_fee',
       'additional_unit_time', 'fee1H', 'cctv_r300', 'bus_r300',
       'complaints_r300', 'Weekday_paid', 'Weekday_operatingHours', 'lon',
       'lat', 'zoning_encoded', 'start_sin', 'start_cos', 'end_sin', 'end_cos',
       'complaint_level'],
      dtype='object')


In [31]:
#범주형 변수의 label을 [0,1]로 변환
from sklearn.preprocessing import LabelEncoder

le_dict = {}

for col in ["parking_type", "operation_type", "Weekday_paid", "Saturday_paid", "Holiday_paid"]:
    if col in parks_df.columns:  # ✅ 존재하는 경우에만 처리
        le = LabelEncoder()
        parks_df[col] = le.fit_transform(parks_df[col])
        le_dict[col] = le

# 예시: parking_type의 범주와 정수 매핑 확인
for col in le_dict:
    print(f"{col}: {list(le_dict[col].classes_)}")

# [0, 1, 2, ...] 순서로 classes_에 저장됨
#['NS', 'NW'] -> [0,1]
#[1, 3] -> [0,1]
#['N', 'Y'] -> [0,1]
#['N', 'Y'] -> [0,1]
#['N', 'Y'] -> [0,1]


parking_type: ['NS', 'NW']
operation_type: [1, 3]
Weekday_paid: ['N', 'Y']


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# 0) 데이터 준비axis=
X = parks_df.drop(["complaints_r300", "complaint_level", "address", "lat", "lon",
                   "base_parking_fee", "base_parking_time", "additional_unit_fee", "additional_unit_time", "zoning_encoded"], axis=1)
y = parks_df["complaint_level"]

# 1) 80%를 train+val, 20%를 test로 분리 (stratify 유지)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 2) train_val(80%) 중 12.5%를 val로 → 전체 비율 10%, train은 70%
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.125,    # 0.8 * 0.125 = 0.1
    stratify=y_train_val,
    random_state=42
)
# 3) 훈련셋에서만 RandomForest로 피처 중요도 계산
rf_fs = RandomForestClassifier(random_state=0)
rf_fs.fit(X_train, y_train)

importances = pd.Series(rf_fs.feature_importances_, index=X_train.columns)
top10 = importances.nlargest(10).index.tolist()
print("선택된 상위 10개 피처:", top10)

if 'start_sin' in top10 and 'start_cos' not in top10:
    top10.append('start_cos')
if 'start_cos' in top10 and 'start_sin' not in top10:
    top10.append('start_sin')
# 이렇게 하면 둘이 반드시 함께 들어감


# 4) 선택된 피처로 데이터 재정의
X_train_sel = X_train[top10]
X_val_sel   = X_val[top10]
X_test_sel  = X_test[top10]

# 5) 최종 모델 학습 & 평가
rf = RandomForestClassifier(
    random_state=0,
    class_weight="balanced"   # 클래스 불균형 보정
)
rf.fit(X_train_sel, y_train)

val_pred  = rf.predict(X_val_sel)
test_pred = rf.predict(X_test_sel)

print("Validation 정확도: {:.4f}".format(accuracy_score(y_val, val_pred)))
print("Test 정확도      : {:.4f}".format(accuracy_score(y_test, test_pred)))

선택된 상위 10개 피처: ['total_parking_spaces', 'bus_r300', 'cctv_r300', 'fee1H', 'Weekday_operatingHours', 'end_sin', 'end_cos', 'start_sin', 'start_cos', 'operation_type']
Validation 정확도: 0.6234
Test 정확도      : 0.7264


In [33]:
# Accuracy 분석

from sklearn.metrics import classification_report, accuracy_score

print("Complaint Level (Kmeans) Classification Accuracy:", round(accuracy_score(y_test, test_pred), 3))
print(classification_report(y_test, test_pred, target_names=['Low', 'Mid', 'High']))



Complaint Level (Kmeans) Classification Accuracy: 0.726
              precision    recall  f1-score   support

         Low       0.84      0.87      0.86       160
         Mid       0.59      0.55      0.57        92
        High       0.59      0.60      0.59        55

    accuracy                           0.73       307
   macro avg       0.67      0.67      0.67       307
weighted avg       0.72      0.73      0.72       307



In [34]:

importances = rf.feature_importances_
feature_names = top10  # DataFrame에서 feature 이름 추출

# 중요도 표 만들기
importance_table = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

importance_table


Unnamed: 0,feature,importance
0,total_parking_spaces,0.22338
1,bus_r300,0.197859
3,fee1H,0.172679
2,cctv_r300,0.160017
6,end_cos,0.06139
5,end_sin,0.05461
4,Weekday_operatingHours,0.052322
9,operation_type,0.02709
7,start_sin,0.025401
8,start_cos,0.025253


In [35]:
import numpy as np
import pandas as pd
import shap

# ── 1) 기존 mean_abs_shap 계산 (axis=0 으로 피처별 평균 절댓값 SHAP) ─────
explainer = shap.TreeExplainer(rf)
shap_vals = explainer.shap_values(X_train_sel)

mean_abs_shap = pd.DataFrame({
    f'class_{cls}': np.abs(shap_vals[cls]).mean(axis=1)
    for cls in range(len(shap_vals))
}, index=top10)

# ── 2) start_time, end_time 그룹 중요도 (Euclidean norm) 계산 ──────────
for cls in mean_abs_shap.columns:
    # start_time
    s_sin = mean_abs_shap.loc['start_sin', cls]
    s_cos = mean_abs_shap.loc['start_cos', cls]
    mean_abs_shap.loc['start_time', cls] = np.sqrt(s_sin**2 + s_cos**2)
    # end_time
    e_sin = mean_abs_shap.loc['end_sin', cls]
    e_cos = mean_abs_shap.loc['end_cos', cls]
    mean_abs_shap.loc['end_time', cls] = np.sqrt(e_sin**2 + e_cos**2)

# ── 3) sin/cos 개별 행 제거하여 그룹화된 DataFrame 구성 ────────────────
grouped_shap = mean_abs_shap.drop(
    ['start_sin','start_cos','end_sin','end_cos'],
    axis=0
)

# ── 4) 클래스별 상위 N개 피처 출력 ──────────────────────────────────
top_n = 5
for cls in grouped_shap.columns:
    print(f"\n▶ {cls} 기여도 상위 {top_n}개 피처")
    print(grouped_shap[cls].sort_values(ascending=False).head(top_n))



▶ class_0 기여도 상위 5개 피처
end_time                  0.129419
fee1H                     0.088352
Weekday_operatingHours    0.064053
cctv_r300                 0.037974
bus_r300                  0.020568
Name: class_0, dtype: float64

▶ class_1 기여도 상위 5개 피처
cctv_r300               0.085809
fee1H                   0.068770
end_time                0.037580
total_parking_spaces    0.029318
bus_r300                0.027743
Name: class_1, dtype: float64

▶ class_2 기여도 상위 5개 피처
end_time                  0.109054
operation_type            0.079705
Weekday_operatingHours    0.061557
bus_r300                  0.047206
fee1H                     0.047133
Name: class_2, dtype: float64

▶ class_3 기여도 상위 5개 피처
cctv_r300                 0.079494
bus_r300                  0.077874
fee1H                     0.056207
end_time                  0.054887
Weekday_operatingHours    0.043098
Name: class_3, dtype: float64

▶ class_4 기여도 상위 5개 피처
cctv_r300                 0.086029
bus_r300                  0.074690


In [36]:
from sklearn.metrics import mean_squared_error
import pandas as pd
pd.set_option('display.max_rows', None)

# 여기 요일타입에 따라 바꿔야 됩니다!!
dong_parks_df = pd.read_csv("src/dongParks_Weekday.csv")
dong_parks_df

# dong_parks_df의 feature 컬럼만 추출 (dong 이름 등 식별자 제외)
# 모델이 학습할 때 사용한 feature 리스트
model_features = list(rf.feature_names_in_)

# dong_parks_df에서 해당 컬럼만 추출
X_dong = dong_parks_df[model_features]
# 동별 민원 수준 예측
dong_parks_df['predicted_complaints'] = rf.predict(X_dong)

# 그룹별 동 이름 추출
high_dong = dong_parks_df[dong_parks_df['predicted_complaints'] == 2]['dong']
mid_dong = dong_parks_df[dong_parks_df['predicted_complaints'] == 1]['dong']
low_dong = dong_parks_df[dong_parks_df['predicted_complaints'] == 0]['dong']

# 각 그룹을 DataFrame으로 변환 (인덱스 리셋)
high_df = pd.DataFrame({'High 민원': high_dong.reset_index(drop=True)})
mid_df = pd.DataFrame({'Mid 민원': mid_dong.reset_index(drop=True)})
low_df = pd.DataFrame({'Low 민원': low_dong.reset_index(drop=True)})

# 길이가 다른 경우도 맞춰서 하나의 표로 합치기
dong_table = pd.concat([high_df, mid_df, low_df], axis=1)

from IPython.display import display
display(dong_table.head())


Unnamed: 0,High 민원,Mid 민원,Low 민원
0,가락본동,가리봉동,가산동
1,고덕2동,강일동,가양1동
2,구로3동,개포3동,가양2동
3,구의1동,거여2동,가양3동
4,금호2·3가동,광희동,갈현1동


In [37]:
import pandas as pd

pd.Series(rf.feature_importances_, index=X_dong.columns).sort_values(ascending=False)


total_parking_spaces      0.223380
bus_r300                  0.197859
fee1H                     0.172679
cctv_r300                 0.160017
end_cos                   0.061390
end_sin                   0.054610
Weekday_operatingHours    0.052322
operation_type            0.027090
start_sin                 0.025401
start_cos                 0.025253
dtype: float64

In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler


scaler = RobustScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_dong),
    columns=X_dong.columns
)

'''
# 정규화 수행
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_dong), columns=X_dong.columns)
'''

# 정규화된 데이터 기준으로 영향력 큰 feature 추정
importances = rf.feature_importances_
dong_parks_df['top_feature'] = X_scaled.apply(
    lambda row: row.index[np.argmax(np.abs(row.values * importances))],
    axis=1
)

# 결과 출력
solution_table = dong_parks_df[['dong', 'predicted_complaints', 'top_feature']]
display(solution_table.head())

# 민원 수준별 요약
summary = solution_table.groupby('predicted_complaints')['top_feature'].value_counts()
print(summary)


Unnamed: 0,dong,predicted_complaints,top_feature
0,가락본동,2,cctv_r300
1,가리봉동,1,total_parking_spaces
2,가산동,0,bus_r300
3,가양1동,0,total_parking_spaces
4,가양2동,0,bus_r300


predicted_complaints  top_feature         
0                     total_parking_spaces    55
                      fee1H                   53
                      bus_r300                52
                      cctv_r300               33
1                     fee1H                   31
                      cctv_r300               18
                      bus_r300                16
                      total_parking_spaces     9
2                     cctv_r300               18
                      total_parking_spaces    13
                      bus_r300                12
                      fee1H                   11
Name: count, dtype: int64


In [39]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# 1) 행정동 경계 불러오기 (EPSG:5179)
dong_boundaries = (
    gpd.read_file("src/dong_boundary.shp")
       .rename(columns={"ADM_NM":"dong"})
)

# 3) 주소에서 ‘○○동’ 부분만 추출해 두기
parks_df['addr_dong'] = parks_df['address'].str.extract(r'^\S+\s+(\S+동)')[0]

# 4) GeoDataFrame 생성 (위경도 → EPSG:4326 지정)
geometry = [Point(xy) for xy in zip(parks_df['lon'], parks_df['lat'])]
parks_gdf = gpd.GeoDataFrame(
    parks_df,
    geometry=geometry,
    crs="EPSG:4326"
)

# 5) 경계 CRS로 투영
parks_gdf = parks_gdf.to_crs(dong_boundaries.crs)

# ─────── 이 부분을 추가 ───────
# 6) 공간 결합
parks_gdf = gpd.sjoin(
    parks_gdf,
    dong_boundaries[['dong','geometry']],
    how='left',
    predicate='within'
)

# 7) NaN인 경우만 매핑 테이블로 보정
mapping = {
    '잠실동': '잠실본동',
    # 필요시 다른 주소-행정동도 여기에 추가
}
parks_gdf['dong'] = parks_gdf['dong'].fillna(
    parks_gdf['addr_dong'].map(mapping)
)
# ─────────────────────────────

# 8) 결과 확인
parks_gdf = parks_gdf.drop(columns=['addr_dong', 'geometry', 'index_right'])
print(parks_gdf.columns)

Index(['address', 'parking_type', 'operation_type', 'total_parking_spaces',
       'base_parking_fee', 'base_parking_time', 'additional_unit_fee',
       'additional_unit_time', 'fee1H', 'cctv_r300', 'bus_r300',
       'complaints_r300', 'Weekday_paid', 'Weekday_operatingHours', 'lon',
       'lat', 'zoning_encoded', 'start_sin', 'start_cos', 'end_sin', 'end_cos',
       'complaint_level', 'dong'],
      dtype='object')


In [40]:
import numpy as np
import pandas as pd
import shap

# 1) TreeExplainer 생성
explainer = shap.TreeExplainer(rf)
feature_cols = list(rf.feature_names_in_)

# 2) park-level SHAP 값 계산 (n_samples, n_features, n_classes)
raw_shap_p = explainer.shap_values(parks_gdf[feature_cols])

# 3) 클래스 차원 절댓값 평균 → (n_samples, n_features)
shap_abs_p = np.abs(raw_shap_p).mean(axis=2)

# 4) DataFrame으로 변환 후 dong 매핑
df_shap_p        = pd.DataFrame(shap_abs_p, columns=feature_cols)
df_shap_p['dong'] = parks_gdf['dong'].values

# 5) 행정동별 평균 SHAP 기여도 집계
dong_shap_mean = df_shap_p.groupby('dong').mean()

# 6) 행정동별 Top3 피처 추출
top3 = dong_shap_mean.apply(lambda row: row.nlargest(3).index.tolist(), axis=1)
df_top3 = (
    pd.DataFrame(top3.tolist(), index=top3.index, columns=['Top1','Top2','Top3'])
      .reset_index()
      .rename(columns={'index':'dong'})
)

# 7) 결과 확인
print(df_top3.head())


   dong                    Top1                  Top2                  Top3
0  가락본동                 end_cos                 fee1H               end_sin
1  가리봉동               cctv_r300  total_parking_spaces              bus_r300
2   가산동  Weekday_operatingHours               end_cos                 fee1H
3  가양1동               cctv_r300                 fee1H  total_parking_spaces
4  가양2동    total_parking_spaces              bus_r300             cctv_r300


In [41]:
# 1) 실제 complaints_level: parks_gdf 에 이미 있다면 바로 사용
#    없다면, 원본 complaints_df 등에서 행정동별로 집계해서 merge

# 예: 원본 parks_df 에 'complaints_level' 컬럼이 있다고 가정
actual = parks_gdf.groupby('dong')['complaint_level'] \
                   .agg(lambda x: x.mode()[0]) \
                   .reset_index()

# 2) 예측 complaints_level
predicted = pd.DataFrame({
    'dong': parks_gdf['dong'],
    'predicted_level': rf.predict(parks_gdf[feature_cols])
}).groupby('dong')['predicted_level'] \
  .agg(lambda x: x.mode()[0]) \
  .reset_index()

# 3) df_top3 에 결합
df_insight = (
    df_top3
      .merge(actual,    on='dong', how='left')
      .merge(predicted, on='dong', how='left')
)

df_insight.head()

Unnamed: 0,dong,Top1,Top2,Top3,complaint_level,predicted_level
0,가락본동,end_cos,fee1H,end_sin,2,2
1,가리봉동,cctv_r300,total_parking_spaces,bus_r300,1,1
2,가산동,Weekday_operatingHours,end_cos,fee1H,0,0
3,가양1동,cctv_r300,fee1H,total_parking_spaces,0,0
4,가양2동,total_parking_spaces,bus_r300,cctv_r300,0,0


In [43]:
import pandas as pd

# 1. 두 CSV 파일 불러오기
df1 = pd.read_csv('src/RF_parksdf_week.csv')
df2 = pd.read_csv('src/RF_parksdf_weeks.csv')

# 2. 데이터프레임 전체 비교
if df1.equals(df2):
    print("✅ 두 CSV 파일은 완전히 동일합니다.")
else:
    print("❌ 두 CSV 파일은 다릅니다.")


✅ 두 CSV 파일은 완전히 동일합니다.
