In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# 각 주차장의 속성으로부터 그 주변 100m 이내에 발생한 민원 건수를 예측

In [3]:
# 위도, 경도 가지고 거리 측정하는 함수
def haversine_array(lat, lon, rep_coords):
    R = 6371000  # 지구 반지름 (m)
    lat1 = np.radians(lat)
    lon1 = np.radians(lon)
    lat2 = np.radians(rep_coords[:,0])
    lon2 = np.radians(rep_coords[:,1])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

In [4]:
parks = pd.read_csv('parks.csv')
reports = pd.read_csv('reports.csv')

In [5]:
# 전처리
for col in ['평일유료','토요일유료','공휴일유료']:
    parks[col] = parks[col].map({'Y':1, 'N':0})

In [6]:
parks.head()

Unnamed: 0,주소,주차장종류,운영구분명,총주차면,평일유료,토요일유료,공휴일유료,평일시작,평일종료,토요일시작,토요일종료,공휴일시작,공휴일종료,기본주차요금,기본주차시간,추가단위요금,추가단위시간,경도,위도,1시간 요금
0,강남구 개포동 126-2,NW,시간제 주차장,132.0,1,1,1,09:00:00,19:00:00,00:00:00,00:00:00,00:00:00,00:00:00,200.0,5.0,200.0,5.0,127.066477,37.477263,2400.0
1,강남구 개포동 1266-0,NW,시간제 주차장,97.0,1,0,0,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,200.0,5.0,200.0,5.0,127.048218,37.481496,2400.0
2,강남구 개포동 13-2,NW,시간제 주차장,168.0,1,0,0,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,100.0,5.0,100.0,5.0,127.079307,37.494938,1200.0
3,강남구 개포동 567-23,NW,시간제 주차장,92.0,1,1,1,09:00:00,19:00:00,00:00:00,00:00:00,00:00:00,00:00:00,200.0,5.0,200.0,5.0,127.065835,37.477888,2400.0
4,강남구 논현동 168-0,NW,시간제 주차장,192.0,1,0,0,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,300.0,5.0,300.0,5.0,127.02629,37.508171,3600.0


In [7]:
features = parks[['총주차면','평일유료','토요일유료','공휴일유료','1시간 요금']].astype(float)

In [8]:
features.head()

Unnamed: 0,총주차면,평일유료,토요일유료,공휴일유료,1시간 요금
0,132.0,1.0,1.0,1.0,2400.0
1,97.0,1.0,0.0,0.0,2400.0
2,168.0,1.0,0.0,0.0,1200.0
3,92.0,1.0,1.0,1.0,2400.0
4,192.0,1.0,0.0,0.0,3600.0


In [9]:
# 주차장별 민원 건수 집계 함수
def count_reports_within_radius(parks, reports, radius_m=400):
    # haversine distance 계산
    park_coords = parks[['위도','경도']].to_numpy()   # (lat, lon)
    rep_coords  = reports[['위도','경도']].to_numpy()  # (lat, lon)

    # 각 주차장마다 반경 내 민원 수
    counts = []
    for lat, lon in park_coords:
        dists = haversine_array(lat, lon, rep_coords)
        counts.append((dists <= radius_m).sum())
    return np.array(counts)

In [10]:
parks['report_count'] = count_reports_within_radius(parks, reports)

In [11]:
target   = parks['report_count']

In [77]:
X.head()


Unnamed: 0,총주차면,1시간 요금,평일유료,토요일유료,공휴일유료
0.0,132.0,2400.0,,,
1.0,97.0,2400.0,,,
2.0,168.0,1200.0,,,
3.0,92.0,2400.0,,,
4.0,192.0,3600.0,,,


In [59]:
# 랜덤포레스트 회귀

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

print("RandomForestRegressor MAE:", mean_absolute_error(y_test, preds))

RandomForestRegressor MAE: 3078.132037537523


In [13]:
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [14]:
parks_df = pd.read_csv('parks.csv')
reports_df = pd.read_csv('reports.csv')

In [15]:
from datetime import datetime

reports_df["datetime"] = pd.to_datetime(
    reports_df["민원접수일"].astype(str) + " " + reports_df["민원접수시간"].astype(str),
    errors="coerce"
)

In [17]:
def is_within_operating_hours(report_time, start_str, end_str):
    if start_str == "00:00:00" and end_str == "00:00:00":
        return False
    try:
        start = datetime.strptime(start_str, "%H:%M:%S").time()
        end = datetime.strptime(end_str, "%H:%M:%S").time()
        return start <= report_time.time() <= end
    except:
        return False

def time_to_float(tstr):
    try:
        t = datetime.strptime(tstr, "%H:%M:%S").time()
        return t.hour + t.minute / 60
    except:
        return 0


In [27]:
import pandas as pd
import numpy as np

# 0) parks 좌표 전처리 & 라디안 변환 (한 번만)
parks_df = parks_df.dropna(subset=['위도','경도']).reset_index(drop=True)
parks_rad = np.radians(parks_df[['위도','경도']].values)  # shape=(n_parks,2)

# 1) reports 전처리 & 10만 건 랜덤 샘플링
reports_df['datetime'] = pd.to_datetime(reports_df['datetime'])
reports_df = reports_df.dropna(subset=['위도','경도']).reset_index(drop=True)
reports_sample = reports_df.sample(n=100000, random_state=42).reset_index(drop=True)

# 2) 최적화된 해버사인 함수 (입력은 모두 라디안)
def haversine_rad(lat1, lon1, lat2_arr, lon2_arr, R=6371000):
    dlat = lat2_arr - lat1
    dlon = lon2_arr - lon1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2_arr) * np.sin(dlon / 2.0)**2
    return 2 * R * np.arcsin(np.sqrt(a))

records = []

# 3) 샘플 순회하며 거리 계산 → 최단 거리 주차장 찾기
for _, report in reports_sample.iterrows():
    # 3-1) 리포트 좌표를 라디안으로 변환
    lat1 = np.radians(report['위도'])
    lon1 = np.radians(report['경도'])
    
    # 3-2) 벡터화된 해버사인 호출
    dists = haversine_rad(
        lat1, lon1,
        parks_rad[:, 0],  # parks 위도(라디안)
        parks_rad[:, 1]   # parks 경도(라디안)
    )
    
    # 3-3) 최단거리 인덱스 & 주차장 정보 추출
    idx_min = np.argmin(dists)
    nearest = parks_df.iloc[idx_min]
    dist_m = dists[idx_min]
    
    # 3-4) 운영시간 내외 판단 (사용자 정의 함수)
    label = is_within_operating_hours(
        report['datetime'],
        nearest['평일시작'],
        nearest['평일종료']
    )
    
    # 3-5) 결과 저장
    records.append({
        'distance': dist_m,
        '시작시간': time_to_float(nearest['평일시작']),
        '종료시간': time_to_float(nearest['평일종료']),
        '요금': nearest.get('1시간 요금', 0) or 0,
        'label': int(label)
    })

# 4) DataFrame 변환
df = pd.DataFrame(records)
print(df.head())


      distance  시작시간  종료시간      요금  label
0  1453.631132   0.0   0.0  2040.0      0
1   131.029011   0.0   0.0  1800.0      0
2    43.389330   9.0  18.0  3000.0      1
3   465.849981   0.0   0.0  3600.0      0
4    99.152378   0.0   0.0  1500.0      0


In [28]:
X = df[["distance", "시작시간", "종료시간", "요금"]]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.78      0.87     20760
           1       0.66      0.96      0.78      9240

    accuracy                           0.83     30000
   macro avg       0.82      0.87      0.82     30000
weighted avg       0.88      0.83      0.84     30000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
import pandas as pd
import statsmodels.api as sm

# Poisson Regression

# 상수항 추가
X = sm.add_constant(features)
poisson_model = sm.GLM(target, X, family=sm.families.Poisson())
poisson_result = poisson_model.fit()

print(poisson_result.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:           report_count   No. Observations:                 1463
Model:                            GLM   Df Residuals:                     1457
Model Family:                 Poisson   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -2.9576e+06
Date:                Thu, 22 May 2025   Deviance:                   5.9007e+06
Time:                        22:28:03   Pearson chi2:                 8.63e+06
No. Iterations:                     6   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.2635      0.002   3367.960      0.0

주차면 1 증가시 민원 수 0.08% 감소
평일유료일 때 민원 수 25.73% 감소 
토요일유료일 때 민원 수 22.98% 감소
공휴일유료일 때 민원 수 19.50% 증가 
1시간 요금 1원 증가시 민원 수 0.01% 증가 

-> 과산포

In [61]:
# Negative Binomial GLM

X = sm.add_constant(features)   # 상수항 추가
y = parks['report_count']

nb_model = sm.GLM(
    y, 
    X, 
    family=sm.families.NegativeBinomial(alpha=1.0)
)

nb_result = nb_model.fit()

print(nb_result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           report_count   No. Observations:                 1463
Model:                            GLM   Df Residuals:                     1457
Model Family:        NegativeBinomial   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13953.
Date:                Thu, 22 May 2025   Deviance:                       1198.2
Time:                        22:28:03   Pearson chi2:                 1.65e+03
No. Iterations:                    10   Pseudo R-squ. (CS):            0.03543
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.1852      0.179     45.830      0.0

In [62]:
!pip install xgboost



In [63]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb

# Xgboost

# DMatrix 생성
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test,  label=y_test)

# 하이퍼파라미터 설정
params = {
    'objective':        'reg:squarederror',  # MSE 기반 회귀
    'eval_metric':      'rmse',              # 검증 지표로 RMSE 사용
    'learning_rate':    0.1,                 # (alias: eta) 0.01–0.3 사이로 주로 튜닝
    'max_depth':        6,                   # 트리 깊이, 기본값 6, 3–10 사이로 자주 사용
    'min_child_weight': 1,                   # 자식 노드가 최소 가져야 할 샘플 가중치 합
    'subsample':        0.5,                 # 각 트리 학습 시 데이터 샘플링 비율 (0.5–1.0)
    'colsample_bytree': 0.5,                 # 각 트리 학습 시 컬럼 샘플링 비율 (0.5–1.0)
    'gamma':            0,                   # 리프 노드 분할을 위한 최소 손실 감소량
    'reg_alpha':        0.0,                 # L1 정규화 강도 (alpha)
    'reg_lambda':       1.0,                 # L2 정규화 강도 (lambda)
    'seed':             42,
}

watchlist = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=watchlist,
    early_stopping_rounds=20,
    verbose_eval=False
)

preds = bst.predict(dtest)
rmse = root_mean_squared_error(y_test, preds)

print("XGBoost RMSE:", rmse)

XGBoost RMSE: 5261.084312907074


In [64]:
mean_y = y_test.mean()
std_y  = y_test.std()
print(f"관측치 평균: {mean_y:.1f}, 표준편차: {std_y:.1f}")
print(f"RMSE/mean(y): {rmse/mean_y*100:.1f}%")    # 상대적 오차(%)
print(f"RMSE/(y_max - y_min): {rmse/(y_test.max()-y_test.min())*100:.1f}%")  # 정규화 RMSE


관측치 평균: 5035.3, 표준편차: 5845.3
RMSE/mean(y): 104.5%
RMSE/(y_max - y_min): 9.6%
