# '행정동 - 주택유형'으로 매핑된 데이터프레임 임베딩 작업

In [1]:
import os
import pandas as pd
from pathlib import Path

def find_csv_files(root):
    """Return every ``.csv`` file found under *root*, sorted by path.

    The tree is walked recursively with ``os.walk``. The walk order depends
    on the underlying filesystem, so the result is sorted to make the load
    order (and everything derived from it) reproducible across machines.

    Args:
        root: Directory to search. A missing directory yields an empty list
            because ``os.walk`` ignores listing errors by default.

    Returns:
        A sorted list of ``pathlib.Path`` objects whose names end in ``.csv``.
    """
    found = []
    for dirpath, _dirnames, filenames in os.walk(root):
        found.extend(
            Path(dirpath) / filename
            for filename in filenames
            if filename.endswith(".csv")
        )
    return sorted(found)


input_folder = './csv_preprocessing'
csv_files = find_csv_files(input_folder)
# Bare expression: displays the list when run as a notebook cell.
csv_files

# Read every CSV listed in csv_files into its own DataFrame and publish it as
# a module-level variable named after the file.
import warnings
# NOTE(review): openpyxl is an Excel reader and is not exercised by
# pd.read_csv; this filter looks like a leftover from an xlsx step. Kept
# unchanged so the notebook's warning configuration stays the same.
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

# Spaces, hyphens and dots in the file stem are all mapped to underscores.
_NAME_SEPARATORS = str.maketrans({" ": "_", "-": "_", ".": "_"})
# Maps each generated frame name to the file it came from, so that a second
# file producing the same name can be reported instead of silently replacing
# the first one.
_loaded_sources = {}

for csv_file in csv_files:
    # Derive the DataFrame name from the file name (without extension).
    df_name = csv_file.stem.translate(_NAME_SEPARATORS)
    if df_name in _loaded_sources:
        print(
            f"Warning: '{df_name}' from {_loaded_sources[df_name]} "
            f"is being replaced by {csv_file}"
        )
    _loaded_sources[df_name] = csv_file
    # Later cells look the frames up through globals() by this name.
    loaded_df = pd.read_csv(csv_file)
    globals()[df_name] = loaded_df
    print(f"Loaded {csv_file} into DataFrame '{df_name}' with shape {loaded_df.shape}")


Loaded csv_preprocessing/창원시 오피스텔(매매)_실거래가 201501~202006/4. 마산회원구_오피스텔(매매)_실거래가_1607-1706.csv into DataFrame '4__마산회원구_오피스텔(매매)_실거래가_1607_1706' with shape (110, 12)
Loaded csv_preprocessing/창원시 오피스텔(매매)_실거래가 201501~202006/5. 진해구_오피스텔(매매)_실거래가_1607-1706.csv into DataFrame '5__진해구_오피스텔(매매)_실거래가_1607_1706' with shape (150, 12)
Loaded csv_preprocessing/창원시 오피스텔(매매)_실거래가 201501~202006/1. 의창구_오피스텔(매매)_실거래가_1501-1506.csv into DataFrame '1__의창구_오피스텔(매매)_실거래가_1501_1506' with shape (72, 12)
Loaded csv_preprocessing/창원시 오피스텔(매매)_실거래가 201501~202006/1. 의창구_오피스텔(매매)_실거래가_1507-1606.csv into DataFrame '1__의창구_오피스텔(매매)_실거래가_1507_1606' with shape (117, 12)
Loaded csv_preprocessing/창원시 오피스텔(매매)_실거래가 201501~202006/3. 마산합포구_오피스텔(매매)_실거래가_1807-1906.csv into DataFrame '3__마산합포구_오피스텔(매매)_실거래가_1807_1906' with shape (154, 12)
Loaded csv_preprocessing/창원시 오피스텔(매매)_실거래가 201501~202006/2. 성산구_오피스텔(매매)_실거래가_1607-1706.csv into DataFrame '2__성산구_오피스텔(매매)_실거래가_1607_1706' with shape (236, 12)
Loaded csv_preprocessing/창원

In [2]:
# Grouping of the loaded DataFrame names.
#
# The names follow patterns such as:
#     - "1. 의창구_단독다가구(매매)_실거래가_1501-1506.csv"
#     - "2__성산구_연립다세대(매매)_실거래가_1507_1606.csv"
#     - "3__마산합포구_아파트(매매)_실거래가_1907_2006.csv"
#     - "단독다가구(매매)_실거래가_2021년_마산회원구"
#     - "단독다가구(매매)_실거래가_202007_마산합포구"
# so the frames can be organised by spotting these keywords in the name:
#     - district: 의창구, 성산구, 마산합포구, 마산회원구, 진해구
#     - housing type: 단독다가구, 연립다세대, 아파트, 오피스텔
#     - deal type: 매매, 전월세
#     - period: 1501-1506, 1507_1606, 2021년, ...


def source_frame_names(namespace):
    """Return the names in *namespace* that may refer to source frames.

    Names ending in ``_merged`` are dropped: the merge step publishes its
    results as ``<district>_<housing type>_merged``, which also contain a
    district keyword. Without this filter, re-running the grouping after a
    merge would feed merged frames back in and duplicate their rows.

    Args:
        namespace: Any iterable of names (e.g. ``globals()``).

    Returns:
        A list of names in the original iteration order.
    """
    # list() snapshots the namespace so it is safe if it changes meanwhile.
    return [name for name in list(namespace) if not name.endswith("_merged")]


def group_names_by_keyword(names, keywords):
    """Bucket *names* by the first keyword each one contains.

    Args:
        names: Iterable of strings to classify.
        keywords: Keywords checked in order; the first one found as a
            substring decides the bucket.

    Returns:
        A dict mapping every keyword to the list of matching names. Names
        that contain none of the keywords are left out.
    """
    groups = {keyword: [] for keyword in keywords}
    for name in names:
        for keyword in keywords:
            if keyword in name:
                groups[keyword].append(name)
                break
    return groups


# First, collect the frame names of each district in its own list.
_GU_KEYWORDS = ("의창구", "성산구", "마산합포구", "마산회원구", "진해구")
_gu_groups = group_names_by_keyword(source_frame_names(globals()), _GU_KEYWORDS)

uichang_gu_dfs = _gu_groups["의창구"]
seongsan_gu_dfs = _gu_groups["성산구"]
masanhappo_gu_dfs = _gu_groups["마산합포구"]
masanhoeweon_gu_dfs = _gu_groups["마산회원구"]
jinhe_gu_dfs = _gu_groups["진해구"]

# Split each district's frame names by housing type. Every district gets its
# own dict keyed by housing type, each value being a list of frame names.
_HOUSING_TYPES = ("단독다가구", "연립다세대", "아파트", "오피스텔")

uichang_gu_dfs_dict = {_type: [] for _type in _HOUSING_TYPES}
seongsan_gu_dfs_dict = {_type: [] for _type in _HOUSING_TYPES}
masanhappo_gu_dfs_dict = {_type: [] for _type in _HOUSING_TYPES}
masanhoeweon_gu_dfs_dict = {_type: [] for _type in _HOUSING_TYPES}
jinhe_gu_dfs_dict = {_type: [] for _type in _HOUSING_TYPES}

_names_and_buckets = (
    (uichang_gu_dfs, uichang_gu_dfs_dict),
    (seongsan_gu_dfs, seongsan_gu_dfs_dict),
    (masanhappo_gu_dfs, masanhappo_gu_dfs_dict),
    (masanhoeweon_gu_dfs, masanhoeweon_gu_dfs_dict),
    (jinhe_gu_dfs, jinhe_gu_dfs_dict),
)

for _gu_names, _gu_buckets in _names_and_buckets:
    for df_name in _gu_names:
        # The first housing type found in the name wins; names that mention
        # none of them are not recorded anywhere.
        _matched_type = next(
            (_type for _type in _HOUSING_TYPES if _type in df_name),
            None,
        )
        if _matched_type is not None:
            _gu_buckets[_matched_type].append(df_name)



In [None]:
# Walk through every district's dict and concatenate the frames of each
# housing type into a single DataFrame published as
# "<district>_<housing type>_merged".
_districts = (
    ("의창구", uichang_gu_dfs_dict),
    ("성산구", seongsan_gu_dfs_dict),
    ("마산합포구", masanhappo_gu_dfs_dict),
    ("마산회원구", masanhoeweon_gu_dfs_dict),
    ("진해구", jinhe_gu_dfs_dict),
)

for gu_name, gu_dict in _districts:
    for housing_type, df_names in gu_dict.items():
        # Nothing to merge for this housing type.
        if not df_names:
            continue

        _frames = [globals()[df_name] for df_name in df_names]
        merged_df = pd.concat(_frames, ignore_index=True)

        # Sort ascending by the '계약년월' column when the column is present.
        if '계약년월' in merged_df.columns:
            merged_df = merged_df.sort_values(by='계약년월').reset_index(drop=True)

        merged_df_name = f"{gu_name}_{housing_type}_merged"
        globals()[merged_df_name] = merged_df
        print(f"Merged {len(df_names)} DataFrames into '{merged_df_name}' with shape {merged_df.shape}")

# Write every merged DataFrame (module-level names ending in "_merged") to
# its own CSV file inside the output folder.
output_folder = './csv_merged'
os.makedirs(output_folder, exist_ok=True)

# Snapshot the matching names first; the loop below assigns globals itself.
_merged_names = [name for name in list(globals()) if name.endswith("_merged")]

for var_name in _merged_names:
    df = globals()[var_name]
    _target = f"{output_folder}/{var_name}.csv"
    df.to_csv(_target, index=False)
    print(f"Saved merged DataFrame to '{_target}'")

Merged 11 DataFrames into '의창구_단독다가구_merged' with shape (4238, 17)
Merged 11 DataFrames into '의창구_연립다세대_merged' with shape (1821, 21)
Merged 11 DataFrames into '의창구_아파트_merged' with shape (17625, 21)
Merged 11 DataFrames into '의창구_오피스텔_merged' with shape (901, 18)
Merged 11 DataFrames into '성산구_단독다가구_merged' with shape (1368, 17)
Merged 11 DataFrames into '성산구_연립다세대_merged' with shape (299, 21)
Merged 11 DataFrames into '성산구_아파트_merged' with shape (39202, 21)
Merged 11 DataFrames into '성산구_오피스텔_merged' with shape (3298, 18)
Merged 11 DataFrames into '마산합포구_단독다가구_merged' with shape (4696, 17)
Merged 11 DataFrames into '마산합포구_연립다세대_merged' with shape (971, 21)
Merged 11 DataFrames into '마산합포구_아파트_merged' with shape (17754, 21)
Merged 11 DataFrames into '마산합포구_오피스텔_merged' with shape (2126, 18)
Merged 11 DataFrames into '마산회원구_단독다가구_merged' with shape (4078, 17)
Merged 11 DataFrames into '마산회원구_연립다세대_merged' with shape (2046, 21)
Merged 11 DataFrames into '마산회원구_아파트_merged' with shape (19

# 

## 필요에 따라 도로명 주소를 좌표로 변환하기 위한 코드블럭.

In [10]:
def stripped_road_names(df, column='도로명'):
    """Strip *column* of *df* in place and return its unique non-null values.

    Args:
        df: DataFrame to inspect. Its *column* is overwritten with the
            whitespace-stripped values, as the inline code did before.
        column: Name of the column holding the road names.

    Returns:
        A set of the distinct non-null values after stripping. The set is
        empty when the column is absent or holds no non-null value; both
        cases would otherwise raise (``KeyError`` on the lookup, or an
        ``AttributeError`` from the ``.str`` accessor on a non-string column).
    """
    if column not in df.columns:
        return set()
    if df[column].dropna().empty:
        return set()
    df[column] = df[column].str.strip()
    return set(df[column].dropna().unique())


# Collect the unique values of the '도로명' column across all merged frames
# (module-level names ending in "_merged").
unique_road_names = set()
for var_name in list(globals().keys()):
    if var_name.endswith("_merged"):
        df = globals()[var_name]
        unique_road_names.update(stripped_road_names(df))

unique_road_names = sorted(unique_road_names)
print(f"Extracted {len(unique_road_names)} unique road names.")

Extracted 5223 unique road names.


In [11]:
# Persist the road names as a one-column CSV file.
_road_names_csv = './csv_merged/unique_road_names.csv'
road_names_df = pd.DataFrame(data=unique_road_names, columns=['도로명'])
road_names_df.to_csv(_road_names_csv, index=False)