# Functions

In [1]:
import pandas as pd
import numpy as np
import re
from typing import Dict, Tuple

def process_election_data(election_number: str) -> pd.DataFrame:
    """
    특정 선거 회차의 데이터를 처리하여 병합된 결과를 반환

    Args:
        election_number (str): 선거 회차 (예: '14', '15', '16' 등)

    Returns:
        pd.DataFrame: 병합된 선거 데이터
    """

    # URL 생성
    df1_url = f"https://raw.githubusercontent.com/sw1kwon/korean-elections/refs/heads/main/temp/v1_p/temp1_president_{election_number}.csv"
    df2_url = f"https://raw.githubusercontent.com/sw1kwon/korean-elections/refs/heads/main/temp/v2_2_p/temp2_2_president_{election_number}.csv"

    print(f"=== {election_number}회 대선 데이터 처리 시작 ===")
    print(f"상세 데이터 URL: {df1_url}")
    print(f"요약 데이터 URL: {df2_url}")

    try:
        # 첫 번째 CSV 파일 처리 (상세 득표 데이터)
        df1 = pd.read_csv(df1_url)
        print(f"상세 데이터 로드 완료: {df1.shape}")

        # 득표수_{숫자}_정당_이름 패턴의 컬럼들 찾기
        vote_columns = [col for col in df1.columns if col.startswith('득표수_') and col != '득표수_계']
        print(f"득표수 관련 컬럼 수: {len(vote_columns)}")

        # 각 행별로 1위와 2위 후보의 번호 찾기
        def find_top_two_candidates(row):
            vote_data = []

            # 모든 후보의 득표수와 번호를 수집
            for col in vote_columns:
                votes = row[col] if pd.notna(row[col]) else 0
                # 컬럼명에서 숫자 추출
                match = re.search(r'득표수_(\d+)_', col)
                if match:
                    candidate_number = int(match.group(1))
                    vote_data.append((votes, candidate_number, col))

            # 득표수 기준으로 내림차순 정렬
            vote_data.sort(key=lambda x: x[0], reverse=True)

            # 1위와 2위 후보 반환
            first_place = vote_data[0] if len(vote_data) > 0 else (0, None, None)
            second_place = vote_data[1] if len(vote_data) > 1 else (0, None, None)

            return first_place, second_place

        # 1위 후보 번호 찾기
        def find_winner_number(row):
            first_place, _ = find_top_two_candidates(row)
            return first_place[1]

        # 2위 후보 번호 찾기
        def find_second_number(row):
            _, second_place = find_top_two_candidates(row)
            return second_place[1]

        # 1위 득표율 계산
        def calculate_vote_rate_1st(row):
            first_place, _ = find_top_two_candidates(row)
            max_votes = first_place[0]
            total_votes = row['득표수_계'] if pd.notna(row['득표수_계']) else 0

            if total_votes > 0:
                return max_votes / total_votes
            else:
                return 0

        # 2위 득표율 계산
        def calculate_vote_rate_2nd(row):
            _, second_place = find_top_two_candidates(row)
            second_votes = second_place[0]
            total_votes = row['득표수_계'] if pd.notna(row['득표수_계']) else 0

            if total_votes > 0:
                return second_votes / total_votes
            else:
                return 0

        # 새로운 컬럼 추가 (1위, 2위)
        df1['득표_1위_후보번호'] = df1.apply(find_winner_number, axis=1)
        df1['득표_1위_득표율'] = df1.apply(calculate_vote_rate_1st, axis=1)
        df1['득표_2위_후보번호'] = df1.apply(find_second_number, axis=1)
        df1['득표_2위_득표율'] = df1.apply(calculate_vote_rate_2nd, axis=1)

        # 정당 카테고리 매핑 딕셔너리 (회차별로 다를 수 있음)
        # 예: vote_columns = ['득표수_1_민주자유당_김영삼', '득표수_2_민주당_김대중', ...]
        # -> get_category_mapping이 실제 존재하는 후보번호만 찾아서 매핑 생성
        category_mapping = get_category_mapping(election_number, vote_columns)
        print(f"생성된 카테고리 매핑: {category_mapping}")

        # 1위, 2위 후보번호를 카테고리로 매핑
        # fillna()로 매핑되지 않은 후보는 '기타'로 처리
        df1['득표_1위_정당'] = df1['득표_1위_후보번호'].map(category_mapping).fillna('기타')
        df1['득표_2위_정당'] = df1['득표_2위_후보번호'].map(category_mapping).fillna('기타')

        # 매핑 결과 확인
        print("매핑 후 1위 정당 분포 (처리 중):")
        print(df1['득표_1위_정당'].value_counts())

        # 매핑되지 않은 후보 확인
        unmapped_1st = df1[df1['득표_1위_정당'] == '기타']['득표_1위_후보번호'].unique()
        unmapped_2nd = df1[df1['득표_2위_정당'] == '기타']['득표_2위_후보번호'].unique()
        if len(unmapped_1st) > 0:
            print(f"경고: 1위에서 매핑되지 않은 후보번호: {unmapped_1st}")
        if len(unmapped_2nd) > 0:
            print(f"경고: 2위에서 매핑되지 않은 후보번호: {unmapped_2nd}")

        # 카테고리별 후보자 수 계산 (모든 카테고리 포함)
        candidate_counts = {}
        all_categories = ['보수정당', '진보정당', '그외정당', '무소속']

        # 먼저 모든 카테고리를 0으로 초기화
        for category in all_categories:
            candidate_counts[category] = 0

        # 실제 매핑에서 카운트
        for candidate_num, category in category_mapping.items():
            if category in candidate_counts:
                candidate_counts[category] += 1
            else:
                candidate_counts[category] = 1

        print(f"카테고리별 후보자 수: {candidate_counts}")

        # 각 카테고리별로 개별 컬럼 생성 (없는 카테고리도 0으로 포함)
        for category in all_categories:
            candidate_count = candidate_counts.get(category, 0)  # 없으면 0
            df1[f'{category}_후보자수'] = candidate_count
            print(f"  {category}_후보자수: {candidate_count}")

        # 필요한 컬럼들만 선택 (병합용)
        merge_columns = ['시도', '구시군', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
                         '득표_1위_정당', '득표_2위_정당'] + [f'{cat}_후보자수' for cat in all_categories]

        df1_for_merge = df1[merge_columns].copy()

        # 두 번째 CSV 파일 읽기 (요약 데이터)
        df2 = pd.read_csv(df2_url)
        print(f"요약 데이터 로드 완료: {df2.shape}")

        # 시도, 구시군을 키로 하여 병합
        merged_df = pd.merge(df2, df1_for_merge, on=['시도', '구시군'], how='left')

        # 병합 결과 검증
        missing_data = merged_df[merged_df['득표_1위_후보번호'].isna()]
        if len(missing_data) > 0:
            print(f"경고: 병합되지 않은 데이터가 {len(missing_data)}개 있습니다")
        else:
            print("모든 데이터가 성공적으로 병합되었습니다!")

        print(f"최종 데이터 형태: {merged_df.shape}")
        print(f"1위 정당 분포:")
        print(merged_df['득표_1위_정당'].value_counts())
        print(f"=== {election_number}회 대선 데이터 처리 완료 ===\n")

        return merged_df

    except Exception as e:
        print(f"오류 발생: {e}")
        return None

def get_category_mapping(election_number: str, vote_columns: list) -> Dict[int, str]:
    """
    선거 회차별로 정당 카테고리 매핑을 반환
    각 선거마다 후보자와 정당이 다르므로 수동으로 설정 필요

    Args:
        election_number: 선거 회차
        vote_columns: 득표수 컬럼 리스트 (후보 확인용)

    Returns:
        해당 선거의 후보번호별 카테고리 매핑
    """

    print(f"\n=== {election_number}회 선거 후보 정보 ===")
    print("실제 후보 컬럼들:")
    for col in vote_columns:
        print(f"  {col}")

    # 선거 회차별 매핑 설정
    if election_number == '14':  # 14회 대선 (1992년)
        mapping = {
            1: '보수정당',    # 민주자유당_김영삼
            2: '진보정당',    # 민주당_김대중
            3: '그외정당',    # 통일국민당_정주영
            5: '그외정당',    # 신정치개혁당_박찬종
            6: '그외정당',    # 대한정의당_이병호
            7: '무소속',      # 무소속_김옥선
            8: '무소속'       # 무소속_백기완
        }

    elif election_number == '15':  # 15회 대선 (1997년) - 예시
        mapping = {
            1: '보수정당',    # 한나라당_이회창
            2: '진보정당',    # 새정치국민회의_김대중
            3: '그외정당',    # 국민신당_이인제
            4: '그외정당',    # 민주당_권영길
            5: '그외정당',
            6: '그외정당',
            7: '그외정당'
        }

    elif election_number == '16':  # 16회 대선 (2002년) - 예시
        mapping = {
            1: '보수정당',    # 한나라당_이회창
            2: '진보정당',    # 새천년민주당_노무현
            3: '그외정당',    # 국민통합21_정몽준
            4: '그외정당',
            5: '그외정당',
            6: '그외정당'
        }

    elif election_number == '17':  # 17회 대선 (2007년) - 예시
        mapping = {
            1: '진보정당',
            2: '보수정당',
            3: '그외정당',
            4: '그외정당',
            5: '그외정당',
            6: '그외정당',
            7: '그외정당',
            8: '그외정당',
            9: '그외정당',
            10: '그외정당',
            11: '무소속'
        }

    elif election_number == '18':  # 18회 대선 (2012년) - 예시
        mapping = {
            1: '보수정당',    # 새누리당_박근혜
            2: '진보정당',    # 민주통합당_문재인
            4: '무소속',
            5: '무소속',
            6: '무소속',
            7: '무소속'
        }

    elif election_number == '19':  # 19회 대선 (2017년) - 예시
        mapping = {
            1: '진보정당',    # 더불어민주당_문재인
            2: '보수정당',    # 자유한국당_홍준표
            3: '그외정당',    # 국민의당_안철수
            4: '그외정당',    # 바른정당_유승민
            5: '그외정당',    # 정의당_심상정
            6: '그외정당',
            7: '그외정당',
            8: '그외정당',
            9: '그외정당',
            10: '그외정당',
            12: '그외정당',
            14: '그외정당',
            15: '무소속'
        }

    elif election_number == '20':  # 20회 대선 (2022년) - 예시
        mapping = {
            1: '진보정당',
            2: '보수정당',
            3: '그외정당',
            5: '그외정당',
            6: '그외정당',
            7: '그외정당',
            8: '그외정당',
            10: '그외정당',
            11: '그외정당',
            12: '그외정당',
            13: '그외정당',
            14: '그외정당'
        }

    elif election_number == '21':
        mapping = {
            1: '진보정당',
            2: '보수정당',
            4: '그외정당',
            5: '그외정당',
            8: '무소속'
        }

    else:
        print(f"경고: {election_number}회 선거에 대한 매핑이 정의되지 않았습니다.")
        print("기본 매핑을 사용합니다. 수동으로 매핑을 추가해주세요.")
        # 기본 매핑 (수정 필요)
        mapping = {
            1: '보수정당',
            2: '진보정당',
            3: '그외정당',
            4: '그외정당',
            5: '그외정당',
            6: '그외정당',
            7: '무소속',
            8: '무소속'
        }

    # 실제 존재하는 후보 번호만 추출
    existing_candidates = set()
    for col in vote_columns:
        match = re.search(r'득표수_(\d+)_', col)
        if match:
            existing_candidates.add(int(match.group(1)))

    print(f"실제 존재하는 후보번호: {sorted(existing_candidates)}")

    # 존재하는 후보에 대해서만 매핑 적용
    filtered_mapping = {k: v for k, v in mapping.items() if k in existing_candidates}

    # 매핑되지 않은 후보 번호 확인
    unmapped_candidates = existing_candidates - set(mapping.keys())
    if unmapped_candidates:
        print(f"경고: 매핑되지 않은 후보번호들: {sorted(unmapped_candidates)}")
        print("이 후보들은 '기타' 카테고리로 분류됩니다.")

    print(f"적용된 매핑: {filtered_mapping}")
    print("=" * 50)

    return filtered_mapping

def process_multiple_elections(election_numbers: list) -> Dict[str, pd.DataFrame]:
    """
    여러 선거 회차를 일괄 처리하여 df_숫자 형태로 변수 저장

    Args:
        election_numbers (list): 처리할 선거 회차 리스트

    Returns:
        Dict[str, pd.DataFrame]: 회차별 처리된 데이터프레임 딕셔너리
    """
    results = {}

    for election_num in election_numbers:
        print(f"\n{'='*50}")
        result_df = process_election_data(str(election_num))

        if result_df is not None:
            # df_14, df_15 형태로 변수명 지정
            var_name = f'df_{election_num}'
            results[var_name] = result_df

            print(f"데이터프레임 저장: {var_name} (shape: {result_df.shape})")
        else:
            print(f"{election_num}회 선거 데이터 처리 실패")

    return results

def save_dataframes_to_globals(election_numbers: list):
    """
    선거 데이터를 처리하여 글로벌 변수로 저장 (df_14, df_15 등)

    Args:
        election_numbers (list): 처리할 선거 회차 리스트
    """
    results = process_multiple_elections(election_numbers)

    # 글로벌 변수로 각 데이터프레임 저장
    global_vars = globals()

    for var_name, df in results.items():
        global_vars[var_name] = df
        print(f"글로벌 변수 생성: {var_name}")

    print(f"\n생성된 변수들:")
    for var_name in results.keys():
        print(f"  {var_name}: {results[var_name].shape}")

    return results

# # 사용 예시
# if __name__ == "__main__":
#     # 방법 1: 딕셔너리로 받기
#     election_results = process_multiple_elections(['14', '15', '16', '17', '18', '19', '20', '21'])

#     # 개별 접근 예시:
#     # df_14 = election_results['df_14']
#     # df_15 = election_results['df_15']

#     # 방법 2: 글로벌 변수로 직접 생성 (주석 해제하여 사용)
#     # save_dataframes_to_globals(['14', '15', '16', '17', '18', '19', '20', '21'])

#     # 그러면 df_14, df_15, df_16, ... 변수들이 자동 생성됨

# Preprocessing & Merge

- 14회 4번 후보는 사퇴로 제외

In [2]:
election_results = process_multiple_elections(['14', '15', '16', '17', '18', '19', '20', '21'])


=== 14회 대선 데이터 처리 시작 ===
상세 데이터 URL: https://raw.githubusercontent.com/sw1kwon/korean-elections/refs/heads/main/temp/v1_p/temp1_president_14.csv
요약 데이터 URL: https://raw.githubusercontent.com/sw1kwon/korean-elections/refs/heads/main/temp/v2_2_p/temp2_2_president_14.csv
상세 데이터 로드 완료: (325, 15)
득표수 관련 컬럼 수: 8

=== 14회 선거 후보 정보 ===
실제 후보 컬럼들:
  득표수_1_민주자유당_김영삼
  득표수_2_민주당_김대중
  득표수_3_통일국민당_정주영
  득표수_4_새한국당_이종찬
  득표수_5_신정치개혁당_박찬종
  득표수_6_대한정의당_이병호
  득표수_7_무소속_김옥선
  득표수_8_무소속_백기완
실제 존재하는 후보번호: [1, 2, 3, 4, 5, 6, 7, 8]
경고: 매핑되지 않은 후보번호들: [4]
이 후보들은 '기타' 카테고리로 분류됩니다.
적용된 매핑: {1: '보수정당', 2: '진보정당', 3: '그외정당', 5: '그외정당', 6: '그외정당', 7: '무소속', 8: '무소속'}
생성된 카테고리 매핑: {1: '보수정당', 2: '진보정당', 3: '그외정당', 5: '그외정당', 6: '그외정당', 7: '무소속', 8: '무소속'}
매핑 후 1위 정당 분포 (처리 중):
득표_1위_정당
보수정당    221
진보정당     99
그외정당      5
Name: count, dtype: int64
카테고리별 후보자 수: {'보수정당': 1, '진보정당': 1, '그외정당': 3, '무소속': 2}
  보수정당_후보자수: 1
  진보정당_후보자수: 1
  그외정당_후보자수: 3
  무소속_후보자수: 2
요약 데이터 로드 완료: (325, 11)
모든 데이터가 성공적으로 병합되었습니다!
최종 데

# 14th_1992

In [3]:
df_14 = election_results['df_14']

In [4]:
df_14

Unnamed: 0,시도,구시군,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,...,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,득표_1위_정당,득표_2위_정당,보수정당_후보자수,진보정당_후보자수,그외정당_후보자수,무소속_후보자수
0,전국,합계,28676547,24095170,9977332,8041284,5431853,324940,23775409,319761,...,1,0.419649,2,0.338219,보수정당,진보정당,1,1,3,2
1,서울,합계,7235830,6021311,2167298,2246636,1456961,80882,5951777,69534,...,2,0.377473,1,0.364143,진보정당,보수정당,1,1,3,2
2,서울,종로구,158824,130419,41396,50226,34764,1884,128270,2149,...,2,0.391565,1,0.322726,진보정당,보수정당,1,1,3,2
3,서울,중구,123335,101005,34093,41814,22368,1320,99595,1410,...,2,0.419840,1,0.342316,진보정당,보수정당,1,1,3,2
4,서울,용산구,198704,161166,59755,57144,40041,2149,159089,2077,...,1,0.375607,2,0.359195,보수정당,진보정당,1,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,제주,합계,319824,265252,104292,85889,65770,4933,260884,4368,...,1,0.399764,2,0.329223,보수정당,진보정당,1,1,3,2
323,제주,제주시,146005,120565,45617,39122,32250,2017,119006,1559,...,1,0.383317,2,0.328740,보수정당,진보정당,1,1,3,2
324,제주,북제주군,66346,54713,23957,15726,12626,1229,53538,1175,...,1,0.447477,2,0.293735,보수정당,진보정당,1,1,3,2
325,제주,서귀포시,53574,44840,16042,16955,10384,768,44149,691,...,2,0.384040,1,0.363360,진보정당,보수정당,1,1,3,2


## preprocessing

In [5]:
df_14['시도'].unique()

array(['전국', '서울', '부산', '대구', '인천', '광주', '대전', '경기', '강원', '충북', '충남',
       '전북', '전남', '경북', '경남', '제주'], dtype=object)

In [6]:
df_14.columns

Index(['시도', '구시군', '선거인수', '투표수', '보수정당', '진보정당', '그외정당', '무소속', '득표수_계',
       '무효투표수', '기권수', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
       '득표_1위_정당', '득표_2위_정당', '보수정당_후보자수', '진보정당_후보자수', '그외정당_후보자수',
       '무소속_후보자수'],
      dtype='object')

In [7]:
# 고정 열 순서 정의
fixed_cols = [
    '시도', '구시군', '선거년도', '선거종류',
    '득표_1위_정당', '득표_2위_정당',
    '득표_1위_후보번호', '득표_1위_득표율',
    '득표_2위_후보번호', '득표_2위_득표율',
    '보수정당_후보자수', '진보정당_후보자수',
    '그외정당_후보자수', '무소속_후보자수'
]

# 나머지 열: fixed_cols에 포함되지 않은 열
other_cols = [col for col in df_14.columns if col not in fixed_cols]

# 메소드 체이닝
df_14 = (
    df_14
    .assign(선거종류='대통령', 선거년도='1992')
    .loc[:, fixed_cols + other_cols]
    .rename(columns={'시도': '지역'})
)

In [8]:
df_14

Unnamed: 0,지역,구시군,선거년도,선거종류,득표_1위_정당,득표_2위_정당,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,...,무소속_후보자수,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,기권수
0,전국,합계,1992,대통령,보수정당,진보정당,1,0.419649,2,0.338219,...,2,28676547,24095170,9977332,8041284,5431853,324940,23775409,319761,4581377
1,서울,합계,1992,대통령,진보정당,보수정당,2,0.377473,1,0.364143,...,2,7235830,6021311,2167298,2246636,1456961,80882,5951777,69534,1214519
2,서울,종로구,1992,대통령,진보정당,보수정당,2,0.391565,1,0.322726,...,2,158824,130419,41396,50226,34764,1884,128270,2149,28405
3,서울,중구,1992,대통령,진보정당,보수정당,2,0.419840,1,0.342316,...,2,123335,101005,34093,41814,22368,1320,99595,1410,22330
4,서울,용산구,1992,대통령,보수정당,진보정당,1,0.375607,2,0.359195,...,2,198704,161166,59755,57144,40041,2149,159089,2077,37538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,제주,합계,1992,대통령,보수정당,진보정당,1,0.399764,2,0.329223,...,2,319824,265252,104292,85889,65770,4933,260884,4368,54572
323,제주,제주시,1992,대통령,보수정당,진보정당,1,0.383317,2,0.328740,...,2,146005,120565,45617,39122,32250,2017,119006,1559,25440
324,제주,북제주군,1992,대통령,보수정당,진보정당,1,0.447477,2,0.293735,...,2,66346,54713,23957,15726,12626,1229,53538,1175,11633
325,제주,서귀포시,1992,대통령,진보정당,보수정당,2,0.384040,1,0.363360,...,2,53574,44840,16042,16955,10384,768,44149,691,8734


## v4.1 ~ v4.3

In [9]:
# 1. 전체 데이터 저장
df_14.to_csv("temp4_1_president_14.csv", index=False, encoding="utf-8-sig")

# 2. '구시군' != '합계' 행만 추출하여 저장
(
    df_14
    .query("구시군 != '합계'")
    .to_csv("temp4_2_president_14.csv", index=False, encoding="utf-8-sig")
)

# 3. '구시군' == '합계' 행만 추출 후 '구시군' 열 제거하여 저장
(
    df_14
    .query("구시군 == '합계'")
    .drop(columns="구시군")
    .to_csv("temp4_3_president_14.csv", index=False, encoding="utf-8-sig")
)

# 15th_1997

In [10]:
df_15 = election_results['df_15']

In [11]:
df_15

Unnamed: 0,시도,구시군,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,...,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,득표_1위_정당,득표_2위_정당,보수정당_후보자수,진보정당_후보자수,그외정당_후보자수,무소속_후보자수
0,전국,합계,32290416,26042633,9935718,10326275,5380445,0,25642438,400195,...,2,0.402703,1,0.387472,진보정당,보수정당,1,1,5,0
1,서울,합계,7358547,5926743,2394309,2627308,833156,0,5854773,71970,...,2,0.448746,1,0.408950,진보정당,보수정당,1,1,5,0
2,서울,종로구,145809,116117,48664,52381,13550,0,114595,1522,...,2,0.457097,1,0.424661,진보정당,보수정당,1,1,5,0
3,서울,중구,96000,76174,29633,36072,9388,0,75093,1081,...,2,0.480364,1,0.394617,진보정당,보수정당,1,1,5,0
4,서울,용산구,180295,141083,61050,60486,17650,0,139186,1897,...,1,0.438622,2,0.434570,보수정당,진보정당,1,1,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,제주,합계,361680,278999,100103,111009,62465,0,273577,5422,...,2,0.405769,1,0.365904,진보정당,보수정당,1,1,5,0
316,제주,제주시,175309,135454,53153,53254,27106,0,133513,1941,...,2,0.398868,1,0.398111,진보정당,보수정당,1,1,5,0
317,제주,북제주군,71558,54622,18519,20105,14454,0,53078,1544,...,2,0.378782,1,0.348902,진보정당,보수정당,1,1,5,0
318,제주,서귀포시,58547,45135,14768,20392,9220,0,44380,755,...,2,0.459486,1,0.332763,진보정당,보수정당,1,1,5,0


## preprocessing

In [12]:
df_15['시도'].unique()

array(['전국', '서울', '부산', '대구', '인천', '광주', '대전', '울산', '경기', '강원', '충북',
       '충남', '전북', '전남', '경북', '경남', '제주'], dtype=object)

In [13]:
df_15.columns

Index(['시도', '구시군', '선거인수', '투표수', '보수정당', '진보정당', '그외정당', '무소속', '득표수_계',
       '무효투표수', '기권수', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
       '득표_1위_정당', '득표_2위_정당', '보수정당_후보자수', '진보정당_후보자수', '그외정당_후보자수',
       '무소속_후보자수'],
      dtype='object')

In [14]:
# 고정 열 순서 정의
fixed_cols = [
    '시도', '구시군', '선거년도', '선거종류',
    '득표_1위_정당', '득표_2위_정당',
    '득표_1위_후보번호', '득표_1위_득표율',
    '득표_2위_후보번호', '득표_2위_득표율',
    '보수정당_후보자수', '진보정당_후보자수',
    '그외정당_후보자수', '무소속_후보자수'
]

# 나머지 열: fixed_cols에 포함되지 않은 열
other_cols = [col for col in df_15.columns if col not in fixed_cols]

# 메소드 체이닝
df_15 = (
    df_15
    .assign(선거종류='대통령', 선거년도='1997')
    .loc[:, fixed_cols + other_cols]
    .rename(columns={'시도': '지역'})
)

In [15]:
df_15

Unnamed: 0,지역,구시군,선거년도,선거종류,득표_1위_정당,득표_2위_정당,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,...,무소속_후보자수,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,기권수
0,전국,합계,1997,대통령,진보정당,보수정당,2,0.402703,1,0.387472,...,0,32290416,26042633,9935718,10326275,5380445,0,25642438,400195,6247783
1,서울,합계,1997,대통령,진보정당,보수정당,2,0.448746,1,0.408950,...,0,7358547,5926743,2394309,2627308,833156,0,5854773,71970,1431804
2,서울,종로구,1997,대통령,진보정당,보수정당,2,0.457097,1,0.424661,...,0,145809,116117,48664,52381,13550,0,114595,1522,29692
3,서울,중구,1997,대통령,진보정당,보수정당,2,0.480364,1,0.394617,...,0,96000,76174,29633,36072,9388,0,75093,1081,19826
4,서울,용산구,1997,대통령,보수정당,진보정당,1,0.438622,2,0.434570,...,0,180295,141083,61050,60486,17650,0,139186,1897,39212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,제주,합계,1997,대통령,진보정당,보수정당,2,0.405769,1,0.365904,...,0,361680,278999,100103,111009,62465,0,273577,5422,82681
316,제주,제주시,1997,대통령,진보정당,보수정당,2,0.398868,1,0.398111,...,0,175309,135454,53153,53254,27106,0,133513,1941,39855
317,제주,북제주군,1997,대통령,진보정당,보수정당,2,0.378782,1,0.348902,...,0,71558,54622,18519,20105,14454,0,53078,1544,16936
318,제주,서귀포시,1997,대통령,진보정당,보수정당,2,0.459486,1,0.332763,...,0,58547,45135,14768,20392,9220,0,44380,755,13412


## v4.1 ~ v4.3

In [16]:
# 1. 전체 데이터 저장
df_15.to_csv("temp4_1_president_15.csv", index=False, encoding="utf-8-sig")

# 2. '구시군' != '합계' 행만 추출하여 저장
(
    df_15
    .query("구시군 != '합계'")
    .to_csv("temp4_2_president_15.csv", index=False, encoding="utf-8-sig")
)

# 3. '구시군' == '합계' 행만 추출 후 '구시군' 열 제거하여 저장
(
    df_15
    .query("구시군 == '합계'")
    .drop(columns="구시군")
    .to_csv("temp4_3_president_15.csv", index=False, encoding="utf-8-sig")
)

# 16th_2002

In [17]:
df_16 = election_results['df_16']

In [18]:
df_16

Unnamed: 0,시도,구시군,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,...,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,득표_1위_정당,득표_2위_정당,보수정당_후보자수,진보정당_후보자수,그외정당_후보자수,무소속_후보자수
0,전국,합계,34991529,24784963,11443297,12014277,1104342,0,24561916,223047,...,2,0.489143,1,0.465896,진보정당,보수정당,1,1,4,0
1,서울,합계,7670682,5475715,2447376,2792957,203657,0,5443990,31725,...,2,0.513035,1,0.449556,진보정당,보수정당,1,1,4,0
2,서울,종로구,140105,99988,45901,49989,3491,0,99381,607,...,2,0.503004,1,0.461869,진보정당,보수정당,1,1,4,0
3,서울,중구,108936,76499,33712,39876,2404,0,75992,507,...,2,0.524739,1,0.443626,진보정당,보수정당,1,1,4,0
4,서울,용산구,184276,127810,61349,61437,4276,0,127062,748,...,2,0.483520,1,0.482827,진보정당,보수정당,1,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,제주,합계,391151,268227,105744,148423,10632,0,264799,3428,...,2,0.560512,1,0.399337,진보정당,보수정당,1,1,4,0
257,제주,제주시,198820,136002,55892,73679,5302,0,134873,1129,...,2,0.546284,1,0.414405,진보정당,보수정당,1,1,4,0
258,제주,북제주군,74769,51969,20345,28501,2028,0,50874,1095,...,2,0.560227,1,0.399910,진보정당,보수정당,1,1,4,0
259,제주,서귀포시,61099,41151,15077,24026,1595,0,40698,453,...,2,0.590348,1,0.370460,진보정당,보수정당,1,1,4,0


## preprocessing

In [19]:
df_16['시도'].unique()

array(['전국', '서울', '부산', '대구', '인천', '광주', '대전', '울산', '경기', '강원', '충북',
       '충남', '전북', '전남', '경북', '경남', '제주'], dtype=object)

In [20]:
df_16.columns

Index(['시도', '구시군', '선거인수', '투표수', '보수정당', '진보정당', '그외정당', '무소속', '득표수_계',
       '무효투표수', '기권수', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
       '득표_1위_정당', '득표_2위_정당', '보수정당_후보자수', '진보정당_후보자수', '그외정당_후보자수',
       '무소속_후보자수'],
      dtype='object')

In [21]:
# 고정 열 순서 정의
fixed_cols = [
    '시도', '구시군', '선거년도', '선거종류',
    '득표_1위_정당', '득표_2위_정당',
    '득표_1위_후보번호', '득표_1위_득표율',
    '득표_2위_후보번호', '득표_2위_득표율',
    '보수정당_후보자수', '진보정당_후보자수',
    '그외정당_후보자수', '무소속_후보자수'
]

# 나머지 열: fixed_cols에 포함되지 않은 열
other_cols = [col for col in df_16.columns if col not in fixed_cols]

# 메소드 체이닝
df_16 = (
    df_16
    .assign(선거종류='대통령', 선거년도='2002')
    .loc[:, fixed_cols + other_cols]
    .rename(columns={'시도': '지역'})
)

In [22]:
df_16

Unnamed: 0,지역,구시군,선거년도,선거종류,득표_1위_정당,득표_2위_정당,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,...,무소속_후보자수,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,기권수
0,전국,합계,2002,대통령,진보정당,보수정당,2,0.489143,1,0.465896,...,0,34991529,24784963,11443297,12014277,1104342,0,24561916,223047,10206566
1,서울,합계,2002,대통령,진보정당,보수정당,2,0.513035,1,0.449556,...,0,7670682,5475715,2447376,2792957,203657,0,5443990,31725,2194967
2,서울,종로구,2002,대통령,진보정당,보수정당,2,0.503004,1,0.461869,...,0,140105,99988,45901,49989,3491,0,99381,607,40117
3,서울,중구,2002,대통령,진보정당,보수정당,2,0.524739,1,0.443626,...,0,108936,76499,33712,39876,2404,0,75992,507,32437
4,서울,용산구,2002,대통령,진보정당,보수정당,2,0.483520,1,0.482827,...,0,184276,127810,61349,61437,4276,0,127062,748,56466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,제주,합계,2002,대통령,진보정당,보수정당,2,0.560512,1,0.399337,...,0,391151,268227,105744,148423,10632,0,264799,3428,122924
257,제주,제주시,2002,대통령,진보정당,보수정당,2,0.546284,1,0.414405,...,0,198820,136002,55892,73679,5302,0,134873,1129,62818
258,제주,북제주군,2002,대통령,진보정당,보수정당,2,0.560227,1,0.399910,...,0,74769,51969,20345,28501,2028,0,50874,1095,22800
259,제주,서귀포시,2002,대통령,진보정당,보수정당,2,0.590348,1,0.370460,...,0,61099,41151,15077,24026,1595,0,40698,453,19948


## v4.1 ~ v4.3

In [23]:
# 1. 전체 데이터 저장
df_16.to_csv("temp4_1_president_16.csv", index=False, encoding="utf-8-sig")

# 2. '구시군' != '합계' 행만 추출하여 저장
(
    df_16
    .query("구시군 != '합계'")
    .to_csv("temp4_2_president_16.csv", index=False, encoding="utf-8-sig")
)

# 3. '구시군' == '합계' 행만 추출 후 '구시군' 열 제거하여 저장
(
    df_16
    .query("구시군 == '합계'")
    .drop(columns="구시군")
    .to_csv("temp4_3_president_16.csv", index=False, encoding="utf-8-sig")
)

# 17th_2007

In [24]:
df_17 = election_results['df_17']

In [25]:
df_17

Unnamed: 0,시도,구시군,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,...,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,득표_1위_정당,득표_2위_정당,보수정당_후보자수,진보정당_후보자수,그외정당_후보자수,무소속_후보자수
0,전국,합계,37653518,23732854,11492389,6174681,2385847,3559963,23612880,119974,...,2,0.486700,1,0.261496,보수정당,진보정당,1,1,7,1
1,서울,합계,8051696,5066022,2689162,1237812,528169,596226,5051369,14653,...,2,0.532363,1,0.245045,보수정당,진보정당,1,1,7,1
2,서울,종로구,133946,85480,45172,21515,8486,9984,85157,323,...,2,0.530456,1,0.252651,보수정당,진보정당,1,1,7,1
3,서울,중구,106837,66496,35335,17299,6373,7278,66285,211,...,2,0.533077,1,0.260979,보수정당,진보정당,1,1,7,1
4,서울,용산구,190910,117051,66096,25967,11087,13554,116704,347,...,2,0.566356,1,0.222503,보수정당,진보정당,1,1,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,경남,거창군,50823,34594,19863,3637,3399,6982,33881,713,...,2,0.586258,11,0.206074,보수정당,무소속,1,1,7,1
261,경남,합천군,45959,31741,18867,2815,2889,6339,30910,831,...,2,0.610385,11,0.205079,보수정당,무소속,1,1,7,1
262,제주,합계,414022,252111,96495,81570,33962,37495,249522,2589,...,2,0.386719,1,0.326905,보수정당,진보정당,1,1,7,1
263,제주,제주시,296325,181304,70297,56798,24685,27824,179604,1700,...,2,0.391400,1,0.316240,보수정당,진보정당,1,1,7,1


## preprocessing

In [26]:
df_17['시도'].unique()

array(['전국', '서울', '부산', '대구', '인천', '광주', '대전', '울산', '경기', '강원', '충북',
       '충남', '전북', '전남', '경북', '경남', '제주'], dtype=object)

In [27]:
df_17.columns

Index(['시도', '구시군', '선거인수', '투표수', '보수정당', '진보정당', '그외정당', '무소속', '득표수_계',
       '무효투표수', '기권수', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
       '득표_1위_정당', '득표_2위_정당', '보수정당_후보자수', '진보정당_후보자수', '그외정당_후보자수',
       '무소속_후보자수'],
      dtype='object')

In [28]:
# 고정 열 순서 정의
fixed_cols = [
    '시도', '구시군', '선거년도', '선거종류',
    '득표_1위_정당', '득표_2위_정당',
    '득표_1위_후보번호', '득표_1위_득표율',
    '득표_2위_후보번호', '득표_2위_득표율',
    '보수정당_후보자수', '진보정당_후보자수',
    '그외정당_후보자수', '무소속_후보자수'
]

# 나머지 열: fixed_cols에 포함되지 않은 열
other_cols = [col for col in df_17.columns if col not in fixed_cols]

# 메소드 체이닝
df_17 = (
    df_17
    .assign(선거종류='대통령', 선거년도='2007')
    .loc[:, fixed_cols + other_cols]
    .rename(columns={'시도': '지역'})
)

In [29]:
df_17

Unnamed: 0,지역,구시군,선거년도,선거종류,득표_1위_정당,득표_2위_정당,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,...,무소속_후보자수,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,기권수
0,전국,합계,2007,대통령,보수정당,진보정당,2,0.486700,1,0.261496,...,1,37653518,23732854,11492389,6174681,2385847,3559963,23612880,119974,13920664
1,서울,합계,2007,대통령,보수정당,진보정당,2,0.532363,1,0.245045,...,1,8051696,5066022,2689162,1237812,528169,596226,5051369,14653,2985674
2,서울,종로구,2007,대통령,보수정당,진보정당,2,0.530456,1,0.252651,...,1,133946,85480,45172,21515,8486,9984,85157,323,48466
3,서울,중구,2007,대통령,보수정당,진보정당,2,0.533077,1,0.260979,...,1,106837,66496,35335,17299,6373,7278,66285,211,40341
4,서울,용산구,2007,대통령,보수정당,진보정당,2,0.566356,1,0.222503,...,1,190910,117051,66096,25967,11087,13554,116704,347,73859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,경남,거창군,2007,대통령,보수정당,무소속,2,0.586258,11,0.206074,...,1,50823,34594,19863,3637,3399,6982,33881,713,16229
261,경남,합천군,2007,대통령,보수정당,무소속,2,0.610385,11,0.205079,...,1,45959,31741,18867,2815,2889,6339,30910,831,14218
262,제주,합계,2007,대통령,보수정당,진보정당,2,0.386719,1,0.326905,...,1,414022,252111,96495,81570,33962,37495,249522,2589,161911
263,제주,제주시,2007,대통령,보수정당,진보정당,2,0.391400,1,0.316240,...,1,296325,181304,70297,56798,24685,27824,179604,1700,115021


## v4.1 ~ v4.3

In [30]:
# 1. 전체 데이터 저장
df_17.to_csv("temp4_1_president_17.csv", index=False, encoding="utf-8-sig")

# 2. '구시군' != '합계' 행만 추출하여 저장
(
    df_17
    .query("구시군 != '합계'")
    .to_csv("temp4_2_president_17.csv", index=False, encoding="utf-8-sig")
)

# 3. '구시군' == '합계' 행만 추출 후 '구시군' 열 제거하여 저장
(
    df_17
    .query("구시군 == '합계'")
    .drop(columns="구시군")
    .to_csv("temp4_3_president_17.csv", index=False, encoding="utf-8-sig")
)

# 18th_2012

In [31]:
df_18 = election_results['df_18']

In [32]:
df_18

Unnamed: 0,시도,구시군,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,...,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,득표_1위_정당,득표_2위_정당,보수정당_후보자수,진보정당_후보자수,그외정당_후보자수,무소속_후보자수
0,전국,합계,40507842,30721459,15773128,14692632,0,128861,30594621,126838,...,1,0.515552,2,0.480236,보수정당,진보정당,1,1,0,4
1,서울특별시,합계,8393847,6307869,3024572,3227639,0,24488,6276699,31170,...,2,0.514226,1,0.481873,진보정당,보수정당,1,1,0,4
2,서울특별시,종로구,141447,103189,49422,52747,0,467,102636,553,...,2,0.513923,1,0.481527,진보정당,보수정당,1,1,0,4
3,서울특별시,중구,115277,83095,40289,41919,0,345,82553,542,...,2,0.507783,1,0.488038,진보정당,보수정당,1,1,0,4
4,서울특별시,용산구,206665,147849,76997,69572,0,565,147134,715,...,1,0.523312,2,0.472848,보수정당,진보정당,1,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,경상남도,거창군,51773,40103,28726,10466,0,402,39594,509,...,1,0.725514,2,0.264333,보수정당,진보정당,1,1,0,4
242,경상남도,합천군,43981,33586,25313,7114,0,536,32963,623,...,1,0.767922,2,0.215818,보수정당,진보정당,1,1,0,4
243,제주특별자치도,합계,451731,330967,166184,161235,0,1916,329335,1632,...,1,0.504605,2,0.489577,보수정당,진보정당,1,1,0,4
244,제주특별자치도,제주시,328450,241552,119563,119622,0,1301,240486,1066,...,2,0.497418,1,0.497172,진보정당,보수정당,1,1,0,4


## preprocessing

In [33]:
df_18['시도'].unique()

array(['전국', '서울특별시', '부산광역시', '대구광역시', '인천광역시', '광주광역시', '대전광역시',
       '울산광역시', '세종특별자치시', '경기도', '강원도', '충청북도', '충청남도', '전라북도', '전라남도',
       '경상북도', '경상남도', '제주특별자치도'], dtype=object)

In [34]:
# 1. 지역명 매핑 딕셔너리 정의
region_mapping = {
    '서울특별시': '서울',
    '부산광역시': '부산',
    '대구광역시': '대구',
    '인천광역시': '인천',
    '광주광역시': '광주',
    '대전광역시': '대전',
    '울산광역시': '울산',
    '세종특별자치시': '세종',
    '경기도': '경기',
    '강원도': '강원',
    '충청북도': '충북',
    '충청남도': '충남',
    '전라북도': '전북',
    '전라남도': '전남',
    '경상북도': '경북',
    '경상남도': '경남',
    '제주특별자치도': '제주'
}

# 2. 지역명 변경
df_18 = df_18.replace({'시도': region_mapping})

In [35]:
df_18.columns

Index(['시도', '구시군', '선거인수', '투표수', '보수정당', '진보정당', '그외정당', '무소속', '득표수_계',
       '무효투표수', '기권수', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
       '득표_1위_정당', '득표_2위_정당', '보수정당_후보자수', '진보정당_후보자수', '그외정당_후보자수',
       '무소속_후보자수'],
      dtype='object')

In [36]:
# 고정 열 순서 정의
fixed_cols = [
    '시도', '구시군', '선거년도', '선거종류',
    '득표_1위_정당', '득표_2위_정당',
    '득표_1위_후보번호', '득표_1위_득표율',
    '득표_2위_후보번호', '득표_2위_득표율',
    '보수정당_후보자수', '진보정당_후보자수',
    '그외정당_후보자수', '무소속_후보자수'
]

# 나머지 열: fixed_cols에 포함되지 않은 열
other_cols = [col for col in df_18.columns if col not in fixed_cols]

# 메소드 체이닝
df_18 = (
    df_18
    .assign(선거종류='대통령', 선거년도='2012')
    .loc[:, fixed_cols + other_cols]
    .rename(columns={'시도': '지역'})
)

In [37]:
df_18

Unnamed: 0,지역,구시군,선거년도,선거종류,득표_1위_정당,득표_2위_정당,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,...,무소속_후보자수,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,기권수
0,전국,합계,2012,대통령,보수정당,진보정당,1,0.515552,2,0.480236,...,4,40507842,30721459,15773128,14692632,0,128861,30594621,126838,9786383
1,서울,합계,2012,대통령,진보정당,보수정당,2,0.514226,1,0.481873,...,4,8393847,6307869,3024572,3227639,0,24488,6276699,31170,2085978
2,서울,종로구,2012,대통령,진보정당,보수정당,2,0.513923,1,0.481527,...,4,141447,103189,49422,52747,0,467,102636,553,38258
3,서울,중구,2012,대통령,진보정당,보수정당,2,0.507783,1,0.488038,...,4,115277,83095,40289,41919,0,345,82553,542,32182
4,서울,용산구,2012,대통령,보수정당,진보정당,1,0.523312,2,0.472848,...,4,206665,147849,76997,69572,0,565,147134,715,58816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,경남,거창군,2012,대통령,보수정당,진보정당,1,0.725514,2,0.264333,...,4,51773,40103,28726,10466,0,402,39594,509,11670
242,경남,합천군,2012,대통령,보수정당,진보정당,1,0.767922,2,0.215818,...,4,43981,33586,25313,7114,0,536,32963,623,10395
243,제주,합계,2012,대통령,보수정당,진보정당,1,0.504605,2,0.489577,...,4,451731,330967,166184,161235,0,1916,329335,1632,120764
244,제주,제주시,2012,대통령,진보정당,보수정당,2,0.497418,1,0.497172,...,4,328450,241552,119563,119622,0,1301,240486,1066,86898


## v4.1 ~ v4.3

In [38]:
# 1. 전체 데이터 저장
df_18.to_csv("temp4_1_president_18.csv", index=False, encoding="utf-8-sig")

# 2. '구시군' != '합계' 행만 추출하여 저장
(
    df_18
    .query("구시군 != '합계'")
    .to_csv("temp4_2_president_18.csv", index=False, encoding="utf-8-sig")
)

# 3. '구시군' == '합계' 행만 추출 후 '구시군' 열 제거하여 저장
(
    df_18
    .query("구시군 == '합계'")
    .drop(columns="구시군")
    .to_csv("temp4_3_president_18.csv", index=False, encoding="utf-8-sig")
)

# 19th_2017

In [39]:
df_19 = election_results['df_19']

In [40]:
df_19

Unnamed: 0,시도,구시군,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,...,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,득표_1위_정당,득표_2위_정당,보수정당_후보자수,진보정당_후보자수,그외정당_후보자수,무소속_후보자수
0,전국,합계,42479710,32807908,7852849,13423800,11361536,33990,32672175,135733,...,1,0.410863,2,0.240353,진보정당,보수정당,1,1,10,1
1,서울특별시,합계,8382999,6590646,1365285,2781345,2418337,3950,6568917,21729,...,1,0.423410,3,0.227247,진보정당,그외정당,1,1,10,1
2,서울특별시,종로구,133769,102566,22325,42512,37316,49,102202,364,...,1,0.415961,2,0.218440,진보정당,보수정당,1,1,10,1
3,서울특별시,중구,109836,82852,17901,34062,30581,51,82595,257,...,1,0.412398,3,0.234542,진보정당,그외정당,1,1,10,1
4,서울특별시,용산구,197962,148157,35230,58081,54269,80,147660,497,...,1,0.393343,2,0.238589,진보정당,보수정당,1,1,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,경상남도,거창군,53345,41325,19976,11256,9617,100,40949,376,...,2,0.487826,1,0.274879,보수정당,진보정당,1,1,10,1
264,경상남도,합천군,42887,33021,19699,7143,5793,76,32711,310,...,2,0.602213,1,0.218367,보수정당,진보정당,1,1,10,1
265,제주특별자치도,합계,518000,374459,68063,169493,134300,556,372412,2047,...,1,0.455122,3,0.209072,진보정당,그외정당,1,1,10,1
266,제주특별자치도,제주시,375292,273163,48027,125717,97668,368,271780,1383,...,1,0.462569,3,0.205942,진보정당,그외정당,1,1,10,1


## preprocessing

In [41]:
df_19['시도'].unique()

array(['전국', '서울특별시', '부산광역시', '대구광역시', '인천광역시', '광주광역시', '대전광역시',
       '울산광역시', '세종특별자치시', '경기도', '강원도', '충청북도', '충청남도', '전라북도', '전라남도',
       '경상북도', '경상남도', '제주특별자치도'], dtype=object)

In [42]:
# 1. 지역명 매핑 딕셔너리 정의
region_mapping = {
    '서울특별시': '서울',
    '부산광역시': '부산',
    '대구광역시': '대구',
    '인천광역시': '인천',
    '광주광역시': '광주',
    '대전광역시': '대전',
    '울산광역시': '울산',
    '세종특별자치시': '세종',
    '경기도': '경기',
    '강원도': '강원',
    '충청북도': '충북',
    '충청남도': '충남',
    '전라북도': '전북',
    '전라남도': '전남',
    '경상북도': '경북',
    '경상남도': '경남',
    '제주특별자치도': '제주'
}

# 2. 지역명 변경
df_19 = df_19.replace({'시도': region_mapping})

In [43]:
df_19.columns

Index(['시도', '구시군', '선거인수', '투표수', '보수정당', '진보정당', '그외정당', '무소속', '득표수_계',
       '무효투표수', '기권수', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
       '득표_1위_정당', '득표_2위_정당', '보수정당_후보자수', '진보정당_후보자수', '그외정당_후보자수',
       '무소속_후보자수'],
      dtype='object')

In [44]:
# 고정 열 순서 정의
fixed_cols = [
    '시도', '구시군', '선거년도', '선거종류',
    '득표_1위_정당', '득표_2위_정당',
    '득표_1위_후보번호', '득표_1위_득표율',
    '득표_2위_후보번호', '득표_2위_득표율',
    '보수정당_후보자수', '진보정당_후보자수',
    '그외정당_후보자수', '무소속_후보자수'
]

# 나머지 열: fixed_cols에 포함되지 않은 열
other_cols = [col for col in df_19.columns if col not in fixed_cols]

# 메소드 체이닝
df_19 = (
    df_19
    .assign(선거종류='대통령', 선거년도='2017')
    .loc[:, fixed_cols + other_cols]
    .rename(columns={'시도': '지역'})
)

In [45]:
df_19

Unnamed: 0,지역,구시군,선거년도,선거종류,득표_1위_정당,득표_2위_정당,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,...,무소속_후보자수,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,기권수
0,전국,합계,2017,대통령,진보정당,보수정당,1,0.410863,2,0.240353,...,1,42479710,32807908,7852849,13423800,11361536,33990,32672175,135733,9671802
1,서울,합계,2017,대통령,진보정당,그외정당,1,0.423410,3,0.227247,...,1,8382999,6590646,1365285,2781345,2418337,3950,6568917,21729,1792353
2,서울,종로구,2017,대통령,진보정당,보수정당,1,0.415961,2,0.218440,...,1,133769,102566,22325,42512,37316,49,102202,364,31203
3,서울,중구,2017,대통령,진보정당,그외정당,1,0.412398,3,0.234542,...,1,109836,82852,17901,34062,30581,51,82595,257,26984
4,서울,용산구,2017,대통령,진보정당,보수정당,1,0.393343,2,0.238589,...,1,197962,148157,35230,58081,54269,80,147660,497,49805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,경남,거창군,2017,대통령,보수정당,진보정당,2,0.487826,1,0.274879,...,1,53345,41325,19976,11256,9617,100,40949,376,12020
264,경남,합천군,2017,대통령,보수정당,진보정당,2,0.602213,1,0.218367,...,1,42887,33021,19699,7143,5793,76,32711,310,9866
265,제주,합계,2017,대통령,진보정당,그외정당,1,0.455122,3,0.209072,...,1,518000,374459,68063,169493,134300,556,372412,2047,143541
266,제주,제주시,2017,대통령,진보정당,그외정당,1,0.462569,3,0.205942,...,1,375292,273163,48027,125717,97668,368,271780,1383,102129


## v4.1 ~ v4.3

In [46]:
# 1. 전체 데이터 저장
df_19.to_csv("temp4_1_president_19.csv", index=False, encoding="utf-8-sig")

# 2. '구시군' != '합계' 행만 추출하여 저장
(
    df_19
    .query("구시군 != '합계'")
    .to_csv("temp4_2_president_19.csv", index=False, encoding="utf-8-sig")
)

# 3. '구시군' == '합계' 행만 추출 후 '구시군' 열 제거하여 저장
(
    df_19
    .query("구시군 == '합계'")
    .drop(columns="구시군")
    .to_csv("temp4_3_president_19.csv", index=False, encoding="utf-8-sig")
)

# 20th_2022

In [47]:
df_20 = election_results['df_20']

In [48]:
df_20

Unnamed: 0,시도,구시군,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,...,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,득표_1위_정당,득표_2위_정당,보수정당_후보자수,진보정당_후보자수,그외정당_후보자수,무소속_후보자수
0,전국,합계,44197692,34067853,16394815,16147738,1217758,0,33760311,307542,...,2,0.485624,1,0.478305,보수정당,진보정당,1,1,10,0
1,서울특별시,합계,8346647,6501831,3255747,2944981,238411,0,6439139,62692,...,2,0.505618,1,0.457356,보수정당,진보정당,1,1,10,0
2,서울특별시,종로구,129968,100629,49172,46130,4064,0,99366,1263,...,2,0.494857,1,0.464243,보수정당,진보정당,1,1,10,0
3,서울특별시,중구,111448,84998,42906,38244,3034,0,84184,814,...,2,0.509669,1,0.454291,보수정당,진보정당,1,1,10,0
4,서울특별시,용산구,199077,152068,85047,60063,5572,0,150682,1386,...,2,0.564414,1,0.398608,보수정당,진보정당,1,1,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,경상남도,거창군,53049,41399,27254,11963,1728,0,40945,454,...,2,0.665625,1,0.292172,보수정당,진보정당,1,1,10,0
264,경상남도,합천군,39768,31270,22742,6911,1179,0,30832,438,...,2,0.737610,1,0.224150,보수정당,진보정당,1,1,10,0
265,제주특별자치도,합계,564354,409649,173014,213130,19094,0,405238,4411,...,1,0.525938,2,0.426944,진보정당,보수정당,1,1,10,0
266,제주특별자치도,제주시,408552,296826,122084,157695,13870,0,293649,3177,...,1,0.537019,2,0.415748,진보정당,보수정당,1,1,10,0


## preprocessing

In [49]:
df_20['시도'].unique()

array(['전국', '서울특별시', '부산광역시', '대구광역시', '인천광역시', '광주광역시', '대전광역시',
       '울산광역시', '세종특별자치시', '경기도', '강원도', '충청북도', '충청남도', '전라북도', '전라남도',
       '경상북도', '경상남도', '제주특별자치도'], dtype=object)

In [50]:
# 1. 지역명 매핑 딕셔너리 정의
region_mapping = {
    '서울특별시': '서울',
    '부산광역시': '부산',
    '대구광역시': '대구',
    '인천광역시': '인천',
    '광주광역시': '광주',
    '대전광역시': '대전',
    '울산광역시': '울산',
    '세종특별자치시': '세종',
    '경기도': '경기',
    '강원도': '강원',
    '충청북도': '충북',
    '충청남도': '충남',
    '전라북도': '전북',
    '전라남도': '전남',
    '경상북도': '경북',
    '경상남도': '경남',
    '제주특별자치도': '제주'
}

# 2. 지역명 변경
df_20 = df_20.replace({'시도': region_mapping})

In [51]:
df_20.columns

Index(['시도', '구시군', '선거인수', '투표수', '보수정당', '진보정당', '그외정당', '무소속', '득표수_계',
       '무효투표수', '기권수', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
       '득표_1위_정당', '득표_2위_정당', '보수정당_후보자수', '진보정당_후보자수', '그외정당_후보자수',
       '무소속_후보자수'],
      dtype='object')

In [52]:
# 고정 열 순서 정의
fixed_cols = [
    '시도', '구시군', '선거년도', '선거종류',
    '득표_1위_정당', '득표_2위_정당',
    '득표_1위_후보번호', '득표_1위_득표율',
    '득표_2위_후보번호', '득표_2위_득표율',
    '보수정당_후보자수', '진보정당_후보자수',
    '그외정당_후보자수', '무소속_후보자수'
]

# 나머지 열: fixed_cols에 포함되지 않은 열
other_cols = [col for col in df_20.columns if col not in fixed_cols]

# 메소드 체이닝
df_20 = (
    df_20
    .assign(선거종류='대통령', 선거년도='2022')
    .loc[:, fixed_cols + other_cols]
    .rename(columns={'시도': '지역'})
)

In [53]:
df_20

Unnamed: 0,지역,구시군,선거년도,선거종류,득표_1위_정당,득표_2위_정당,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,...,무소속_후보자수,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,기권수
0,전국,합계,2022,대통령,보수정당,진보정당,2,0.485624,1,0.478305,...,0,44197692,34067853,16394815,16147738,1217758,0,33760311,307542,10129839
1,서울,합계,2022,대통령,보수정당,진보정당,2,0.505618,1,0.457356,...,0,8346647,6501831,3255747,2944981,238411,0,6439139,62692,1844816
2,서울,종로구,2022,대통령,보수정당,진보정당,2,0.494857,1,0.464243,...,0,129968,100629,49172,46130,4064,0,99366,1263,29339
3,서울,중구,2022,대통령,보수정당,진보정당,2,0.509669,1,0.454291,...,0,111448,84998,42906,38244,3034,0,84184,814,26450
4,서울,용산구,2022,대통령,보수정당,진보정당,2,0.564414,1,0.398608,...,0,199077,152068,85047,60063,5572,0,150682,1386,47009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,경남,거창군,2022,대통령,보수정당,진보정당,2,0.665625,1,0.292172,...,0,53049,41399,27254,11963,1728,0,40945,454,11650
264,경남,합천군,2022,대통령,보수정당,진보정당,2,0.737610,1,0.224150,...,0,39768,31270,22742,6911,1179,0,30832,438,8498
265,제주,합계,2022,대통령,진보정당,보수정당,1,0.525938,2,0.426944,...,0,564354,409649,173014,213130,19094,0,405238,4411,154705
266,제주,제주시,2022,대통령,진보정당,보수정당,1,0.537019,2,0.415748,...,0,408552,296826,122084,157695,13870,0,293649,3177,111726


## v4.1 ~ v4.3

In [54]:
# 1. 전체 데이터 저장
df_20.to_csv("temp4_1_president_20.csv", index=False, encoding="utf-8-sig")

# 2. '구시군' != '합계' 행만 추출하여 저장
(
    df_20
    .query("구시군 != '합계'")
    .to_csv("temp4_2_president_20.csv", index=False, encoding="utf-8-sig")
)

# 3. '구시군' == '합계' 행만 추출 후 '구시군' 열 제거하여 저장
(
    df_20
    .query("구시군 == '합계'")
    .drop(columns="구시군")
    .to_csv("temp4_3_president_20.csv", index=False, encoding="utf-8-sig")
)

# 21st_2025

In [55]:
df_21 = election_results['df_21']

In [56]:
df_21

Unnamed: 0,시도,구시군,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,...,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,득표_1위_정당,득표_2위_정당,보수정당_후보자수,진보정당_후보자수,그외정당_후보자수,무소속_후보자수
0,전국,합계,44391871,35236497,14395639,17287513,3261673,35791,34980616,255881,...,1,0.494203,2,0.411532,진보정당,보수정당,1,1,2,1
1,서울특별시,합계,8293885,6641606,2738405,3105459,739246,5998,6589108,52498,...,1,0.471302,2,0.415596,진보정당,보수정당,1,1,2,1
2,서울특별시,종로구,125901,99261,39574,47735,11080,89,98478,783,...,1,0.484728,2,0.401856,진보정당,보수정당,1,1,2,1
3,서울특별시,중구,110181,87166,36302,40482,9570,62,86416,750,...,1,0.468455,2,0.420084,진보정당,보수정당,1,1,2,1
4,서울특별시,용산구,183614,143813,67927,58705,15960,102,142694,1119,...,2,0.476033,1,0.411405,보수정당,진보정당,1,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,경상남도,거창군,52316,41373,25259,12623,2972,65,40919,454,...,2,0.617293,1,0.308487,보수정당,진보정당,1,1,2,1
267,경상남도,합천군,37329,28935,20132,7001,1388,46,28567,368,...,2,0.704729,1,0.245073,보수정당,진보정당,1,1,2,1
268,제주특별자치도,합계,565255,421576,145290,228729,43100,528,417647,3929,...,1,0.547661,2,0.347878,진보정당,보수정당,1,1,2,1
269,제주특별자치도,제주시,409912,306685,102224,169119,32148,384,303875,2810,...,1,0.556541,2,0.336401,진보정당,보수정당,1,1,2,1


## preprocessing

In [57]:
df_21['시도'].unique()

array(['전국', '서울특별시', '부산광역시', '대구광역시', '인천광역시', '광주광역시', '대전광역시',
       '울산광역시', '세종특별자치시', '경기도', '강원특별자치도', '충청북도', '충청남도', '전북특별자치도',
       '전라남도', '경상북도', '경상남도', '제주특별자치도'], dtype=object)

In [58]:
# 1. 지역명 매핑 딕셔너리 정의
region_mapping = {
    '서울특별시': '서울',
    '부산광역시': '부산',
    '대구광역시': '대구',
    '인천광역시': '인천',
    '광주광역시': '광주',
    '대전광역시': '대전',
    '울산광역시': '울산',
    '세종특별자치시': '세종',
    '경기도': '경기',
    '강원특별자치도': '강원',
    '충청북도': '충북',
    '충청남도': '충남',
    '전북특별자치도': '전북',
    '전라남도': '전남',
    '경상북도': '경북',
    '경상남도': '경남',
    '제주특별자치도': '제주'
}

# 2. 지역명 변경
df_21 = df_21.replace({'시도': region_mapping})

In [59]:
df_21.columns

Index(['시도', '구시군', '선거인수', '투표수', '보수정당', '진보정당', '그외정당', '무소속', '득표수_계',
       '무효투표수', '기권수', '득표_1위_후보번호', '득표_1위_득표율', '득표_2위_후보번호', '득표_2위_득표율',
       '득표_1위_정당', '득표_2위_정당', '보수정당_후보자수', '진보정당_후보자수', '그외정당_후보자수',
       '무소속_후보자수'],
      dtype='object')

In [60]:
# 고정 열 순서 정의
fixed_cols = [
    '시도', '구시군', '선거년도', '선거종류',
    '득표_1위_정당', '득표_2위_정당',
    '득표_1위_후보번호', '득표_1위_득표율',
    '득표_2위_후보번호', '득표_2위_득표율',
    '보수정당_후보자수', '진보정당_후보자수',
    '그외정당_후보자수', '무소속_후보자수'
]

# 나머지 열: fixed_cols에 포함되지 않은 열
other_cols = [col for col in df_21.columns if col not in fixed_cols]

# 메소드 체이닝
df_21 = (
    df_21
    .assign(선거종류='대통령', 선거년도='2025')
    .loc[:, fixed_cols + other_cols]
    .rename(columns={'시도': '지역'})
)

In [61]:
df_21

Unnamed: 0,지역,구시군,선거년도,선거종류,득표_1위_정당,득표_2위_정당,득표_1위_후보번호,득표_1위_득표율,득표_2위_후보번호,득표_2위_득표율,...,무소속_후보자수,선거인수,투표수,보수정당,진보정당,그외정당,무소속,득표수_계,무효투표수,기권수
0,전국,합계,2025,대통령,진보정당,보수정당,1,0.494203,2,0.411532,...,1,44391871,35236497,14395639,17287513,3261673,35791,34980616,255881,9155374
1,서울,합계,2025,대통령,진보정당,보수정당,1,0.471302,2,0.415596,...,1,8293885,6641606,2738405,3105459,739246,5998,6589108,52498,1652279
2,서울,종로구,2025,대통령,진보정당,보수정당,1,0.484728,2,0.401856,...,1,125901,99261,39574,47735,11080,89,98478,783,26640
3,서울,중구,2025,대통령,진보정당,보수정당,1,0.468455,2,0.420084,...,1,110181,87166,36302,40482,9570,62,86416,750,23015
4,서울,용산구,2025,대통령,보수정당,진보정당,2,0.476033,1,0.411405,...,1,183614,143813,67927,58705,15960,102,142694,1119,39801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,경남,거창군,2025,대통령,보수정당,진보정당,2,0.617293,1,0.308487,...,1,52316,41373,25259,12623,2972,65,40919,454,10943
267,경남,합천군,2025,대통령,보수정당,진보정당,2,0.704729,1,0.245073,...,1,37329,28935,20132,7001,1388,46,28567,368,8394
268,제주,합계,2025,대통령,진보정당,보수정당,1,0.547661,2,0.347878,...,1,565255,421576,145290,228729,43100,528,417647,3929,143679
269,제주,제주시,2025,대통령,진보정당,보수정당,1,0.556541,2,0.336401,...,1,409912,306685,102224,169119,32148,384,303875,2810,103227


## v4.1 ~ v4.3

In [62]:
# 1. 전체 데이터 저장
df_21.to_csv("temp4_1_president_21.csv", index=False, encoding="utf-8-sig")

# 2. '구시군' != '합계' 행만 추출하여 저장
(
    df_21
    .query("구시군 != '합계'")
    .to_csv("temp4_2_president_21.csv", index=False, encoding="utf-8-sig")
)

# 3. '구시군' == '합계' 행만 추출 후 '구시군' 열 제거하여 저장
(
    df_21
    .query("구시군 == '합계'")
    .drop(columns="구시군")
    .to_csv("temp4_3_president_21.csv", index=False, encoding="utf-8-sig")
)

# Batch CSV Files to ZIP

In [63]:
import zipfile
import glob

# Find all CSV files in current directory
csv_files = glob.glob('*.csv')

# Create ZIP file
with zipfile.ZipFile('all_csv_files.zip', 'w') as zipf:
   for file in csv_files:
       zipf.write(file)
       print(f"Added: {file}")  # Show progress

print(f"Total {len(csv_files)} files compressed.")

Added: temp4_1_president_18.csv
Added: temp4_3_president_18.csv
Added: temp4_2_president_16.csv
Added: temp4_2_president_14.csv
Added: temp4_1_president_17.csv
Added: temp4_2_president_18.csv
Added: temp4_3_president_20.csv
Added: temp4_1_president_21.csv
Added: temp4_2_president_21.csv
Added: temp4_3_president_21.csv
Added: temp4_2_president_20.csv
Added: temp4_3_president_19.csv
Added: temp4_3_president_17.csv
Added: temp4_1_president_19.csv
Added: temp4_3_president_16.csv
Added: temp4_3_president_15.csv
Added: temp4_1_president_16.csv
Added: temp4_2_president_17.csv
Added: temp4_1_president_20.csv
Added: temp4_1_president_14.csv
Added: temp4_2_president_15.csv
Added: temp4_1_president_15.csv
Added: temp4_2_president_19.csv
Added: temp4_3_president_14.csv
Total 24 files compressed.
