<a href="https://colab.research.google.com/github/tjwjddnr/Data-Analysis-with-Open-Source/blob/main/%EC%98%A4%ED%94%88%EC%86%8C%EC%8A%A4_%EB%8D%B0%EC%9D%B4%ED%84%B0_%EB%B6%84%EC%84%9D_4%EA%B0%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# 오픈소스 기반 데이터 분석 4강 - 데이터 수집


## 4-1 CSV 파일 읽기

In [None]:
import pandas as pd

## data.csv 파일 읽기
df = pd.read_csv('data.csv', encoding='utf-8', sep=',', header=0,
                 index_col=None, skiprows=None, nrows=None)

print(df)

           날짜    체중  골격근량  체지방량
0  2025.02.06  64.7  30.0  11.1
1  2025.02.04  64.0  29.3  11.6


## 4-2 JSON 파일 읽기



In [None]:
import json
import pandas as pd

## data.json 파일 출력
with open('data.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)
print(data)

## data.json 파일 DataFrame 읽기
df = pd.read_json('data.json', orient='records', encoding='utf-8')
print(df)

{'매출데이터': [{'월': '2025-01', '매출액': 1000000, '비용': 700000, '이익': 300000}, {'월': '2025-02', '매출액': 1200000, '비용': 800000, '이익': 400000}, {'월': '2025-03', '매출액': 1500000, '비용': 900000, '이익': 600000}]}
                                               매출데이터
0  {'월': '2025-01', '매출액': 1000000, '비용': 700000,...
1  {'월': '2025-02', '매출액': 1200000, '비용': 800000,...
2  {'월': '2025-03', '매출액': 1500000, '비용': 900000,...


## 4-3 텍스트 파일 읽기 및 데이터 추출

In [None]:
from ast import pattern
import re

## 파일(callcenter20250301.log) 오픈 및 읽기
with open('callcenter20250301.log', 'r', encoding='utf-8') as f:
    content = f.read()
## 주민등록번호 패턴 생성
pattern = re.compile(r'(\d{6})-(\d{7})')

## 주민등록번호 마스킹
masked_content = pattern.sub(r'\1-*******', content)

## 마스킹된 파일(callcenter20250301_masked.log) 오픈 및 쓰기
with open('collcenter20250301_masked.log', mode='w') as f:
    f.write(masked_content)

print("주민등록번호 마스킹 완료. 'callcenter20250301_masked.log.txt' 파일로 저장되었습니다.")

주민등록번호 마스킹 완료. 'callcenter20250301_masked.log.txt' 파일로 저장되었습니다.


## 4-4 Open-Meteo의 무료 날씨 API를 통한 특정 지역 온도 조회

In [None]:
import requests
import json

url = "https://api.open-meteo.com/v1/forecast?=&=&current=temperature_2m"
params = {
    "latitude": "37.58638333",
    "longitude": "127.0203333",
    "current": "temperature_2m"
}

try:
    ## URL 및 파라미터 전송
    response = requests.get(url, params=params)
    response.raise_for_status()

    ## JSON 데이터 읽기
    data = response.json()

    print("API 응답:", data)
    print("서울시 종로구의 현재 온도는 : {0}{1} 입니다.".format(data['current']['temperature_2m'], data['current_units']['temperature_2m']))

except requests.exceptions.RequestException as e:
    print(f"API 호출 실패: {e}")
except json.JSONDecodeError as e:
    print(f"JSON 파싱 실패: {e}")

API 응답: {'latitude': 37.6, 'longitude': 127.0, 'generationtime_ms': 0.019073486328125, 'utc_offset_seconds': 0, 'timezone': 'GMT', 'timezone_abbreviation': 'GMT', 'elevation': 29.0, 'current_units': {'time': 'iso8601', 'interval': 'seconds', 'temperature_2m': '°C'}, 'current': {'time': '2025-09-25T12:00', 'interval': 900, 'temperature_2m': 19.8}}
서울시 종로구의 현재 온도는 : 19.8°C 입니다.


## 4-5 Selenium과 lxml을 이용한 웹 스크래핑

In [None]:
!curl -o google-chrome-stable_current_amd64.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!apt install ./google-chrome-stable_current_amd64.deb -y
!pip install selenium webdriver_manager

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  114M  100  114M    0     0  21.7M      0  0:00:05  0:00:05 --:--:-- 26.0M
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'google-chrome-stable' instead of './google-chrome-stable_current_amd64.deb'
The following additional packages will be installed:
  libvulkan1 mesa-vulkan-drivers
The following NEW packages will be installed:
  google-chrome-stable libvulkan1 mesa-vulkan-drivers
0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.
Need to get 10.9 MB/131 MB of archives.
After this operation, 447 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libvulkan1 amd64 1.3.204.1-2 [128 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 mesa-vulkan-drivers amd64 23.2.1-1ubuntu3.1~22.04.3 [10.7

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from lxml import html
import time

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')               # 브라우저 창 없이 실행
chrome_options.add_argument('--no-sandbox')             # 보안모드 비활성화 (Colab 필수)
chrome_options.add_argument('--disable-dev-shm-usage')  # 메모리 부족 방지 (Colab 필수)
chrome_options.add_argument('--window-size=1920x1080')  # 창 크기 설정(가상)
chrome_options.add_argument('--disable-gpu')            # GPU 가속 비활성화 (일부 환경 안정성)
chrome_options.binary_location = "/usr/bin/google-chrome-stable"  # Colab용 크롬 경로 지정

## 드라이버 실행
driver = webdriver.Chrome(options=chrome_options)

## 사이트 접속
url = 'https://professor.knou.ac.kr/jaehwachung/index.do'
driver.get(url)

## 사이트 접속 대기
time.sleep(2)

## 페이지 제목 출력
page_source = driver.page_source
tree = html.fromstring(page_source)

title_text = tree.xpath('//title/text()')
print(title_text)

## 드라이버 종료
driver.quit()

['\n\t\tAlert \n\t\t\n\t']



# 실습 시나리오

## 공공데이터 포털 가입 및 데이터 신청

- [https://www.data.go.kr](https://www.data.go.kr)
- 한국환경공단 에어코리아 대기오염정보 데이터 신청

In [None]:
import requests

## 데이터 수집 url 및 api key 설정
url = 'http://apis.data.go.kr/B552584/ArpltnInforInqireSvc/getCtprvnRltmMesureDnsty'
api_key = 'ufEA375H6EQcMFlUIAjdNjZWwQYKsy+5pHi/OqJcPmtIZ4Q4rGnomKar8DpCkYQposvGwkTYL8emlxbf6Ns9+Q=='

params = {
    'serviceKey': api_key,
    'returnType': 'json',
    'numOfRows': '100',
    'pageNo': '1',
    'sidoName': '전국',
    'ver': '1.0'
}

## 데이터 수집
response = requests.get(url, params=params)


## 수집된 데이터 출력
print(response.json())

## 호출 성공/실패 출력



{'response': {'body': {'totalCount': 666, 'items': [{'so2Grade': '1', 'coFlag': None, 'khaiValue': '67', 'so2Value': '0.003', 'coValue': '0.5', 'pm25Flag': None, 'pm10Flag': None, 'o3Grade': '1', 'pm10Value': '20', 'khaiGrade': '2', 'pm25Value': '14', 'sidoName': '경기', 'no2Flag': None, 'no2Grade': '2', 'o3Flag': None, 'pm25Grade': '1', 'so2Flag': None, 'dataTime': '2025-09-25 21:00', 'coGrade': '1', 'no2Value': '0.040', 'stationName': '중앙대로(고잔동)', 'pm10Grade': '1', 'o3Value': '0.004'}, {'so2Grade': '1', 'coFlag': None, 'khaiValue': '53', 'so2Value': '0.002', 'coValue': '0.6', 'pm25Flag': None, 'pm10Flag': None, 'o3Grade': '1', 'pm10Value': '32', 'khaiGrade': '2', 'pm25Value': '18', 'sidoName': '경기', 'no2Flag': None, 'no2Grade': '2', 'o3Flag': None, 'pm25Grade': '1', 'so2Flag': None, 'dataTime': '2025-09-25 21:00', 'coGrade': '1', 'no2Value': '0.031', 'stationName': '별양동', 'pm10Grade': '1', 'o3Value': '0.011'}, {'so2Grade': '1', 'coFlag': None, 'khaiValue': '43', 'so2Value': '0.002', 'c

In [None]:
import requests

## 데이터 수집 url 및 api key 설정
url = 'http://apis.data.go.kr/1613000/ApHusEnergyUseInfoOfferServiceV2/getWntyAvrgEnergyUseAmountInfoSearchV2'
api_key = 'ufEA375H6EQcMFlUIAjdNjZWwQYKsy+5pHi/OqJcPmtIZ4Q4rGnomKar8DpCkYQposvGwkTYL8emlxbf6Ns9+Q=='


def get_year_months():
    year_months = []
    for year in range(2015, 2024+1):
        for month in range(1, 12+1):
            year_months.append(f"{year}{month:02d}")
    return year_months
# 이 함수가 실행되면 201501 부터 202412 까지의 문자열이 반환 [201501, ..., 202412]
responses = []

for year_months in get_year_months():
    # 함수로 반환된 연도월이 year_months에 하나씩 할당됨
    params = {
        'serviceKey': api_key,
        'searchDate': year_months
    }
    # 각 할당된 연도월로 요청
    response = requests.get(url, params=params)
    # 해당 연도월의 결과값을 responses에 append함
    responses.append(response.json())

## 수집된 데이터 출력
if response.status_code == 200:
    print("데이터 수집 성공")
    print('총' + str(len(responses))+'개의 데이터 수집됨')
    print(response.json())
else:
    print("데이터 수집 실패")


데이터 수집 성공
총120개의 데이터 수집됨
{'response': {'body': {'item': {'heat': 514, 'waterHot': 155, 'gas': 3, 'elect': 711, 'waterCool': 241}}, 'header': {'resultCode': '00', 'resultMsg': 'NORMAL SERVICE.'}}}


In [None]:
print(responses[1])

{'response': {'body': {'item': {'heat': 435, 'waterHot': 126, 'gas': 5, 'elect': 639, 'waterCool': 174}}, 'header': {'resultCode': '00', 'resultMsg': 'NORMAL SERVICE.'}}}


In [27]:
import requests
import pandas as pd
base_url = 'http://openapi.seoul.go.kr:8088/6e6f437975736a7531344544427161/json/energyUseDataSummaryInfo/1/5/2025/01'

response = requests.get(url)
data = response.json()['energyUseDataSummaryInfo']['row']
## print(response.json())
df = pd.DataFrame(data)
df.to_csv('Test.csv', encoding='utf-8')



In [37]:
import requests

## 데이터 수집 url 및 api key 설정
base_url = 'http://openapi.seoul.go.kr:8088/6e6f437975736a7531344544427161/json/energyUseDataSummaryInfo/1/5/'

responses = []

def get_year_months():
    year_months = []
    for year in range(2015, 2024+1):
        for month in range(1, 12+1):
            year_months.append(f"{year}/{month:02d}")
    return year_months

for year_months in get_year_months():
    url = f"{base_url}/{year_months}"

    # 각 할당된 연도월로 요청
    response = requests.get(url)
    responses.append(response.json())

## 수집된 데이터 출력
if response.status_code == 200:
    print('데이터 수집 성공')

else:
    print('데이터 수집 실패')


데이터 수집 성공


In [53]:
all_rows = []
for item in responses:
    rows = item['energyUseDataSummaryInfo']['row']
    all_rows.extend(rows)

df = pd.DataFrame(all_rows)
df.to_csv('energy_data.csv', index=False, encoding='utf-8')
df

Unnamed: 0,YEAR,MON,MM_TYPE,CNT,EUS,EUS1,EUS2,ECO2_1,ECO2_2,GUS,...,WUS1,WUS2,WCO2_1,WCO2_2,HUS,HUS1,HUS2,HCO2_1,HCO2_2,REG_DATE
0,2025,06,개인,1269454,278745288,275851011,273151435,4244065,2076623.656,13084768,...,15166284.8,15371662.2,-341289.9,-611822.446,379182.22,368839.049,389847.167,-160.888,-22448.67848156,2025-09-01 04:02:15.0
1,2025,06,학교,6745,117808793,114793127,114623246,3100606.5,3096973.668,2093441,...,1914804.2,1956560.2,-111344,-333633.134,8.03,8.03,8.03,0,0,2025-09-01 04:02:15.0
2,2025,06,공동주택관리소,3729,498863841,496003195,488758810,6482838.5,6482838.5,2853437,...,20616193.4,20794568.4,-297568.2,-892704.6,0,0,0,0,0,2025-09-01 04:02:15.0
3,2025,06,종교단체,849,6205342,6171454,6094109,72560.5,72560.5,139336,...,51779,49094,1791.5,5374.5,0,0,0,0,0,2025-09-01 04:02:15.0
4,2025,06,소상공인,53274,96254029,98122772,98464917,-2039815.5,-2043866.22,1638563,...,1541313.8,1576869.2,-32353.7,-97437.288,2,2,2,0,0,2025-09-01 04:02:15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,2025,06,개인,1269454,278745288,275851011,273151435,4244065,2076623.656,13084768,...,15166284.8,15371662.2,-341289.9,-611822.446,379182.22,368839.049,389847.167,-160.888,-22448.67848156,2025-09-01 04:02:15.0
596,2025,06,학교,6745,117808793,114793127,114623246,3100606.5,3096973.668,2093441,...,1914804.2,1956560.2,-111344,-333633.134,8.03,8.03,8.03,0,0,2025-09-01 04:02:15.0
597,2025,06,공동주택관리소,3729,498863841,496003195,488758810,6482838.5,6482838.5,2853437,...,20616193.4,20794568.4,-297568.2,-892704.6,0,0,0,0,0,2025-09-01 04:02:15.0
598,2025,06,종교단체,849,6205342,6171454,6094109,72560.5,72560.5,139336,...,51779,49094,1791.5,5374.5,0,0,0,0,0,2025-09-01 04:02:15.0




  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  fig.canvas.draw()
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [51]:
df_persnal = df[df["MM_TYPE" == "개인"]]

KeyError: False