In [9]:
import requests

base_url = 'https://opendart.fss.or.kr/api/'

with open('api_key.txt', 'r', encoding='utf-8') as file:
    api_key = file.read()

In [10]:
import zipfile
import xml.etree.ElementTree as ET
import os

fs_data_path = './fs_data'
os.makedirs(fs_data_path, exist_ok=True)

def download_copr_code_xml():
    response = requests.get(base_url + 'corpCode.xml?crtfc_key=' + api_key)
    response.raise_for_status()  # HTTP 에러 발생 시 예외 처리

    file_name = 'corp_code.zip'
    
    with open(fs_data_path + "/{0}".format(file_name), 'wb') as f:
        f.write(response.content)

    with zipfile.ZipFile(fs_data_path + "/{0}".format(file_name), 'r') as zip_ref:
            zip_ref.extractall(fs_data_path)

def get_corp_list():
    # XML 파일 파싱
    tree = ET.parse(fs_data_path + '/CORPCODE.xml') 
    root = tree.getroot()
    
    # 결과를 저장할 리스트
    result_list = []

    # 모든 'list' 태그 순회
    for list_elem in root.findall('list'):
        # 'list' 태그 내의 정보를 딕셔너리로 저장
        list_data = {}
        for child in list_elem:
            list_data[child.tag] = child.text

        # 결과 리스트에 추가
        result_list.append(list_data)

    return result_list

def get_corp_by_name(name):
    for corp in get_corp_list():
        if corp['corp_name'] == name:
            return corp

    raise Exception("%s를 찾을 수 없음" % name)

In [11]:
def get_financial_statement(corp_code, bsns_year, reprt_code):
    resp = requests.get(base_url
                        + "fnlttSinglAcnt.json?crtfc_key={0}".format(api_key)
                        + "&corp_code={0}".format(corp_code)
                        + "&bsns_year={0}".format(bsns_year)
                        + "&reprt_code={0}".format(reprt_code))
    
    if resp.status_code == 200:
        return resp.json()
    else:
        raise Exception("status_code: {0}, body: {1}".format(resp.status_code, resp.text))

In [12]:
import pandas as pd
import os

download_copr_code_xml()

years = ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
corp_name = '삼성전자'
corp_data = get_corp_by_name(corp_name)
corp_code = corp_data['corp_code']
fsList = []

print("{0} 재무제표 생성 시작".format(corp_name))
for year in years:
    print("{0}년도 {1}({2}) 재무제표 받아오는 중..".format(year, corp_name, corp_code))
    fsJson = get_financial_statement(corp_code, year, '11011')
    fs = fsJson['list']
    fsDataFrame = pd.DataFrame(fs)
    fsList.append(fsDataFrame)

os.makedirs(fs_data_path + "/{0}".format(corp_name), exist_ok=True)

for i in range(len(fsList)):
    fsList[i].to_csv(fs_data_path + "/{0}/{1}.csv".format(corp_name, years[i]))

삼성전자 재무제표 생성 시작
2015년도 삼성전자(00126380) 재무제표 받아오는 중..
2016년도 삼성전자(00126380) 재무제표 받아오는 중..
2017년도 삼성전자(00126380) 재무제표 받아오는 중..
2018년도 삼성전자(00126380) 재무제표 받아오는 중..
2019년도 삼성전자(00126380) 재무제표 받아오는 중..
2020년도 삼성전자(00126380) 재무제표 받아오는 중..
2021년도 삼성전자(00126380) 재무제표 받아오는 중..
2022년도 삼성전자(00126380) 재무제표 받아오는 중..
2023년도 삼성전자(00126380) 재무제표 받아오는 중..


In [13]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the CSV files into DataFrames
df_2015 = pd.read_csv(fs_data_path + "/{0}/2015.csv".format(corp_name))
df_2016 = pd.read_csv(fs_data_path + "/{0}/2016.csv".format(corp_name))
df_2017 = pd.read_csv(fs_data_path + "/{0}/2017.csv".format(corp_name))
df_2018 = pd.read_csv(fs_data_path + "/{0}/2018.csv".format(corp_name))
df_2019 = pd.read_csv(fs_data_path + "/{0}/2019.csv".format(corp_name))
df_2020 = pd.read_csv(fs_data_path + "/{0}/2020.csv".format(corp_name))
df_2021 = pd.read_csv(fs_data_path + "/{0}/2021.csv".format(corp_name))
df_2022 = pd.read_csv(fs_data_path + "/{0}/2022.csv".format(corp_name))
df_2023 = pd.read_csv(fs_data_path + "/{0}/2023.csv".format(corp_name))

dfs = [df_2015, df_2016, df_2017, df_2018, df_2019, df_2020, df_2021, df_2022, df_2023]

def create_refine_fs(years, dfs):
    columns = ['연도', '매출액', '당기순이익', '자본총계', '부채비율', 'ROE', 'ROA']
    newDf = pd.DataFrame(columns=columns)
    
    new_rows = []
    for year, df in zip(years, dfs):
        net_income_row = df[df['account_nm'] == '당기순이익']
        revenue_row = df[df['account_nm'] == '매출액']
        total_capital_row = df[df['account_nm'] == '자본총계']
        total_liabilities_row = df[df['account_nm'] == '부채총계']
        total_assets_row = df[df['account_nm'] == '자산총계']
        
        net_income_str = net_income_row['thstrm_amount'].values[0].replace(',', '') if not net_income_row.empty else 0
        revenue_str = revenue_row['thstrm_amount'].values[0].replace(',', '') if not revenue_row.empty else 0
        total_capital_str = total_capital_row['thstrm_amount'].values[0].replace(',', '') if not total_capital_row.empty else 0
        total_liabilities_str = total_liabilities_row['thstrm_amount'].values[0].replace(',', '') if not total_liabilities_row.empty else 0
        total_assets_str = total_assets_row['thstrm_amount'].values[0].replace(',', '') if not total_assets_row.empty else 0
        
        net_income = int(net_income_str)
        revenue = int(revenue_str)
        total_capital = int(total_capital_str)
        total_liabilities = int(total_liabilities_str)
        total_assets = int(total_assets_str)
        
        roe = net_income / total_capital
        roa = net_income / total_assets * 100
        debt_ratio = total_liabilities / total_capital * 100
        new_rows.append([year, revenue, net_income, total_capital, debt_ratio, roe, roa])
    
    newDf = pd.concat([newDf, pd.DataFrame(new_rows, columns=columns)], ignore_index=True)
    
    return newDf

In [14]:
import matplotlib.pyplot as plt

def show_refine_fs_graph(df):
    # 1. 연도별 당기순이익 추이 그래프
    plt.figure(figsize=(10, 5))
    plt.plot(df['연도'], df['당기순이익'], marker='o', linestyle='-')
    plt.xlabel('연도')
    plt.ylabel('당기순이익')
    plt.title('연도별 당기순이익 추이')
    plt.grid(True)

    # 2. 연도별 자본총계 추이 그래프
    plt.figure(figsize=(10, 5))
    plt.plot(df['연도'], df['자본총계'], marker='o', linestyle='-')
    plt.xlabel('연도')
    plt.ylabel('자본총계')
    plt.title('연도별 자본총계 추이')
    plt.grid(True)

    # 3. 연도별 ROA 추이 그래프
    plt.figure(figsize=(10, 5))
    plt.plot(df['연도'], df['ROA'], marker='o', linestyle='-')
    plt.xlabel('연도')
    plt.ylabel('ROA')
    plt.title('연도별 ROA 추이')
    plt.grid(True)

    # 4. 그래프 화면에 표시
    plt.show()

In [15]:
refine_fs_df = create_refine_fs(years, dfs)
display(refine_fs_df)
# show_refine_fs_graph(refine_fs_df)

  newDf = pd.concat([newDf, pd.DataFrame(new_rows, columns=columns)], ignore_index=True)


Unnamed: 0,연도,매출액,당기순이익,자본총계,부채비율,ROE,ROA
0,2015,200653482000000,19060144000000,179059805000000,35.250634,0.106446,7.870254
1,2016,201866745000000,22726092000000,192963033000000,35.867643,0.117774,8.668313
2,2017,239575376000000,42186747000000,214491428000000,40.682587,0.196683,13.980598
3,2018,243771415000000,44344857000000,247753177000000,36.973922,0.178988,13.067308
4,2019,230400881000000,21738865000000,262880421000000,34.115921,0.082695,6.165926
5,2020,236806988000000,26407832000000,275948016000000,37.067743,0.095699,6.981845
6,2021,279604799000000,39907450000000,304899931000000,39.921697,0.130887,9.354306
7,2022,302231360000000,55654077000000,354749604000000,26.405922,0.156883,12.411025
8,2023,258935494000000,15487100000000,363677865000000,25.359837,0.042585,3.396994


In [16]:
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# 1. 데이터 준비 (실제 데이터로 대체해야 합니다)
df = refine_fs_df

# 2. 특징(X)과 타겟(y) 설정
X = df[['당기순이익', '자본총계', 'ROA', '부채비율', 'ROE']]

# 각 재무 항목 예측을 위한 타겟 변수
y_net_income = df['당기순이익']
y_revenue = df['매출액']
y_debt_ratio = df['부채비율']

# 3. 데이터 분할
X_train, X_test, y_net_income_train, y_net_income_test = train_test_split(X, y_net_income, test_size=0.2)
X_train, X_test, y_revenue_train, y_revenue_test = train_test_split(X, y_revenue, test_size=0.2)
X_train, X_test, y_debt_ratio_train, y_debt_ratio_test = train_test_split(X, y_debt_ratio, test_size=0.2)

# 4. 예측 모델 생성 및 훈련
model_net_income = LinearRegression()
model_net_income.fit(X_train, y_net_income_train)

model_revenue = LinearRegression()
model_revenue.fit(X_train, y_revenue_train)

model_debt_ratio = LinearRegression()
model_debt_ratio.fit(X_train, y_debt_ratio_train)

# 5. 예측 및 평가
y_net_income_pred = model_net_income.predict(X_test)
mse_net_income = mean_squared_error(y_net_income_test, y_net_income_pred)
print("당기순이익 예측 MSE:", mse_net_income)

y_revenue_pred = model_revenue.predict(X_test)
mse_revenue = mean_squared_error(y_revenue_test, y_revenue_pred)
print("매출액 예측 MSE:", mse_revenue)

y_debt_ratio_pred = model_debt_ratio.predict(X_test)
mse_debt_ratio = mean_squared_error(y_debt_ratio_test, y_debt_ratio_pred)
print("부채비율 예측 MSE:", mse_debt_ratio)

# 6. 다음 연도(2024년) 데이터 생성
# 각 특징별로 마지막 2개 값의 차이를 계산하여 다음 값을 예측합니다.
next_year_data = pd.DataFrame({
    '당기순이익': [df['당기순이익'].iloc[-1] + (df['당기순이익'].iloc[-1] - df['당기순이익'].iloc[-2])],
    '자본총계': [df['자본총계'].iloc[-1] + (df['자본총계'].iloc[-1] - df['자본총계'].iloc[-2])],
    'ROA': [df['ROA'].iloc[-1] + (df['ROA'].iloc[-1] - df['ROA'].iloc[-2])],
    '부채비율': [df['부채비율'].iloc[-1] + (df['부채비율'].iloc[-1] - df['부채비율'].iloc[-2])],
    'ROE': [df['ROE'].iloc[-1] + (df['ROE'].iloc[-1] - df['ROE'].iloc[-2])]
})

# 7. 다음 연도 예측
next_year_net_income_pred = model_net_income.predict(next_year_data[['당기순이익', '자본총계', 'ROA', '부채비율', 'ROE']])
next_year_revenue_pred = model_revenue.predict(next_year_data[['당기순이익', '자본총계', 'ROA', '부채비율', 'ROE']])
next_year_debt_ratio_pred = model_debt_ratio.predict(next_year_data[['당기순이익', '자본총계', 'ROA', '부채비율', 'ROE']])

# 8. 추정재무제표 데이터프레임 생성
estimated_fs_df = pd.DataFrame({
    '연도': [2024],
    '당기순이익': [next_year_net_income_pred[0]],
    '매출액': [next_year_revenue_pred[0]],
    '자본총계': next_year_data['자본총계'],
    '부채비율': [next_year_debt_ratio_pred[0]],
    'ROE': [next_year_net_income_pred[0] / next_year_debt_ratio_pred[0]],
    'ROA': next_year_net_income_pred[0] / next_year_data['자본총계'] * 100
})

# 9. 추정재무제표 출력
print("\n추정재무제표:")
display(estimated_fs_df)


당기순이익 예측 MSE: 7.368726574773573e+27
매출액 예측 MSE: 2.6546675898869336e+28
부채비율 예측 MSE: 1.4652909958439834e-11

추정재무제표:


Unnamed: 0,연도,당기순이익,매출액,자본총계,부채비율,ROE,ROA
0,2024,293689000000000.0,802897600000000.0,372606126000000,24.313764,12079130000000.0,78.82023
