<a href="https://colab.research.google.com/github/yeonghun00/stock_public/blob/main/Stock_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
from multiprocessing import Pool

class Stock:
  """Scrape daily prices and the financial summary of one stock from finance.naver.com.

  Attributes:
    code: Stock code appended to the request URLs (e.g. ``'005930'``).
    headers: HTTP headers (a browser User-Agent) sent with every request.
  """

  # Seconds to wait for a response before ``requests`` raises a timeout.
  _REQUEST_TIMEOUT = 10
  # Upper bound on the number of concurrent downloads started by get_data.
  _MAX_WORKERS = 16
  # At most this many table rows are read from one price page.
  _ROWS_PER_PAGE = 10
  # Values that follow the date in a price row; the '_' column is discarded.
  _RAW_COLUMNS = ['Close', '_', 'Open', 'High', 'Low', 'Volume']
  _PRICE_COLUMNS = ['Close', 'Open', 'High', 'Low', 'Volume']
  # Characters removed from scraped cell text before it is interpreted.
  _STRIP_TABLE = str.maketrans('', '', '\n\t, ')
  # Row labels of the frame returned by get_fundamental, in table order.
  _FUNDAMENTAL_ROWS = [
    'sales', 'operating profit', 'net income', 'operating margin',
    'net margin', 'roe', 'debt ratio', 'quick ratio', 'reserve ratio',
    'eps', 'per', 'bps', 'pbr', 'dividend per share',
    'dividend yield ratio', 'dividend payout ratio',
  ]

  def __init__(self, code):
    self.code = code
    self.headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

  def _get_soup(self, url):
    """Download ``url`` with the instance headers and return the parsed HTML.

    Raises:
      requests.RequestException: on connection problems, a timeout, or an
        HTTP error status (instead of silently parsing an error page).
    """
    response = requests.get(url, headers=self.headers, timeout=self._REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")

  @classmethod
  def _clean(cls, text):
    """Remove newlines, tabs, commas and spaces from scraped cell text."""
    return text.translate(cls._STRIP_TABLE)

  @classmethod
  def _to_float(cls, text):
    """Convert scraped cell text to float; anything non-numeric becomes NaN.

    Using ``float()`` directly keeps signed values such as ``'-1,234.5'``,
    which a digits-only check would discard.
    """
    try:
      return float(cls._clean(text))
    except ValueError:
      return np.nan

  @classmethod
  def _empty_frame(cls):
    """Return a price frame with the usual columns and no rows."""
    return pd.DataFrame(columns=cls._PRICE_COLUMNS, index=pd.DatetimeIndex([]))

  @classmethod
  def _rows_to_frame(cls, rows):
    """Build a price frame from rows of cell texts.

    Args:
      rows: iterable of lists of strings; each list is a ``'%Y.%m.%d'`` date
        followed by one value per entry of ``_RAW_COLUMNS``.

    Returns:
      DataFrame indexed by date with string columns Close, Open, High, Low
      and Volume (thousands separators removed). Reading stops at the first
      row that cannot be parsed; with no usable rows the frame is empty.
    """
    records = {}
    for cells in rows:
      cells = [cell.replace(',', '') for cell in cells]
      try:
        day = datetime.datetime.strptime(cells[0], '%Y.%m.%d')
      except (IndexError, ValueError):
        # A row without a date (or with a malformed one) ends the data.
        break
      values = cells[1:]
      if len(values) != len(cls._RAW_COLUMNS):
        break
      records[day] = values

    if not records:
      return cls._empty_frame()
    df = pd.DataFrame(list(records.values()),
                      index=pd.DatetimeIndex(list(records.keys())),
                      columns=cls._RAW_COLUMNS)
    return df.drop(columns="_")

  @classmethod
  def _combine(cls, frames):
    """Merge price frames into one frame sorted by date with integer columns.

    Empty frames are skipped and a date that occurs more than once is kept
    only the first time it is seen.
    """
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
      return cls._empty_frame().astype(int)
    df = pd.concat(frames)
    df = df[~df.index.duplicated(keep='first')]
    df = df.sort_index(axis = 0)
    return df.astype(int)

  def page_to_df(self, page):
    """Download one page of the daily price table.

    Args:
      page: page number passed to the site in the ``page`` query parameter.

    Returns:
      DataFrame indexed by date with string columns Close, Open, High, Low
      and Volume; empty when the page contains no price rows.
    """
    url = "https://finance.naver.com/item/sise_day.nhn?code=" + str(self.code) + "&page=" + str(page)
    bs_obj = self._get_soup(url)
    tr = bs_obj.find_all("tr", {'onmouseover':'mouseOver(this)', 'onmouseout':"mouseOut(this)"})
    rows = [[span.get_text() for span in row.find_all('span')]
            for row in tr[:self._ROWS_PER_PAGE]]
    return self._rows_to_frame(rows)

  def crawl(self, start_page:int=1, end_page:int=26):
    """Download pages ``start_page`` .. ``end_page - 1`` of the price table.

    ``start_page`` is always downloaded, so ``crawl(p, p)`` returns exactly
    page ``p``. Each page is requested once. Crawling stops early when a page
    is empty or lists the same dates as the previous page (presumably what
    the site returns past the last page; verify against the live site).

    Args:
      start_page: first page to download.
      end_page: page number at which to stop (exclusive).

    Returns:
      DataFrame sorted by date with integer columns Close, Open, High, Low
      and Volume.
    """
    frames = [self.page_to_df(start_page)]
    # 1 page = 10 days
    for i in range(start_page + 1, end_page):
      current = self.page_to_df(i)
      if current.empty or current.index.equals(frames[-1].index):
        break
      frames.append(current)
      print('.', end = '')
      if (i%100==0):
        print(i)
    return self._combine(frames)

  def get_data(self, page:int=100):
    """Download pages ``page`` .. 1 concurrently and merge them.

    The work is network-bound, so a bounded thread pool is used instead of
    one OS process per page.

    Args:
      page: number of pages to download, counted from page 1.

    Returns:
      DataFrame sorted by date with integer columns Close, Open, High, Low
      and Volume; each date appears once.

    Raises:
      ValueError: if ``page`` is smaller than 1.
    """
    from concurrent.futures import ThreadPoolExecutor

    if page < 1:
      raise ValueError("page must be at least 1")
    pages = range(page, 0, -1)
    with ThreadPoolExecutor(max_workers=min(page, self._MAX_WORKERS)) as executor:
      li = list(executor.map(self.crawl, pages, pages))
    return self._combine(li)

  def get_fundamental(self):
    """Scrape the financial summary table from the stock's main page.

    Returns:
      DataFrame of floats (NaN where a cell is not numeric) whose rows are
      the labels in ``_FUNDAMENTAL_ROWS`` and whose columns are the period
      headers; the first four are suffixed ``(Y)`` and the rest ``(M)``.

    Raises:
      ValueError: if no period headers are found or the scraped cells do not
        fit the expected table shape.
    """
    url = 'https://finance.naver.com/item/main.nhn?code=' + str(self.code)
    bs_obj = self._get_soup(url)

    # The slice picks the period headers out of all column headers on the
    # page; the offsets are tied to the page layout.
    ths = [th.get_text() for th in bs_obj.find_all("th", {'scope':'col'})][10:-22]
    dates = [self._clean(th) for th in ths]
    dates = [d + '(Y)' for d in dates[:4]] + [d + '(M)' for d in dates[4:]]
    if not dates:
      raise ValueError("no period headers found on the page")

    tds = bs_obj.find_all("tbody")[2].find_all('td')
    elements = [self._to_float(td.get_text()) for td in tds]

    # Cells arrive row by row, so column k is every len(dates)-th cell
    # starting at offset k.
    stride = len(dates)
    temp_dict = {label: elements[offset::stride] for offset, label in enumerate(dates)}

    df = pd.DataFrame.from_dict(temp_dict)
    df = df.set_index([pd.Index(self._FUNDAMENTAL_ROWS)])
    return df

  def get_code_name(self):
    """Return the text of the first ``<h2>`` inside the ``wrap_company`` div.

    Raises:
      IndexError: if the page has no such element.
    """
    url = 'https://finance.naver.com/item/main.nhn?code=' + str(self.code)
    bs_obj = self._get_soup(url)
    name = bs_obj.find_all("div", {'class':'wrap_company'})
    return name[0].find_all('h2')[0].get_text()

In [2]:
# z-score normalization
from scipy import stats
def z_norm(x):
  """Return the z-scores of ``x`` as computed by ``scipy.stats.zscore``."""
  standardized = stats.zscore(x)
  return standardized

# min-max normalization
def min_max_norm(x):
  """Rescale ``x`` linearly so its minimum maps to 0 and its maximum to 1.

  Args:
    x: a NumPy array, a pandas Series, or any sequence of numbers (sequences
      are converted to a float array).

  Returns:
    Values of the same shape as ``x`` (a Series stays a Series). Empty input
    is returned as an empty float result, and input whose values are all
    equal maps to zeros rather than dividing by zero.
  """
  if not isinstance(x, (np.ndarray, pd.Series)):
    x = np.asarray(x, dtype=float)
  if x.size == 0:
    return x.astype(float)
  lowest = x.min()
  span = x.max() - lowest
  if span == 0:
    # All values are equal: there is no range to scale by.
    return (x - lowest) * 0.0
  return (x - lowest) / span

# normalization by the vector norm (np.linalg.norm)
def norm(x):
  """Divide ``x`` by ``np.linalg.norm(x)`` and round the result to 16 decimals."""
  magnitude = np.linalg.norm(x)
  scaled = x / magnitude
  return np.round(scaled, 16)

In [3]:
# Scraper for stock code '005930'; the constructor only stores the code and
# the request headers.
stock = Stock('005930')

In [4]:
# Download the daily price history (get_data defaults to 100 pages) and
# display the resulting frame.
price_df = stock.get_data()
price_df

Unnamed: 0,Close,Open,High,Low,Volume
2017-04-18,2075000,2084000,2091000,2064000,137213
2017-04-19,2045000,2065000,2071000,2045000,235258
2017-04-20,2014000,2029000,2040000,2004000,422977
2017-04-21,2038000,2024000,2070000,2024000,302610
2017-04-24,2062000,2063000,2063000,2046000,179803
...,...,...,...,...,...
2021-05-11,81200,82500,82600,81100,28996680
2021-05-12,80000,80800,81200,79800,35812268
2021-05-13,78500,78900,79600,78400,31490096
2021-05-14,80100,79000,80300,78900,16450920


In [5]:
# Scrape the financial summary table (columns are periods tagged (Y) or (M),
# rows are the metrics) and display it.
fundamental_df = stock.get_fundamental()
fundamental_df

Unnamed: 0,2018.12(Y),2019.12(Y),2020.12(Y),2021.12(E)(Y),2019.12(M),2020.03(M),2020.06(M),2020.09(M),2020.12(M),2021.03(E)(M)
sales,2437714.0,2304009.0,2368070.0,2664884.0,598848.0,553252.0,529661.0,669642.0,615515.0,608058.0
operating profit,588867.0,277685.0,359939.0,482325.0,71603.0,64473.0,81463.0,123532.0,90470.0,88344.0
net income,443449.0,217389.0,264078.0,362304.0,52270.0,48849.0,55551.0,93607.0,66071.0,65450.0
operating margin,24.16,12.05,15.2,18.1,11.96,11.65,15.38,18.45,14.7,14.53
net margin,18.19,9.44,11.15,13.59,8.73,8.83,10.49,13.98,10.73,10.76
roe,19.63,8.69,9.98,12.98,8.69,8.45,8.49,9.51,9.98,
debt ratio,36.97,34.12,37.07,,34.12,34.19,32.67,36.09,37.07,
quick ratio,204.12,233.57,214.82,,233.57,237.8,250.04,229.69,214.82,
reserve ratio,27531.92,28856.02,30692.79,,28856.02,29134.12,29477.97,30242.29,30692.79,
eps,6024.0,3166.0,3841.0,5287.0,770.0,720.0,808.0,1364.0,949.0,956.0
