<a href="https://colab.research.google.com/github/yeonghun00/stock_public/blob/main/Stock_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime

class Stock:
  """Scraper for one stock's daily prices and summary financials on finance.naver.com.

  Attributes:
    code: Ticker code used in the request URLs (converted with ``str`` when a URL is built).
    headers: HTTP headers (a browser User-Agent) sent with every request.
  """

  # Price columns kept from the daily-price table, in the order they are returned.
  _PRICE_COLUMNS = ['Close', 'Open', 'High', 'Low', 'Volume']
  # Removes newlines, tabs, thousands separators and spaces from table cell text.
  _STRIP_TABLE = str.maketrans('', '', '\n\t, ')
  # Seconds to wait for the server; without a timeout requests.get can block forever.
  _TIMEOUT = 10

  def __init__(self, code):
    """Remember the ticker ``code`` and the headers used for all requests."""
    self.code = code
    self.headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}

  def _fetch(self, url):
    """Download ``url`` with ``self.headers`` and return the parsed BeautifulSoup tree."""
    result = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
    return BeautifulSoup(result.content, "html.parser")

  @staticmethod
  def _to_float(text):
    """Convert cleaned cell text to ``float``; return NaN when it is not a number.

    ``float`` is used directly so that negative figures such as ``'-12.5'`` are
    kept instead of being turned into NaN.
    """
    try:
      return float(text)
    except ValueError:
      return np.nan

  def page_to_df(self, page):
    """Scrape one page of the daily-price listing.

    Args:
      page: Page number appended to the listing URL.

    Returns:
      DataFrame indexed by date (``datetime.datetime``) with the string-valued
      columns Close, Open, High, Low and Volume. An empty DataFrame with those
      columns is returned when the page contains no parsable rows.
    """
    url = "https://finance.naver.com/item/sise_day.nhn?code=" + str(self.code) + "&page=" + str(page)
    bs_obj = self._fetch(url)
    rows = bs_obj.find_all("tr", {'onmouseover':'mouseOver(this)', 'onmouseout':"mouseOut(this)"})
    data_dict = {}
    # At most 10 rows are read per page.
    for row in rows[:10]:
      lines = [span.get_text().replace(',', '') for span in row.find_all('span')]
      try:
        objdate = datetime.datetime.strptime(lines[0], '%Y.%m.%d')
      except (IndexError, ValueError):
        # A row without spans or without a valid date ends the data on this page.
        break
      data_dict[objdate] = lines[1:]

    if not data_dict:
      return pd.DataFrame(columns=self._PRICE_COLUMNS)

    df = pd.DataFrame.from_dict(data_dict).transpose()
    # '_' is the second value cell of each row (presumably the day-over-day
    # change - TODO confirm); it is not part of the returned frame.
    df.columns = ['Close','_','Open',
                        'High','Low','Volume']
    return df.drop(columns="_")

  def get_data(self, start_page:int=1, end_page:int=26):
    """Collect daily prices from pages ``start_page`` .. ``end_page - 1``.

    ``start_page`` is always fetched, even when ``end_page <= start_page``.
    Scraping stops early when a page is empty or repeats the dates of the
    previous page, which is how the end of the listing is detected. Every page
    is downloaded exactly once. A '.' is printed per collected page, and the
    page number is printed every 100 pages.

    Args:
      start_page: First page to fetch.
      end_page: Page number at which to stop (exclusive).

    Returns:
      DataFrame sorted by date with integer columns Close, Open, High, Low and
      Volume, containing each date at most once.
    """
    frames = []
    previous = None
    for page in range(start_page, max(end_page, start_page + 1)):
      current = self.page_to_df(page)
      if current.empty:
        break
      if previous is not None and current.index.equals(previous.index):
        break
      frames.append(current)
      previous = current
      print('.', end = '')
      if page % 100 == 0:
        print(page)

    if not frames:
      return pd.DataFrame(columns=self._PRICE_COLUMNS).astype(int)

    df = pd.concat(frames)
    # Rows can repeat across pages if the listing shifts while it is scraped.
    df = df[~df.index.duplicated(keep='first')].sort_index(axis = 0)
    return df.astype({column: int for column in self._PRICE_COLUMNS})

  def get_fundamental(self):
    """Scrape the summary financial table from the stock's main page.

    Returns:
      DataFrame with one row per metric (sales, operating profit, ..., dividend
      payout ratio) and one column per reporting date. The first four date
      labels get the suffix '(Y)' and the remaining ones '(M)'. Cells that are
      not numeric become NaN.
    """
    url = 'https://finance.naver.com/item/main.nhn?code=' + str(self.code)
    bs_obj = self._fetch(url)

    # The slice keeps only the header cells holding the table's date labels;
    # the offsets depend on the page layout - TODO confirm if the site changes.
    ths = [th.get_text() for th in bs_obj.find_all("th", {'scope':'col'})][10:-22]
    dates = [text.translate(self._STRIP_TABLE) for text in ths]
    dates = [date + '(Y)' for date in dates[:4]] + [date + '(M)' for date in dates[4:]]

    tds = bs_obj.find_all("tbody")[2].find_all('td')
    elements = [self._to_float(td.get_text().translate(self._STRIP_TABLE)) for td in tds]

    index = ['sales', 'operating profit', 'net income', 'operating margin', 'net margin', 'roe', 'debt ratio', 'quick ratio', \
    'reserve ratio', 'eps', 'per', 'bps', 'pbr', 'dividend per share', 'dividend yield ratio', 'dividend payout ratio']

    # Cells are read row by row with a stride of 10, so every 10th cell
    # starting at ``offset`` is taken as the column for that date.
    temp_dict = {date: elements[offset::10] for offset, date in enumerate(dates)}

    df = pd.DataFrame.from_dict(temp_dict)
    df = df.set_index([pd.Index(index)])
    return df
  
  def get_code_name(self):
    """Return the text of the first ``<h2>`` inside the page's ``wrap_company`` div.

    Raises:
      IndexError: If the expected elements are not present on the page.
    """
    url = 'https://finance.naver.com/item/main.nhn?code=' + str(self.code)
    bs_obj = self._fetch(url)
    name = bs_obj.find_all("div", {'class':'wrap_company'})
    return name[0].find_all('h2')[0].get_text()

In [None]:
# z - normalization
from scipy import stats
def z_norm(x):
  """Return the z-scores of ``x`` as computed by ``scipy.stats.zscore``."""
  z_scores = stats.zscore(x)
  return z_scores

# min-max normalization
def min_max_norm(x):
  """Rescale ``x`` linearly so that its minimum maps to 0 and its maximum to 1.

  Args:
    x: Sequence, NumPy array or pandas object of numbers. Array-like objects
      (anything exposing ``__array__``) keep their type; other sequences are
      converted to a float NumPy array.

  Returns:
    The rescaled values. Constant input yields all zeros instead of dividing
    by zero, and empty input is returned unchanged (as an empty array).
  """
  data = x if hasattr(x, '__array__') else np.asarray(x, dtype=float)
  if np.size(data) == 0:
    return data
  lowest, highest = np.min(data), np.max(data)
  span = highest - lowest
  # When every value is equal the numerator is already all zeros; divide by 1.
  return (data - lowest) / (span if span else 1)

# L2 (Euclidean) normalization with NumPy: scale x to unit length
def norm(x):
  """Divide ``x`` by its ``np.linalg.norm`` and round the result to 16 decimals."""
  magnitude = np.linalg.norm(x)
  scaled = x / magnitude
  return np.round(scaled, 16)

In [None]:
# Scraper instance for ticker code '005930'.
stock = Stock(code='005930')

In [None]:
# Daily prices from listing pages 1 through 12 (end_page is exclusive).
price_df = stock.get_data(start_page=1, end_page=13)
price_df

...........

Unnamed: 0,Close,Open,High,Low,Volume
2020-10-12,60400,60000,60400,59900,16145837
2020-10-13,60900,61000,61400,60400,19247631
2020-10-14,60900,61000,61100,60500,16086716
2020-10-15,60000,60700,60800,59700,17756232
2020-10-16,59500,60000,60400,59000,16554190
...,...,...,...,...,...
2021-04-01,82900,82500,83000,82000,18676461
2021-04-02,84800,84000,85200,83900,22997538
2021-04-02,84800,84000,85200,83900,22997538
2021-04-05,85400,85800,86000,84800,16174746


## Quantiles: quartiles and interquartile range of the closing price

In [None]:
# Compute the three quartiles of the closing price in one call, then report
# them together with the spread between the 3rd and the 1st.
q1, q2, q3 = np.quantile(price_df['Close'], [0.25, 0.5, 0.75])
print('1st quantile', q1)
print('2nd quantile', q2)
print('3rd quantile', q3)
print('range: ', q3 - q1)

1st quantile 67550.0
2nd quantile 81450.0
3rd quantile 82900.0
range:  15350.0


81450.0

82900.0