In [2]:
import concurrent.futures
import math

In [4]:
# Inputs for the primality demo in this cell. 112272535095293 is listed twice,
# so it is checked (and printed) twice.
PRIMES = [
    112272535095293,
    112582705942171,
    112272535095293,
    115280095190773,
    115797848077099,
    1099726899285419]  # reported as not prime in this cell's recorded output

def is_prime(n):
    """Return True if the integer ``n`` is prime, using odd trial division.

    Args:
        n: Integer to test.

    Returns:
        bool: True when ``n`` is prime; False otherwise, including for every
        ``n`` below 2.
    """
    if n < 2:
        # 0, 1 and negative numbers are not prime (and have no real square root).
        return False
    if n == 2:
        # The only even prime; it must be accepted before the parity test.
        return True
    if n % 2 == 0:
        return False

    # math.isqrt is exact for arbitrarily large ints, whereas
    # int(math.sqrt(n)) goes through a float and can be off by one.
    limit = math.isqrt(n)
    for divisor in range(3, limit + 1, 2):
        if n % divisor == 0:
            return False
    return True

def main():
    """Run is_prime over PRIMES on a thread pool and print each verdict.

    The pool's map() yields results in input order, so zipping them with
    PRIMES pairs every number with its own result.
    """
    pool = concurrent.futures.ThreadPoolExecutor()
    with pool:
        verdicts = pool.map(is_prime, PRIMES)
        for candidate, verdict in zip(PRIMES, verdicts):
            print('%d is prime: %s' % (candidate, verdict))

# Only run the demo when this code executes as the top-level script/cell.
if __name__ == '__main__':
    main()

112272535095293 is prime: True
112582705942171 is prime: True
112272535095293 is prime: True
115280095190773 is prime: True
115797848077099 is prime: True
1099726899285419 is prime: False


In [13]:
import time

def main():
    """Time the primality check over PRIMES with a thread pool, then serially.

    Prints every verdict for both passes, each followed by the elapsed
    seconds of that pass. Returns nothing.

    is_prime is pure-Python CPU work, so the threads take turns holding the
    interpreter lock; the recorded run of this notebook shows both passes
    taking about 1.6 s.
    """
    def _print_verdicts(verdicts):
        # Results are pulled one by one here, so the timed section must
        # include this loop.
        for number, prime in zip(PRIMES, verdicts):
            print('%d is prime: %s' % (number, prime))

    print("병렬처리 시작")
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        _print_verdicts(executor.map(is_prime, PRIMES))
    end = time.perf_counter()
    print("병렬처리 수행 시각", end-start, 's')

    # Same work on the calling thread, for comparison.
    start = time.perf_counter()
    _print_verdicts(map(is_prime, PRIMES))
    end = time.perf_counter()
    print("단일처리 수행 시각", end-start, 's')

In [14]:
# Run the thread-pool vs. serial timing comparison defined in the previous cell.
if __name__ == '__main__':
    main()

병렬처리 시작
112272535095293 is prime: True
112582705942171 is prime: True
112272535095293 is prime: True
115280095190773 is prime: True
115797848077099 is prime: True
1099726899285419 is prime: False
병렬처리 수행 시각 1.6510310173034668 s
112272535095293 is prime: True
112582705942171 is prime: True
112272535095293 is prime: True
115280095190773 is prime: True
115797848077099 is prime: True
1099726899285419 is prime: False
단일처리 수행 시각 1.6259403228759766 s


In [1]:
import concurrent.futures
import urllib.request

# Pages fetched concurrently by the cell below. The last host fails name
# resolution in the recorded output; presumably it is included on purpose to
# exercise the error path.
URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://nonexistant-subdomain.python.org/']

def load_url(url, timeout):
    """Fetch ``url`` and return the response body as bytes.

    Args:
        url: Address to open.
        timeout: Timeout in seconds passed to urllib.request.urlopen.

    Returns:
        bytes: The full body read from the response.

    Any exception raised by urlopen or read() propagates to the caller.
    """
    response = urllib.request.urlopen(url, timeout=timeout)
    # The response is a context manager; leaving the block closes the connection.
    with response:
        body = response.read()
    return body

# Leaving the with-block shuts the executor down and waits for its threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit one download per URL and remember which future belongs to which URL.
    future_to_url = {}
    for url in URLS:
        future_to_url[executor.submit(load_url, url, 60)] = url
    # Handle the downloads in completion order, not submission order.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            # result() re-raises whatever load_url raised in the worker thread.
            print('%r generated an exception: %s' % (url, exc))
            continue
        print('%r page is %d bytes' % (url, len(data)))

'http://nonexistant-subdomain.python.org/' generated an exception: <urlopen error [Errno 11001] getaddrinfo failed>
'http://www.cnn.com/' page is 2501316 bytes
'http://www.foxnews.com/' page is 558153 bytes
'http://www.bbc.co.uk/' page is 541729 bytes
'http://europe.wsj.com/' generated an exception: HTTP Error 403: Forbidden


In [2]:
import concurrent.futures
import math

# Inputs for the process-pool primality demo in this cell. 112272535095293 is
# listed twice, so it is checked twice.
PRIMES = [
    112272535095293,
    112582705942171,
    112272535095293,
    115280095190773,
    115797848077099,
    1099726899285419]

def is_prime(n):
    """Return True if the integer ``n`` is prime, using odd trial division.

    Args:
        n: Integer to test.

    Returns:
        bool: True when ``n`` is prime; False otherwise, including for every
        ``n`` below 2.
    """
    if n < 2:
        return False
    if n == 2:
        # The only even prime; it must be accepted before the parity test.
        return True
    if n % 2 == 0:
        return False

    # math.isqrt is exact for arbitrarily large ints, whereas
    # int(math.floor(math.sqrt(n))) goes through a float and can be off by one.
    limit = math.isqrt(n)
    for divisor in range(3, limit + 1, 2):
        if n % divisor == 0:
            return False
    return True

def main():
    """Check every entry of PRIMES in worker processes and print each verdict.

    NOTE(review): the recorded run of this cell ended with BrokenProcessPool.
    Presumably the worker processes could not import is_prime because it is
    defined interactively in the notebook rather than in an importable module
    (the usual failure on platforms that spawn fresh interpreters) — confirm,
    and consider moving is_prime into a .py file that the notebook imports.
    """
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for number, prime in zip(PRIMES, executor.map(is_prime, PRIMES)):
            print('%d is prime: %s' % (number, prime))

# Guard so the pool is only started when this runs as the top-level script/cell.
if __name__ == '__main__':
    main()

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [11]:
import itertools

# Make the cell self-contained: tqdm is first imported in a later cell, so a
# fresh top-to-bottom run would otherwise fail here with NameError.
import time
from tqdm import tqdm

# perf_counter is the clock intended for measuring elapsed intervals.
st = time.perf_counter()
list1 = ['a', 'b', 'c'] * 100
list2 = ['d', 'e', 'f'] * 100

# Cartesian product of the two lists. set() collapses the duplicates created
# by the *100 repetition, leaving the 9 unique pairs.
combinations = set(itertools.product(list1, list2))

# Keep only the pairs that do not contain the string 'crab'. Neither list
# contains 'crab', so with these inputs nothing is removed.
filtered_combinations = [combo for combo in tqdm(combinations) if 'crab' not in combo]

# Print the filtered combinations
for combo in filtered_combinations:
    print(combo)
ed = time.perf_counter()
# Elapsed seconds for the whole cell body above.
print(ed-st)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<?, ?it/s]

('c', 'f')
('a', 'f')
('b', 'f')
('c', 'd')
('c', 'e')
('a', 'e')
('a', 'd')
('b', 'e')
('b', 'd')
0.005475282669067383





In [5]:
print(combinations)

[('a', 'd'), ('a', 'e'), ('a', 'f'), ('b', 'd'), ('b', 'e'), ('b', 'f'), ('c', 'd'), ('c', 'e'), ('c', 'f')]


In [7]:
import time

In [9]:
from tqdm import tqdm

In [24]:
import pandas as pd

In [22]:
import os

folder_path = 'datasets-KOSDAQ'  # Replace with the actual folder path

# The ticker is taken as the last TICKER_LEN characters of the file name
# without its extension, e.g. '000250 KS'.
TICKER_LEN = 9

file_names = []  # Tickers extracted from the file names

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of everything built from this list reproducible.
for file_name in tqdm(sorted(os.listdir(folder_path))):
    if os.path.isfile(os.path.join(folder_path, file_name)):
        # splitext drops the extension whatever its length, instead of
        # assuming it is exactly four characters long.
        stem, _ext = os.path.splitext(file_name)
        file_names.append(stem[-TICKER_LEN:])

print(file_names)

100%|███████████████████████████████████████████████████████████████████████████| 1623/1623 [00:00<00:00, 33119.05it/s]

['000250 KS', '000440 KS', '001000 KS', '001540 KS', '001810 KS', '001840 KS', '002230 KS', '002290 KS', '002680 KS', '002800 KS', '003100 KS', '003310 KS', '003380 KS', '003800 KS', '004590 KS', '004650 KS', '004780 KS', '005160 KS', '005290 KS', '005670 KS', '005710 KS', '005860 KS', '005990 KS', '006050 KS', '006140 KS', '006580 KS', '006620 KS', '006730 KS', '006910 KS', '006920 KS', '007330 KS', '007370 KS', '007390 KS', '007530 KS', '007680 KS', '007720 KS', '007770 KS', '007820 KS', '008290 KS', '008370 KS', '008470 KS', '008830 KS', '009300 KS', '009520 KS', '009620 KS', '009730 KS', '009780 KS', '010170 KS', '010240 KS', '010280 KS', '010470 KS', '011040 KS', '011080 KS', '011320 KS', '011370 KS', '011560 KS', '012340 KS', '012620 KS', '012700 KS', '012790 KS', '012860 KS', '013030 KS', '013120 KS', '013310 KS', '013720 KS', '013810 KS', '013990 KS', '014100 KS', '014190 KS', '014200 KS', '014470 KS', '014570 KS', '014620 KS', '014940 KS', '014970 KS', '015710 KS', '015750 KS'




In [271]:
# Start from an empty frame; its columns are added one at a time in the cells below.
kosdaq = pd.DataFrame()

In [272]:
# One row per entry of file_names (built by the directory-listing cell above).
kosdaq['ticker_bloomberg'] = file_names

In [278]:
# Same ticker with every space replaced by an underscore ('000250 KS' -> '000250_KS').
kosdaq['ticker_DB'] = kosdaq['ticker_bloomberg'].str.replace(' ', '_', regex=False)

In [279]:
kosdaq

Unnamed: 0,ticker_bloomberg,ticker_DB
0,000250 KS,000250_KS
1,000440 KS,000440_KS
2,001000 KS,001000_KS
3,001540 KS,001540_KS
4,001810 KS,001810_KS
...,...,...
1618,448830 KS,448830_KS
1619,449020 KS,449020_KS
1620,450050 KS,450050_KS
1621,450410 KS,450410_KS


In [280]:
# Drop the last three characters of each ticker ('000250 KS' -> '000250').
kosdq_mems = kosdaq['ticker_bloomberg'].str[:-3].tolist()

In [281]:
kosdq_mems

['000250',
 '000440',
 '001000',
 '001540',
 '001810',
 '001840',
 '002230',
 '002290',
 '002680',
 '002800',
 '003100',
 '003310',
 '003380',
 '003800',
 '004590',
 '004650',
 '004780',
 '005160',
 '005290',
 '005670',
 '005710',
 '005860',
 '005990',
 '006050',
 '006140',
 '006580',
 '006620',
 '006730',
 '006910',
 '006920',
 '007330',
 '007370',
 '007390',
 '007530',
 '007680',
 '007720',
 '007770',
 '007820',
 '008290',
 '008370',
 '008470',
 '008830',
 '009300',
 '009520',
 '009620',
 '009730',
 '009780',
 '010170',
 '010240',
 '010280',
 '010470',
 '011040',
 '011080',
 '011320',
 '011370',
 '011560',
 '012340',
 '012620',
 '012700',
 '012790',
 '012860',
 '013030',
 '013120',
 '013310',
 '013720',
 '013810',
 '013990',
 '014100',
 '014190',
 '014200',
 '014470',
 '014570',
 '014620',
 '014940',
 '014970',
 '015710',
 '015750',
 '016100',
 '016250',
 '016600',
 '016670',
 '016790',
 '016920',
 '017000',
 '017250',
 '017480',
 '017510',
 '017650',
 '017890',
 '018000',
 '018120',

In [282]:
def get_company_info(ticker, timeout=30):
    """Scrape the company name and GICS sector for one ticker.

    Fetches the stockmarketmba.com analysis page for ``KQ:<ticker>`` and
    extracts two values from its HTML.

    Args:
        ticker: Ticker code as a string, e.g. '000250'.
        timeout: Seconds to wait for the HTTP response before requests raises.

    Returns:
        tuple: ``(name, sector)``. Either element is None when it could not
        be located in the page.

    Raises:
        requests.exceptions.RequestException: if the HTTP request itself fails
        or times out.
    """
    url = f"https://stockmarketmba.com/analyze.php?s=KQ:{ticker}"
    # Without a timeout a stalled connection would block its worker forever.
    response = requests.get(url, timeout=timeout)
    html_text = BeautifulSoup(response.content, 'html.parser').prettify()

    # Sector: the text that follows "GICS sector:" up to the next tag.
    sector_match = re.search(r"GICS sector:\s+([^\<]+)", html_text)

    # Name: the first text node after the first link that follows the first <h3>.
    # Both lookups run on the prettified markup, so it is parsed again here to
    # keep the extracted values identical to what the regex sees.
    heading = BeautifulSoup(html_text, 'html.parser').find('h3')
    name_element = heading.find_next('a') if heading is not None else None

    name = None
    if name_element is not None:
        # NOTE(review): newer Beautiful Soup releases spell this argument
        # `string=`; `text=` is kept to match the installed version.
        name_text = name_element.find_next_sibling(text=True)
        # The link may have no text sibling at all; treat that as "not found"
        # instead of failing on None.strip().
        if name_text is not None:
            name = html.unescape(name_text.strip())

    sector = None
    if sector_match:
        sector = html.unescape(sector_match.group(1).strip())

    return name, sector

In [283]:
from joblib import Parallel, delayed

In [284]:
import requests
import html
from bs4 import BeautifulSoup
import re

In [285]:
# Fan the ticker lookups out over 12 joblib workers. The tqdm bar wraps the
# task generator, so it advances as tasks are handed to the pool rather than
# as they finish.
worker_pool = Parallel(n_jobs=12)
with worker_pool as parallel:
    results = parallel(
        delayed(get_company_info)(ticker) for ticker in tqdm(kosdq_mems)
    )





  0%|                                                                                         | 0/1623 [00:00<?, ?it/s][A[A[A[A



  1%|█▏                                                                              | 24/1623 [00:04<04:44,  5.62it/s][A[A[A[A



  2%|█▊                                                                              | 36/1623 [00:07<05:28,  4.83it/s][A[A[A[A



  3%|██▎                                                                             | 48/1623 [00:11<06:52,  3.82it/s][A[A[A[A



  4%|██▉                                                                             | 60/1623 [00:14<06:43,  3.87it/s][A[A[A[A



  4%|███▌                                                                            | 72/1623 [00:17<06:22,  4.05it/s][A[A[A[A



  5%|████▏                                                                           | 84/1623 [00:19<06:11,  4.15it/s][A[A[A[A



  6%|████▋                                         

 45%|███████████████████████████████████▋                                           | 732/1623 [02:51<03:35,  4.13it/s][A[A[A[A



 46%|████████████████████████████████████▏                                          | 744/1623 [02:53<03:31,  4.15it/s][A[A[A[A



 47%|████████████████████████████████████▊                                          | 756/1623 [02:56<03:26,  4.20it/s][A[A[A[A



 47%|█████████████████████████████████████▍                                         | 768/1623 [02:59<03:17,  4.33it/s][A[A[A[A



 48%|█████████████████████████████████████▉                                         | 780/1623 [03:01<03:05,  4.54it/s][A[A[A[A



 49%|██████████████████████████████████████▌                                        | 792/1623 [03:04<02:59,  4.63it/s][A[A[A[A



 50%|███████████████████████████████████████▏                                       | 804/1623 [03:06<03:02,  4.49it/s][A[A[A[A



 50%|███████████████████████████████████████▋          

 89%|█████████████████████████████████████████████████████████████████████▊        | 1452/1623 [05:43<00:38,  4.48it/s][A[A[A[A



 90%|██████████████████████████████████████████████████████████████████████▎       | 1464/1623 [05:45<00:35,  4.43it/s][A[A[A[A



 91%|██████████████████████████████████████████████████████████████████████▉       | 1476/1623 [05:49<00:34,  4.23it/s][A[A[A[A



 92%|███████████████████████████████████████████████████████████████████████▌      | 1488/1623 [05:52<00:32,  4.21it/s][A[A[A[A



 92%|████████████████████████████████████████████████████████████████████████      | 1500/1623 [05:54<00:29,  4.15it/s][A[A[A[A



 93%|████████████████████████████████████████████████████████████████████████▋     | 1512/1623 [05:57<00:26,  4.16it/s][A[A[A[A



 94%|█████████████████████████████████████████████████████████████████████████▏    | 1524/1623 [06:00<00:23,  4.24it/s][A[A[A[A



 95%|██████████████████████████████████████████████████

In [286]:
# Split the (name, sector) pairs returned by the scrape into two parallel lists.
name_list, sector_list = [], []

for pair in tqdm(results):
    name_list.append(pair[0])
    sector_list.append(pair[1])





100%|█████████████████████████████████████████████████████████████████████████| 1623/1623 [00:00<00:00, 1160872.34it/s][A[A[A[A


In [287]:
# Independent snapshot of the frame at this point; kosdaq itself keeps being
# modified in the cells below.
kosdaq2 = kosdaq.copy()

In [291]:
kosdaq

Unnamed: 0,ticker_bloomberg,ticker_DB,name,gics_sector
0,000250 KS,000250_KS,Sam Chun Dang Pharm Ltd,Health Care
1,000440 KS,000440_KS,tool.,
2,001000 KS,001000_KS,tool.,
3,001540 KS,001540_KS,tool.,
4,001810 KS,001810_KS,tool.,
...,...,...,...,...
1618,448830 KS,448830_KS,tool.,
1619,449020 KS,449020_KS,tool.,
1620,450050 KS,450050_KS,tool.,
1621,450410 KS,450410_KS,tool.,


In [290]:
# Attach the scraped values as new columns. name_list / sector_list line up
# with the rows by position: they come from results, which was produced from
# kosdq_mems, itself derived from kosdaq['ticker_bloomberg'].
kosdaq['name'] = name_list
kosdaq['gics_sector'] = sector_list

In [292]:
# Sector labels that are treated as unusable: rows carrying one of them are
# flagged for a second lookup and the label is later replaced by None.
other_sector = [
    'Unknown',
    'Communication',
    'SEMICONDUCTORS & SEMICONDUCTOR EQUIPMEN',
    'Household & Personal Products',
    'Technology Hardware & Equipment',
    'Other',
    'Health Care Equipment & Services',
    'Food Beverage & Tobacco',
]

In [293]:
# Rows that need a second lookup: the name is missing (None/NaN, which is what
# the scraper returns when it finds nothing), empty, or the literal 'tool.'
# seen in the scraped output; or the sector label is listed in other_sector.
name_missing = kosdaq['name'].isna() | kosdaq['name'].isin(['', 'tool.'])
sector_unusable = kosdaq['gics_sector'].isin(other_sector)
error = kosdaq[name_missing | sector_unusable]

In [294]:
error

Unnamed: 0,ticker_bloomberg,ticker_DB,name,gics_sector
1,000440 KS,000440_KS,tool.,
2,001000 KS,001000_KS,tool.,
3,001540 KS,001540_KS,tool.,
4,001810 KS,001810_KS,tool.,
5,001840 KS,001840_KS,tool.,
...,...,...,...,...
1618,448830 KS,448830_KS,tool.,
1619,449020 KS,449020_KS,tool.,
1620,450050 KS,450050_KS,tool.,
1621,450410 KS,450410_KS,tool.,


In [295]:
# Index labels of the flagged rows, used later to write the corrected names back.
error_index = error.index.tolist()

In [296]:
# Tickers of the flagged rows with their last three characters removed
# ('000440 KS' -> '000440').
error_list = error['ticker_bloomberg'].str[:-3].tolist()

In [297]:
error_list

['000440',
 '001000',
 '001540',
 '001810',
 '001840',
 '002230',
 '002290',
 '002680',
 '002800',
 '003100',
 '003310',
 '003800',
 '004590',
 '004650',
 '004780',
 '005160',
 '005670',
 '005710',
 '005860',
 '006050',
 '006140',
 '006580',
 '006620',
 '006910',
 '006920',
 '007330',
 '007370',
 '007530',
 '007680',
 '007720',
 '007770',
 '007820',
 '008290',
 '008370',
 '008470',
 '008830',
 '009300',
 '009520',
 '009620',
 '009730',
 '009780',
 '010240',
 '010280',
 '010470',
 '011040',
 '011080',
 '011320',
 '011370',
 '011560',
 '012340',
 '012620',
 '012700',
 '012790',
 '012860',
 '013310',
 '013720',
 '013810',
 '013990',
 '014100',
 '014190',
 '014200',
 '014470',
 '014570',
 '014940',
 '014970',
 '015710',
 '016250',
 '016600',
 '016670',
 '016790',
 '016920',
 '017000',
 '017250',
 '017480',
 '017510',
 '017650',
 '017890',
 '018120',
 '018290',
 '018310',
 '018620',
 '018680',
 '018700',
 '019010',
 '019210',
 '019540',
 '019550',
 '019570',
 '019590',
 '019660',
 '019770',

In [None]:
def get_company_info2(ticker, timeout=30):
    """Look up a company name on digital.mk.co.kr for one ticker.

    Sleeps one second, requests the yearbook page for ``ticker`` and reads the
    4th cell of the 3rd row of the first <tbody>. The first of the suffix
    spellings '.,Ltd.', '., Ltd.' or '., LTD.' found in the name is replaced
    by ' Ltd'.

    Args:
        ticker: Ticker code as a string, e.g. '000440'.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        str or None: The cleaned name, or None when the expected table cell is
        absent or any error occurs while fetching or parsing.
    """
    time.sleep(1)  # presumably to throttle requests to the site
    try:
        url = f"https://digital.mk.co.kr/yearbook/stock_company.php?TM=STOCK&MM=S1&CC={ticker}"
        # Without a timeout a stalled connection would block its worker forever.
        response = requests.get(url, timeout=timeout)
        html_text = BeautifulSoup(response.content, 'html.parser').prettify()
        soup = BeautifulSoup(html_text, 'html.parser')

        tbody = soup.find('tbody')
        if tbody is None:
            return None

        tr_elements = tbody.find_all('tr')
        if len(tr_elements) < 3:
            return None

        td_elements = tr_elements[2].find_all('td')
        if len(td_elements) < 4:
            return None

        name = td_elements[3].text.strip()
        # Only the first matching spelling is rewritten.
        for suffix in ('.,Ltd.', '., Ltd.', '., LTD.'):
            if suffix in name:
                name = name.replace(suffix, ' Ltd')
                break
        return name

    except Exception:
        # Best effort: a failure for one ticker yields None instead of aborting
        # the whole run. Catching Exception rather than using a bare except
        # lets KeyboardInterrupt / SystemExit propagate.
        return None

In [None]:
# Retry the flagged tickers against the second source on 12 joblib workers.
# The tqdm bar wraps the task generator, so it tracks dispatch, not completion.
worker_pool = Parallel(n_jobs=12)
with worker_pool as parallel:
    results = parallel(
        delayed(get_company_info2)(ticker) for ticker in tqdm(error_list)
    )

In [None]:
len(results)

In [None]:
# get_company_info2 returns None for failed lookups, and `'.,LTD' in None`
# raises TypeError, so None entries are passed through untouched.
# str.replace already leaves names without '.,LTD' unchanged.
results2 = [x.replace('.,LTD', ' Ltd') if x is not None else x for x in results]

In [None]:
# Strings have no .isin() method (that is a pandas Series method), and the
# list also contains None for failed lookups; guard on None and let
# str.replace leave names without '.,LTD' unchanged.
results2 = [x.replace('.,LTD', ' Ltd') if x is not None else x for x in results]

In [None]:
# None entries (failed lookups) would make `'.,LTD' in x` raise TypeError, so
# they are passed through; str.replace is a no-op when '.,LTD' is absent.
results2 = [x.replace('.,LTD', ' Ltd') if x is not None else x for x in results]


In [None]:
# Normalise company-suffix spellings to ' Ltd'. Only the first matching
# spelling is applied to each name; None entries (failed lookups) pass through.
# The original chain tested '.,LTD' twice, so its third branch could never
# run; the three distinct spellings below cover every reachable case.
LTD_SPELLINGS = ('.,LTD', '., Ltd', '., LTD')

results2 = []
for x in results:
    cleaned = x
    if x is not None:
        for spelling in LTD_SPELLINGS:
            if spelling in x:
                cleaned = x.replace(spelling, ' Ltd')
                break
    results2.append(cleaned)

In [None]:
# Final tidy-up of the names: missing or empty names become None, and a single
# trailing '.' is removed.
results3 = []
for x in results2:
    # Compare by value: `is ''` / `is '.'` test object identity, which is not
    # guaranteed for equal strings, and an empty string that slipped past the
    # identity test would then fail on x[-1].
    if x is None or x == '':
        results3.append(None)
    elif x.endswith('.'):
        results3.append(x[:-1])
    else:
        results3.append(x)

In [None]:
len(results3)

In [None]:
results3

In [298]:
# Write the re-scraped names back into the flagged rows. error_index holds
# index labels taken from error.index, so rows are addressed with .loc.
# A single .loc[row, column] assignment writes into kosdaq itself, whereas
# kosdaq.iloc[i]['name'] = ... first extracts a row and may only modify a
# temporary copy.
for i, name in zip(error_index, results3):
    kosdaq.loc[i, 'name'] = name

In [300]:
kosdaq.head(100)

Unnamed: 0,ticker_bloomberg,ticker_DB,name,gics_sector
0,000250 KS,000250_KS,Sam Chun Dang Pharm Ltd,Health Care
1,000440 KS,000440_KS,Joong Ang Enervis Co Ltd,
2,001000 KS,001000_KS,SillaTextile,
3,001540 KS,001540_KS,AHN-GOOK PHARMACEUTICAL Co Ltd,
4,001810 KS,001810_KS,MOORIM SP CO Ltd,
...,...,...,...,...
95,018700 KS,018700_KS,Barunson,
96,019010 KS,019010_KS,VenueG Co Ltd,
97,019210 KS,019210_KS,YG-1 CO Ltd,
98,019540 KS,019540_KS,ILJI TECHNOLOGY CO Ltd,


In [308]:
# Save the member table without the row index column.
kosdaq.to_csv('KOSDAQ_members.csv', index=False)

In [304]:
# Replace every sector label that is listed in other_sector with None and keep
# all other values as they are.
sector = [
    x if x not in other_sector else None
    for x in kosdaq['gics_sector']
]

In [306]:
# Overwrite the column with the cleaned labels computed in the previous cell.
kosdaq['gics_sector'] = sector

In [307]:
kosdaq['gics_sector'].value_counts()

Information Technology    68
Health Care               59
Materials                 15
Consumer Staples          13
Consumer Discretionary    12
Industrials               10
Communication Services     4
Financials                 3
Real Estate                1
Name: gics_sector, dtype: int64