In [1]:
#basis
import datetime
from	datetime import datetime as dt
import time

#debug
import pickle

#util
import subprocess
import configparser
import logging
import csv
import socket
import time

#crawl
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# mail 
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.utils import formatdate

from timeout_decorator import timeout, TimeoutError

# cron is run when current dir. pls move to your dir 
DB_CONFIG		= 'mail.conf' 
ELICA_CONFIG	= 'elica.conf' 

ENV = 'DEV'
SEAVICE_NAME = 'ELICA_EDINET'

logging.basicConfig(level=logging.INFO,format='%(asctime)s:%(message)s')

#独自
class NotFoundElementException(Exception):
	pass

class AccessFailedException(Exception):
	pass

class UTIL:

	def __init__(self,version,service_name):

		# order should be kept
		####	start
		self.version = version
		self.service_name =service_name
		self.set_logger()
		#### end

		# 
		self.dt_init = dt.now()
		self.get_machine_localname()

	def set_logger(self):
		self.logger = logging.getLogger(self.service_name)

	def read_config(self,path_conf):
		logging.info(path_conf)
		config = configparser.ConfigParser()
		config.read(path_conf,encoding='utf-8')
		target_config = config[self.version]
		if target_config is None:
			return False
		else: 		
			self.config = target_config
			return True

	def set_time(self):
		self.now = datetime.datetime.now()

	def run_cmd(self,cmd):
		try:
			subprocess.call(cmd.split())	
		except Exception as e:
			self.logger.warning('[NG]Cannot execute cmd: %s', e)
			return False
		return True

	def get_machine_localname(self):
		try:
			hostname = socket.gethostname()
			if hostname == 'Macico.local':
				self.hostname = 'macico'
			elif hostname == 'elica03':
				self.hostname = 'gcp'
			else:
				self.hostname = 'unknown'
				raise NotFoundElementException('Cannot find host in known host list : '+hostname)

			self.logger.info('[OK]Set host: %s', hostname)

		except NotFoundElementException as e:
			self.logger.warning('[NG]Set unknown host: %s', e)

		except Exception as e:
			self.logger.warning('[NG]Cannot find host for unknown error: %s', e)
			return False

		return True


In [None]:
class DBSentConf:
    def __init__():
        self.util = UTIL(version=ENV,service_name=SEAVICE_NAME)
		self.util.read_config(DB_SENTENCE_CONFIG)
    

In [None]:
import pymysql.cursors
class DBOperator:
    def __init__():
		#set log
		self.util = UTIL(version=ENV,service_name=SEAVICE_NAME)
		self.util.read_config(DB_CONFIG)
    
    def get_connection(self);
        conn = pymysql.connect(
            user=self.util.config.get('USER'),
            passwd=self.util.config.get('PASSWORD'),
            host=self.util.config.get('HOST'),
            db=self.util.config.get('NAME')
        )
        logging.info("[DONE]Get connection")
        return conn
    
    def close_connection(self,conn);
        try:
            conn.close()
        except:
            pass
            
        
    def select(self,str_sql, params):
        conn = self.get_connection()
        result = None
        try:
            with conn.cursor() as cursor:
                cursor.execute(str_sql, params)
                result = cursor.fetchall()
                logging.info("[DONE]Select")
        finally:
            close_connection(conn)
        return result

    def insert(self,str_sql, params):
        conn = self.get_connection()
        try:
            with conn.cursor() as cursor:
                cursor.execute(str_sql, params)
            conn.commit()
            logging.info("[DONE]Insert")
        finally:
            close_connection(conn)



In [2]:
class ELICA:
	def __init__(self):

		#True/False
		self.headless = True

		#set log
		self.util = UTIL(version=ENV,service_name=SEAVICE_NAME)
		self.util.read_config(ELICA_CONFIG)

		# sec
		self.default_timeout_sec = 200

		# history
		self.ls_url_sucess = []
		self.ls_url_failed = []



		self.browser_height = self.util.config.get['BROWSER_HEIGHT']
		self.browser_width  = self.util.config.get['BROWSER_WIDTH']

		#driver
		self.set_driver()

	def set_driver(self):
		options = webdriver.ChromeOptions()
		if self.headless:
			# options.add_argument('--headless')
			options.add_argument('--window-size='+str(self.browser_width)+','+str(self.browser_height))
			options.add_argument('--disable-gpu')
			options.add_argument('--disable-infobars')
			options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36")
		if self.util.hostname == 'gcp':
			self.driver = webdriver.Chrome(options=options,executable_path="/home/kotetsu219specialpartner/bin/chromedriver")
		if self.util.hostname == 'macico':
			self.driver = webdriver.Chrome(options=options) 

		return True

	def del_driver(self):
		try:
			self.driver.quit()
			self.util.logger.info('[OK]Driver was killed')
		except Exception as e:
			self.util.logger.warning('[NG]Driver wasn\'t killed')
			return False

		try:
			self.util.run_cmd('pkill -9 chromedriver')
			self.util.logger.info('[OK]chromedriver process was killed')
		except Exception as e:
			self.util.logger.warning('[NG]chromedriver process wasn\'t killed:')
			return False

		return True

	def scroll_page(self, width, height):
		try:
			self.driver.execute_script("window.scrollTo(" + str(width) + "+, " + str(height) + ");")
			self.util.logger.info('[OK]Scrolled')	
		except Exception as e:
			self.util.logger.info('[NG]Cannot scroll')
			return False
		return True

	def stop_driver(self,time_for_wait):
		try:
			self.util.logger.info('[・]Driver enter wait time : %d',time_for_wait)
			time.sleep(time_for_wait)
			self.util.logger.info('[OK]Driver restart from wait time')
		except Exception as e:
			self.util.logger.warning('[NG]Driver occur error in waiting: %s',e)
			return False
		return True


	def to_page(self,url,**kwargs):

		if 'waittime' in kwargs.keys():
			waittime = kwargs['waittime']
		else:
			waittime = 0			
		if 'timeout_sec' in kwargs.keys():
			timeout_sec = kwargs['timeout_sec']
		else:
			timeout_sec = 120

		@timeout(timeout_sec)
		def access_page(url):
			self.driver.get(url)

		try:
			self.util.logger.info('[・]Accessing Web page.. : %s',url)
			access_page(url)
			self.util.logger.info('[OK]Driver get web page: %s',url)
			self.ls_url_sucess.append(url)
		except TimeoutError :
			self.util.logger.warning('[NG]Driver cannot get web page for timeout: %s',url)
			self.ls_url_failed.append(url)
			return False
		except Exception as e:
			self.util.logger.warning('[NG]Driver occur error in access page: %s',e)	
			self.ls_url_failed.append(url)
			return False

		if waittime>0:
			self.stop_driver(waittime)

		return True



In [3]:
agent = ELICA()

2021-05-20 20:02:09,671:[OK]Set host: Macico.local
2021-05-20 20:02:09,680:elica.conf


In [4]:
import re
bar_pattern = '.*[-ー－―]+'
void_dt_pattern = '[月日\s]'
def to_search_results(current_page_no=0,go_to_page=1):
    assert go_to_page!=0, "Cannnot move myself"
    assert (current_page_no-go_to_page)>0,  "page_no must be higher than min."
    upper_pager = agent.driver.find_elements(By.XPATH, "//p[@class='pageLink']")[0] # get upper
    page_link_btns = upper_pager.find_elements(By.XPATH, "span[not(contains(@class,'current'))]")
    n_pages = len(page_link_btns)+1
    assert len(page_link_btns) > (current_page_no+go_to_page), "page_no must be lower than max."
    new_age_no = current_page_no+go_to_page
    page_link_btns[current_page_no+go_to_page].click()
    return new_age_no,n_pages
    
def get_search_result_rows():
    search_result_table_parent = agent.driver.find_elements(By.XPATH, "//div[@class='result']")[0] # get resutl
    search_result_table = search_result_table_parent.find_elements(By.XPATH, "table")[0] #get table 
    search_result_rows = search_result_table.find_elements(By.XPATH, "tbody/tr[not(contains(@class,'tableHeader'))]") #get row
    return search_result_rows

def analyze_search_result_row(row):
    tds = row.find_elements(By.XPATH, "td")
    submit_date = tds[0].text
    detail_link_obj = tds[1]
    edinet_code = tds[2].text
    submitter_name= tds[3].text
    pdf_link= tds[5].find_element_by_tag_name('a').get_attribute("href")
    report_name =  detail_link_obj.text
    logging.info("{0}-{1}".format(edinet_code,submitter_name))
    row_result = {'submit_date':submitter_name,'report_name':report_name, 'edinet_code':edinet_code, 'submitter_name':submitter_name,'pdf_link':pdf_link,'detail_link_obj':detail_link_obj }
    return row_result


def to_content_in_detail():
    # Action to display Content
    agent.driver.switch_to.frame(agent.driver.find_elements(By.XPATH, "//frame")[1])
    agent.driver.switch_to.frame(agent.driver.find_elements(By.XPATH, "//frame")[1])

    leftbar_body = agent.driver.find_elements(By.XPATH, "//body")[0]
    fetch_content_btn = agent.driver.find_elements(By.XPATH, "//a[contains(text(), '取得状況')]")[0]
    fetch_content_btn.click()
    agent.driver.switch_to.default_content()
    time.sleep(1)

    # Action to move onto Content
    agent.driver.switch_to.frame(agent.driver.find_elements(By.XPATH, "//frame")[1])
    agent.driver.switch_to.frame(agent.driver.find_elements(By.XPATH, "//frame")[2])

    content = agent.driver.find_elements(By.XPATH, "//body")[0]
    return content

def scrtutinize_contetnt_in_detail(content):
    progress_reports = content.find_elements(By.XPATH, "//*[contains(text(), '報告月における取得自己株式')]")
    content_result = {}
    for type_no in  range(len(progress_reports)):
        # Expected: 株主総会決議による取得の状況】 and 【取締役会決議による取得の状況】
        content_result[type_no] = None

        #find table obj ( it is parent obj)
        parent_report_table =progress_reports[type_no]
        cannot_find_record_tr =False
        while parent_report_table.tag_name != 'tr':
            try:
                # If html tag, error
                parent_report_table = parent_report_table.find_element_by_xpath('..')
            except:
                cannot_find_record_tr = True
                break
        if cannot_find_record_tr:
            break

        # Search base on upper level tr
        parent_report_table = parent_report_table.find_element_by_xpath('..')
        report_rows = parent_report_table.find_elements_by_xpath('tr')
        report_result = {'daily':[],'total':None}
        logging.info("Find {0} rows. Look up rows".format(len(report_rows)))
        for i_report_row in range(len(report_rows)):
            report_row = report_rows[i_report_row]
            qty, value = None, None
            sum_qty, sum_value = None, None
            report_cols = report_row.find_elements_by_xpath('td')

            if "日現在" in report_cols[0].text:
                continue
            if len(report_cols) <=1:
                continue

            date_col_txt = report_cols[1].text
            if "月" in date_col_txt:
                if "日" in date_col_txt:
                    if  re.match(bar_pattern, date_col_txt) is None and re.sub(void_dt_pattern, '', date_col_txt) != '':
                        month = str(int(date_col_txt.split('月')[0])).zfill(2)
                        day = str(int(date_col_txt.split('月')[1].split('日')[0])).zfill(2)
                        dt = month+day
                        qty = int(report_cols[2].text.replace(',', ''))
                        value = int(report_cols[3].text.replace(',', ''))
                        if qty is not None:
                            report_result['daily'].append({'date':dt,'qty':qty,'value':value})
                            logging.info("Data: {0}".format(report_result['daily'][-1]))
                        else:
                            logging.info("Data: No data")

            #　計がきたらおしまい（ループ）
            if report_row.text.startswith("計") and report_row.tag_name == 'tr':
                report_cols = report_row.find_elements_by_xpath('td')
                qty_col_txt = report_cols[2].text
                if  re.match(bar_pattern, qty_col_txt) is None:
                    sum_qty = int(report_cols[2].text.replace(',', ''))
                    sum_value = int(report_cols[3].text.replace(',', ''))
                    if sum_qty is not None:
                        report_result['total'] = {'qty':sum_qty,'value':sum_value}
                        logging.info("Sum : {0}".format(report_result['total']))
                    else:
                        logging.info("Sum: No data")
                break
        report_result['daily_total'] = len(report_result['daily'])
        content_result[type_no] = report_result
        logging.info("Daily report: {0} rows are fetched.".format(len(report_result['daily'])))
    return content_result





In [5]:
# agent.to_page("https://disclosure.edinet-fsa.go.jp/E01EW/BLMainController.jsp?uji.verb=W1E63020CXW1E6A020DSPSch&uji.bean=ee.bean.parent.EECommonSearchBean&PID=W1E63020&TID=W1E63021&SESSIONKEY=1621250924027&lgKbn=2&pkbn=0&skbn=0&dskb=&dflg=0&iflg=0&preId=1&row=100&idx=0&syoruiKanriNo=&sec=&scc=&shb=&snm=&spf1=1&spf2=1&iec=&icc=&inm=&spf3=1&fdc=&fnm=&spf4=1&spf5=2&otd=220&cal=1&era=R&yer=&mon=&psr=1&pfs=5")
# page_no = 0 
# search_result_rows = get_search_result_rows()
# i_row=0
# row_result = analyze_search_result_row(search_result_rows[i_row])
# row_result['detail_link_obj'].click()
# agent.driver.switch_to.window(agent.driver.window_handles[1])

# content=to_content_in_detail()
# content_result = scrtutinize_contetnt_in_detail(content)

In [None]:
agent.to_page("https://disclosure.edinet-fsa.go.jp/E01EW/BLMainController.jsp?uji.verb=W1E63020CXW1E6A020DSPSch&uji.bean=ee.bean.parent.EECommonSearchBean&PID=W1E63020&TID=W1E63021&SESSIONKEY=1621250924027&lgKbn=2&pkbn=0&skbn=0&dskb=&dflg=0&iflg=0&preId=1&row=100&idx=0&syoruiKanriNo=&sec=&scc=&shb=&snm=&spf1=1&spf2=1&iec=&icc=&inm=&spf3=1&fdc=&fnm=&spf4=1&spf5=2&otd=220&cal=1&era=R&yer=&mon=&psr=1&pfs=5")
time.sleep(2)
page_no = 0 
while True:
    search_result_rows = get_search_result_rows()
    for i_row in range(len(search_result_rows)):
        try: 
            row_result = analyze_search_result_row(search_result_rows[i_row])
            if row_result['report_name'] != '自己株券買付状況報告書（法２４条の６第１項に基づくもの）':
                continue
            row_result['detail_link_obj'].click()
            time.sleep(3)
            try:
                # Entry detail window
                agent.driver.switch_to.window(agent.driver.window_handles[1])
                row_result['detail_url']= agent.driver.current_url
                try:
                    content=to_content_in_detail()
                    content_result = scrtutinize_contetnt_in_detail(content)
                    logging.info("Analyze Done: {0}-{1}".format(row_result['submit_date'],row_result['edinet_code']))
                    row_result['detail_result'] = content_result
                    # close detail
                except Exception as e :
                    logging.warning("Detail screen error(L3): {0}".format(e),exc_info=True)
                finally:
                    agent.driver.switch_to.default_content()
                    agent.driver.close() 

            except Exception as e :
                logging.warning("Detail screen error(L2): {0}".format(e),exc_info=True)
            finally:
                #　return search  result
                handle_array = agent.driver.window_handles
                agent.driver.switch_to.window(handle_array[0])

        except Exception as e :
            logging.warning("Detail screen error(L1): {0}".format(e))
        
    logging.info("Done: {0}-{1}".format(row_result['submit_date'],row_result['edinet_code']))
    if page_no+1 == n_pages:
        break
    page_no, n_pages = to_search_results(page_no,1)
    break

2021-05-20 20:02:22,357:[・]Accessing Web page.. : https://disclosure.edinet-fsa.go.jp/E01EW/BLMainController.jsp?uji.verb=W1E63020CXW1E6A020DSPSch&uji.bean=ee.bean.parent.EECommonSearchBean&PID=W1E63020&TID=W1E63021&SESSIONKEY=1621250924027&lgKbn=2&pkbn=0&skbn=0&dskb=&dflg=0&iflg=0&preId=1&row=100&idx=0&syoruiKanriNo=&sec=&scc=&shb=&snm=&spf1=1&spf2=1&iec=&icc=&inm=&spf3=1&fdc=&fnm=&spf4=1&spf5=2&otd=220&cal=1&era=R&yer=&mon=&psr=1&pfs=5
2021-05-20 20:02:27,363:[OK]Driver get web page: https://disclosure.edinet-fsa.go.jp/E01EW/BLMainController.jsp?uji.verb=W1E63020CXW1E6A020DSPSch&uji.bean=ee.bean.parent.EECommonSearchBean&PID=W1E63020&TID=W1E63021&SESSIONKEY=1621250924027&lgKbn=2&pkbn=0&skbn=0&dskb=&dflg=0&iflg=0&preId=1&row=100&idx=0&syoruiKanriNo=&sec=&scc=&shb=&snm=&spf1=1&spf2=1&iec=&icc=&inm=&spf3=1&fdc=&fnm=&spf4=1&spf5=2&otd=220&cal=1&era=R&yer=&mon=&psr=1&pfs=5
2021-05-20 20:02:29,464:E31248-株式会社ファーストロジック
2021-05-20 20:02:34,318:Find 7 rows. Look up rows
2021-05-20 20:02:34,52

Collecting PyMySQL
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 741 kB/s eta 0:00:01
[?25hInstalling collected packages: PyMySQL
Successfully installed PyMySQL-1.0.2
You should consider upgrading via the '/Users/macico/.pyenv/versions/3.7.6/envs/py37b/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [16]:
agent.to_page("https://disclosure.edinet-fsa.go.jp/E01EW/BLMainController.jsp?uji.verb=W1E63020CXW1E6A020DSPSch&uji.bean=ee.bean.parent.EECommonSearchBean&PID=W1E63020&TID=W1E63021&SESSIONKEY=1621250924027&lgKbn=2&pkbn=0&skbn=0&dskb=&dflg=0&iflg=0&preId=1&row=100&idx=0&syoruiKanriNo=&sec=&scc=&shb=&snm=&spf1=1&spf2=1&iec=&icc=&inm=&spf3=1&fdc=&fnm=&spf4=1&spf5=2&otd=220&cal=1&era=R&yer=&mon=&psr=1&pfs=5")


2021-05-20 19:20:13,862:[・]Accessing Web page.. : https://disclosure.edinet-fsa.go.jp/E01EW/BLMainController.jsp?uji.verb=W1E63020CXW1E6A020DSPSch&uji.bean=ee.bean.parent.EECommonSearchBean&PID=W1E63020&TID=W1E63021&SESSIONKEY=1621250924027&lgKbn=2&pkbn=0&skbn=0&dskb=&dflg=0&iflg=0&preId=1&row=100&idx=0&syoruiKanriNo=&sec=&scc=&shb=&snm=&spf1=1&spf2=1&iec=&icc=&inm=&spf3=1&fdc=&fnm=&spf4=1&spf5=2&otd=220&cal=1&era=R&yer=&mon=&psr=1&pfs=5
2021-05-20 19:20:16,849:[OK]Driver get web page: https://disclosure.edinet-fsa.go.jp/E01EW/BLMainController.jsp?uji.verb=W1E63020CXW1E6A020DSPSch&uji.bean=ee.bean.parent.EECommonSearchBean&PID=W1E63020&TID=W1E63021&SESSIONKEY=1621250924027&lgKbn=2&pkbn=0&skbn=0&dskb=&dflg=0&iflg=0&preId=1&row=100&idx=0&syoruiKanriNo=&sec=&scc=&shb=&snm=&spf1=1&spf2=1&iec=&icc=&inm=&spf3=1&fdc=&fnm=&spf4=1&spf5=2&otd=220&cal=1&era=R&yer=&mon=&psr=1&pfs=5


True

In [21]:
result = []
# get rows
table_parent = agent.driver.find_elements(By.XPATH, "//div[@class='result']")[0] # get resutl
table = table_parent.find_elements(By.XPATH, "table")[0] #get table 
rows = table.find_elements(By.XPATH, "tbody/tr[not(contains(@class,'tableHeader'))]") #get row
i_row=0
target_content = rows[i_row].find_elements(By.XPATH, "td")
target_content_submit_date = target_content[0].text
target_content_link_obj = target_content[1]
target_content_edinet_code = target_content[2].text
target_content_submitter_name= target_content[3].text
target_content_pdf_link= target_content[5].find_element_by_tag_name('a').get_attribute("href")
col_text =  target_content_link_obj.text
logging.info("{0}-{1}".format(target_content_edinet_code,target_content_submitter_name))
# if col_text != '自己株券買付状況報告書（法２４条の６第１項に基づくもの）':

progress_report_result = {'submit_date':target_content_submit_date, 'edinet_code':target_content_edinet_code, 'pdf_link':target_content_pdf_link}
# press and move activationn
target_content_link_obj.click()
time.sleep(3)
# try:
# Entry detail window
agent.driver.switch_to.window(agent.driver.window_handles[1])
progress_report_result['detail_url']= agent.driver.current_url
# try:
# Activate detail window

# Action to display Ccontent
agent.driver.switch_to.frame(agent.driver.find_elements(By.XPATH, "//frame")[1])
agent.driver.switch_to.frame(agent.driver.find_elements(By.XPATH, "//frame")[1])

leftbar_body = agent.driver.find_elements(By.XPATH, "//body")[0]
fetch_content_btn = agent.driver.find_elements(By.XPATH, "//a[contains(text(), '取得状況')]")[0]
fetch_content_btn.click()
agent.driver.switch_to.default_content()

# Action to move onto Ccontent
time.sleep(2)
agent.driver.switch_to.frame(agent.driver.find_elements(By.XPATH, "//frame")[1])
agent.driver.switch_to.frame(agent.driver.find_elements(By.XPATH, "//frame")[2])

report_content = agent.driver.find_elements(By.XPATH, "//body")[0]

# Scrtutinize contetnt
progress_reports = report_content.find_elements(By.XPATH, "//*[contains(text(), '報告月における取得自己株式')]")
# for type_no in  range(len(progress_reports)):
#     # Expected: 株主総会決議による取得の状況】 and 【取締役会決議による取得の状況】
#     progress_report_result[type_no] = None

#     #find table obj ( it is parent obj)
#     parent_report_table =progress_reports[type_no]
#     cannot_find_record_tr =False
#     while parent_report_table.tag_name != 'tr':
#         try:
#             # If html tag, error
#             parent_report_table = parent_report_table.find_element_by_xpath('..')
#         except:
#             cannot_find_record_tr = True
#             break
#     if cannot_find_record_tr:
#         break

#     # Search base on upper level tr
#     parent_report_table = parent_report_table.find_element_by_xpath('..')
#     report_rows = parent_report_table.find_elements_by_xpath('tr')
#     report_result = {'daily':[],'total':None}
#     logging.info("Find {0} rows. Look up rows".format(len(report_rows)))
#     for i_report_row in range(len(report_rows)):
#         report_row = report_rows[i_report_row]
#         qty, value = None, None
#         sum_qty, sum_value = None, None
#         report_cols = report_row.find_elements_by_xpath('td')

#         if "日現在" in report_cols[0].text:
#             continue
#         if len(report_cols) <=1:
#             for _node in report_cols:
#                 logging.info("P1:{0}".format(_node.text))
#             continue

#         date_col_txt = report_cols[1].text
#         if "月" in date_col_txt:
#             if "日" in date_col_txt:
#                 if  re.match(bar_pattern, date_col_txt) is None and re.sub(void_dt_pattern, '', date_col_txt) != '':
#                     month = str(int(date_col_txt.split('月')[0])).zfill(2)
#                     day = str(int(date_col_txt.split('月')[1].split('日')[0])).zfill(2)
#                     dt = month+day
#                     qty = int(report_cols[2].text.replace(',', ''))
#                     value = int(report_cols[3].text.replace(',', ''))
#                     if qty is not None:
#                         report_result['daily'].append({'date':dt,'qty':qty,'value':value})
#                         logging.info("Data: {0}".format(report_result['daily'][-1]))
#                     else:
#                         logging.info("Data: No data")

#         #　計がきたらおしまい（ループ）
#         if report_row.text.startswith("計") and report_row.tag_name == 'tr':
#             report_cols = report_row.find_elements_by_xpath('td')
#             qty_col_txt = report_cols[2].text
#             if  re.match(bar_pattern, qty_col_txt) is None:
#                 sum_qty = int(report_cols[2].text.replace(',', ''))
#                 sum_value = int(report_cols[3].text.replace(',', ''))
#                 if sum_qty is not None:
#                     report_result['total'] = {'qty':sum_qty,'value':sum_value}
#                     logging.info("Sum : {0}".format(report_result['total']))
#                 else:
#                     logging.info("Sum: No data")
#             break
#     report_result['daily_total'] = len(report_result['daily'])
#     progress_report_result[type_no] = report_result
#     logging.info("Daily report: {0} rows are fetched.".format(len(report_result['daily'])))

# result.append(progress_report_result)
# logging.info("Done: {0}-{1}".format(progress_report_result['submit_date'],progress_report_result['edinet_code']))
# # close detail
# except Exception as e :
# logging.warning("Detail screen error(L3): {0}".format(e),exc_info=True)
# finally:
# agent.driver.switch_to.default_content()
# agent.driver.close() 

# except Exception as e :
# logging.warning("Detail screen error(L2): {0}".format(e),exc_info=True)
# finally:
# #　return search  result
# handle_array = agent.driver.window_handles
# agent.driver.switch_to.window(handle_array[0])

# except Exception as e :
# logging.warning("Detail screen error(L1): {0}".format(e))

In [9]:
import pandas as pd
rr = result[0]

def get_detail_table_result_df(detail_result):
    key_list = detail_result.keys
    key_list.remove('edinet_code')
    key_list.remove('submit_date')
    dfs = []
    for _key in key_list
        daily_df = pd.DataFrame(detail_result[_key]['daily'])
        daily_df['edinet_code'] = detail_result['edinet_code']
        daily_df['submit_date'] = detail_result['submit_date']
        dfs.append(daily_df)
    df = pd.concat(dfs,axis=0,ignore_index=True)
    return df


SyntaxError: invalid syntax (<ipython-input-9-025f12e19a43>, line 9)

In [None]:
key_list = list(result[0].keys())

In [100]:
agent.driver.execute_script("window.open()")
new_window = agent.driver.window_handles.last
agent.driver.switch_to.window(new_window)

AttributeError: 'list' object has no attribute 'last'

In [116]:
actions = ActionChains(agent.driver)
actions.key_down(Keys.COMMAND)
actions.click(page_link_btns[0])
actions.perform()


In [108]:
from selenium.webdriver.common.keys import Keys

In [None]:
class MAILER:
	def __init__(self):
		self.util = UTIL(version=ENV,service_name=SEAVICE_NAME)

		
		# read config and set
		self.util.read_config(MAIL_CONFIG)
		self.from_addr = self.util.config['FROM_ADDRESS']
		self.url_smtp =  self.util.config['URL_SMTP']
		self.port_smtp = self.util.config['PORT_SMTP']
		self.addr_login = self.util.config['LOGIN_ADDRESS']
		self.pw_login = self.util.config['LOGIN_PW']
		self.service_name = self.util.service_name
		# self.obj_msg = {}

	def read_ls_to_address(self):
		try:
			self.ls_to_addr = []
			with open(MAIL_TO_LIST,encoding = 'utf-8') as f:
				for _line  in csv.reader(f):
					for _address in _line : 
						self.ls_to_addr.append(_address)
						self.util.logger.info('[OK]Add send email list: %s',_address)
			self.util.logger.info('[OK]Get all address list')

		except Exception as e:
			self.util.logger.warning('[NG]Cannot read .csv of TO ADDR LIST:',e)
			return False

		return True




	def set_smtp_obj(self):
		try:
			self.obj_smtp = smtplib.SMTP(self.url_smtp, self.port_smtp)
			self.obj_smtp.ehlo()
			self.obj_smtp.starttls()
			self.obj_smtp.ehlo()
			self.obj_smtp.login(self.addr_login, self.pw_login)
			self.util.logger.info('[OK]Get smtp object')
		except Exception as e:
			self.util.logger.warning('[OK]Cannot get smtp object:',e)
			return False
		return True


	def exec_send(self):

		if not self.obj_smtp is None:
			try:
				self.util.logger.info('[・]Mail is sending for %s',str(self.ls_to_addr))
				self.obj_smtp.sendmail(self.from_addr, self.ls_to_addr, self.obj_msg.as_string())
				self.util.logger.info('[OK]Mail is sent')
				self.obj_smtp.close()
				return True
			except Exception as e:
				self.util.logger.warning('[NG]Cannot send mail: %s',e)		
				return False
		else:
			self.util.logger.warning('[NG]No smtp obj')	
			return False


	def make_content(self,subject, body):
		try:
			self.obj_msg = MIMEMultipart('alternative')
			part = MIMEText(self.finalize_html(body),'html')
			self.obj_msg['Subject'] = subject
			self.obj_msg['From']= self.from_addr
			self.obj_msg['To']	= ", ".join(self.ls_to_addr)
			self.obj_msg['Date']= formatdate()
			self.obj_msg.attach(part)
			self.util.logger.info('[OK]Get mail contents object')
		except Exception as e:
			self.util.logger.warning('[NG]Cannot make mail: %s',e)	
			return False

		return True

	def finalize_html(self,body):
		# add credit
		body += "<br>"
		body += "SENT BY "+self.service_name
		body += "<br>"
		return body	


In [2]:





class NIKKEI:
	def __init__(self):
		self.agent = ACCESS()
		self.util = UTIL(version=ENV,service_name=SEAVICE_NAME)
		self.util.read_config(ELICA_CONFIG)
		self.url_login = 'https://www.nikkei.com/login'
		self.url_logout= 'https://regist.nikkei.com/ds/etc/accounts/logout'
		self.util.logger.info('[OK]NIKKEI INIT')


	def get_edition_paper(self,obj_dt=None,type='en'):
		# set dt
		if obj_dt is None:
			obj_dt = self.util.dt_init

		# set edition
		str_edition_paper=  "morning" if obj_dt.hour < 15 else "evening"

		# translate
		if type=='ja':
			dct_en_ja = {'morning':'朝','evening':'夕'}
			str_edition_paper = dct_en_ja[str_edition_paper]

		return str_edition_paper

	def get_paper_url(self,obj_dt=None):

		# set dt
		if obj_dt is None:
			obj_dt = self.util.dt_init

		# construct url
		url_paper = 'https://www.nikkei.com/paper/'+self.get_edition_paper(obj_dt)+'/?b='+"{0:%Y%m%d}".format(obj_dt)+'&d=0' 

		return url_paper

	def get_contents_in_paper(self):

		# get session
		self.logout()
		self.login()

		# get paper obj 
		is_obj = self.agent.to_page(self.get_paper_url(),**{'waittime':30})

		# make subject
		str_edition = self.get_edition_paper(obj_dt=None,type='ja')
		str_subject = "{0:%Y%m%d}".format(self.util.dt_init)+'日経紙面('+str_edition+'刊)一覧'

		if is_obj:

			self.util.logger.info('[・]Scraping Web page..')
			obj_content = self.scrape_paper()

			# discard session
			self.logout()
			self.util.logger.info('[OK]Get content from web pagesucessfly')
			self.agent.del_driver()

		else:
			self.util.logger.warning('[NG]Access is failed : May be not available today?')
			str_subject = '[NG]'+str_subject
			obj_content = None

		return  str_subject, obj_content

	def login(self):
		self.agent.get_login_page(self.url_login, allow_revisit=True, **{'waittime':10,'timeout_sec':120}	)
		self.agent.driver.find_element_by_id('LA7010Form01:LA7010Email').send_keys(self.util.config['ID'])
		self.agent.driver.find_element_by_id('LA7010Form01:LA7010Password').send_keys(self.util.config['PW'])
		self.agent.driver.find_element_by_class_name("btnM1").click();
		return 

	def logout(self):
		self.agent.get_logout_page(self.url_logout, allow_revisit=True, **{'waittime':10,'timeout_sec':120} )
		return 	

	def scrape_paper(self):
		ls_body = []
		try:
			ls_el_section = self.agent.driver.find_elements_by_css_selector('.cmn-section')

			for el_section in ls_el_section:
				# title
				try:
					section_title = el_section.find_element_by_css_selector('.cmnc-title').text
					if (section_title != '')&(section_title!='短信'): 
						# print("<"+section_title+">") 
						# body += ('<br>'+"＝＝"+section_title+"＝＝"+'<br>')
						dct_section = {}
						dct_section['section_name'] = section_title
						self.util.logger.info('[OK] %s',section_title)
						dct_section['ls_top_news'] = []

						#top news
						try:
							ls_el_article = el_section.find_elements_by_css_selector('.cmn-top_news')
							for el_article in  ls_el_article:
								try:
									ls_el_topic = el_article.find_elements_by_css_selector('.cmn-article_title')
									for el_topic in  ls_el_topic :
										topic_text =  el_topic.text.replace('<br>','')
										topic_text=  topic_text.strip()#空白削除

										dct_section['ls_top_news'].append(topic_text)
										self.util.logger.info('[OK] %s > (L)%s',section_title,topic_text)								
								except Exception as e:
									self.util.logger.warning('[NG] %s > (L)???',section_title,e)
									continue
						except Exception as e:
							self.util.logger.warning('[NG] %s : (L)No top news: %s ',section_title,e)

						#article
						dct_section['ls_article'] = []
						try:
							ls_el_article = el_section.find_elements_by_css_selector('.cmn-article_list')
							for el_article in  ls_el_article:
								try:
									ls_el_topic = el_article.find_elements_by_css_selector('.cmn-article_title')
									ls_small_topic = []
									for el_topic in ls_el_topic:

										#テキスト取得	
										topic_text = el_topic.text.replace('<br>','')
										topic_text=  topic_text.strip()#空白削除

										#すでにTOPnewsにある
										if topic_text in dct_section['ls_top_news']:
											self.util.logger.info('[・] %s > %s is already added',section_title,topic_text)										
											continue

										#すでに articleにある(small list外して)
										if topic_text in dct_section['ls_article']:
											self.util.logger.info('[・] %s > %s is already added',section_title,topic_text)										
											continue											

										#配置先チェック
										try:
											#小規模リスト
											if el_topic.tag_name == 'h5':

												#すでにls_small_topicある
												if topic_text in ls_small_topic:
													self.util.logger.info('[・] %s > > %s is already added',section_title,topic_text)										
												else:
													ls_small_topic.append(topic_text)
													self.util.logger.info('[OK] %s >  > %s',section_title,topic_text)	
											else:
												#直前までを挿入
												if len(ls_small_topic) > 0:
													dct_section['ls_article'].append(ls_small_topic)
												ls_small_topic = []

												#当該記事入れる
												dct_section['ls_article'].append(topic_text)
												self.util.logger.info('[OK] %s > %s',section_title,topic_text)
										except Exception as e:
											self.util.logger.warning('[NG] %s > %s (cannot insert): %s',section_title,topic_text,e)
											continue										

									#残存を挿入
									if len(ls_small_topic) > 0:
										dct_section['ls_article'].append(ls_small_topic)														
										
								except Exception as e:
									self.util.logger.warning('[NG] %s > ???(topic): %s',section_title,e)
									continue
						except Exception as e:
							self.util.logger.warning('[NG] %s > ???(topic list): %s',section_title,e)

						finally:
							ls_body.append(dct_section)

					else:
						self.util.logger.warning('[・] ???(section title is null or excluded * do it on purpose *)')

				except Exception as e:
					self.util.logger.warning('[NG] ???(No section title): %s',e)
					continue
		except Exception as e:
			self.util.logger.warning('[NG] ???(Cannot find sections): %s',e)

		return ls_body		


	@staticmethod
	def make_html(obj_content):
		body =''
		if not obj_content is None:
			for obj_section in obj_content:
				if isinstance(obj_section['section_name'],str):
					body += '=== {0} ===<br>'.format(obj_section['section_name']) 

				for obj_top_news in obj_section['ls_top_news']:
					if isinstance(obj_top_news,str):
						body += '　★ {0}<br>'.format(obj_top_news)
				for obj_article in obj_section['ls_article']:
					if isinstance(obj_article,str):
						body += '　・{0}<br>'.format(obj_article)
					else:
						for obj_small_article in obj_article:
							body += '　　・{0}<br>'.format(obj_small_article)
				body += '<br>'

		return body

class BLOOMBERG:
	def __init__(self):
		self.agent = ACCESS()
		self.util = UTIL(version=ENV,service_name=SEAVICE_NAME)
		self.util.read_config(ELICA_CONFIG)
		self.util.logger.info('[OK]BLOOMBERG INIT')

	def get_contents(self):

		# get paper obj 
		is_obj = self.agent.to_page('https://www.bloomberg.co.jp/',**{'waittime':5})

		# make subject		
		str_subject = "{0:%Y%m%d %H%M}".format(self.util.dt_init)+'Bloomberg一覧'

		if is_obj:
			self.util.logger.info('[・]Scraping Web page..')
			obj_content = self.scrape_paper()


			str_subject = "{0:%Y%m%d}".format(self.util.dt_init)+'Bloomberg('+"{0:%H:%M}".format(self.util.dt_init)+')一覧'

			# discard session
			self.util.logger.info('[OK]Get content from web pagesucessfly')
			self.agent.del_driver()
		else:
			self.util.logger.warning('[NG]Access is failed : May be not available')
			str_subject = '[NG]'+str_subject
			obj_content = None


		return  str_subject, obj_content


	def scrape_paper(self):
		ls_body = []
		try:
			self.agent.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
			self.agent.stop_driver(30)
			self.agent.driver.execute_script("window.scrollTo(0, 0);")
			self.agent.stop_driver(1)
			self.agent.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")		
			self.agent.stop_driver(30)
			self.agent.driver.execute_script("window.scrollTo(0, 0);")
			self.agent.stop_driver(1)

		except Exception as e:
			self.util.logger.warning('[NG] Driver scroll error: %s',e)
			return ls_body
		try:

			#hub-lazy-zones も含まれる
			ls_el_large_section = self.agent.driver.find_elements_by_css_selector('section.hub-zone-righty__content')
			self.util.logger.info('[・] Driver list up large %d sections',len(ls_el_large_section))
			# ls_el_section = self.agent.driver.find_elements_by_css_selector('article.mod-story')

			##### TOP ####
			el_large_section = ls_el_large_section[0]
			ls_el_section = el_large_section.find_elements_by_css_selector('section')
			ls_el_section = [ el_section for el_section in ls_el_section if el_section.get_attribute('class') in ['hero-module','story-list-module','section-front-header-module']]
			self.util.logger.info('[・] Driver list up %d top newses',len(ls_el_section))

			dct_section = {}
			dct_section['section_name'] = "TOP"
			ls_el_article = ls_el_section[0].find_elements_by_css_selector('article.mod-story')
			dct_section['ls_article'] = [ el_article.text for el_article in ls_el_article ]
			ls_body.append(dct_section)
			self.util.logger.info('[・] Driver get : %s',dct_section['section_name'])

			dct_section = {}
			dct_section['section_name'] = "Second"
			ls_el_article = ls_el_section[1].find_elements_by_css_selector('article.mod-story')
			dct_section['ls_article'] = [ el_article.text for el_article in ls_el_article ]
			ls_body.append(dct_section)
			self.util.logger.info('[・] Driver get : %s',dct_section['section_name'])

			dct_section = {}
			dct_section['section_name'] = "Topics"
			ls_el_article = ls_el_section[1].find_elements_by_css_selector('article.mod-story')
			dct_section['ls_article'] = [ el_article.text for el_article in ls_el_article ]
			ls_body.append(dct_section)
			self.util.logger.info('[・] Driver get : %s',dct_section['section_name'])

			##### SECOND ####
			el_large_section = ls_el_large_section[1]
			ls_el_section = el_large_section.find_elements_by_css_selector('section')
			ls_el_section = [ el_section for el_section in ls_el_section if el_section.get_attribute('class') in ['story-list-module','section-front-header-module']]
			self.util.logger.info('[・] Driver list up %d second newses',len(ls_el_section))

			dct_section = {}
			dct_section['section_name'] = ls_el_section[0].text
			ls_el_article = ls_el_section[1].find_elements_by_css_selector('article.mod-story')
			dct_section['ls_article'] = [ el_article.text for el_article in ls_el_article ]
			ls_body.append(dct_section)
			self.util.logger.info('[・] Driver get : %s',dct_section['section_name'])

			##### Small grids ####
			el_large_section = ls_el_large_section[2]
			ls_el_section = el_large_section.find_elements_by_css_selector('section')
			ls_el_section = [ el_section for el_section in ls_el_section if el_section.get_attribute('class') in ['grid-module']]
			self.util.logger.info('[・] Driver list up %d small newses',len(ls_el_section))

			for el_section in ls_el_section:
				
				dct_section = {}
				dct_section['section_name'] = el_section.find_element_by_css_selector('.grid-module__title').text
				ls_el_article = el_section.find_elements_by_css_selector('article.mod-story')
				dct_section['ls_article'] = [ el_article.text for el_article in ls_el_article ]
				ls_body.append(dct_section)
				self.util.logger.info('[・] Driver get : %s',dct_section['section_name'])

			self.util.logger.info('[OK] Driver get ALL expected contents')

		except Exception as e:
			self.util.logger.warning('[NG] Driver occur some error: %s',e)
			return ls_body


		return ls_body		


	@staticmethod
	def make_html(obj_content):
		body =''
		if not obj_content is None:
			for obj_section in obj_content:
				body += '=== {0} ===<br>'.format(obj_section['section_name']) 

				for obj_article in obj_section['ls_article']:
					body += '　・{0}<br>'.format(obj_article)

				body += '<br>'

		return body



def get_nikkei_and_mail():
	obj_nikkei = NIKKEI()

	str_subject,obj_content = obj_nikkei.get_contents_in_paper()
	# # save_pickle(obj_content,'content.pickle')
	# obj_content = read_pickle('content.pickle')
	body_html = obj_nikkei.make_html(obj_content)
	# save_pickle(body_html,'body.pickle')
	# str_subject = 'test'

	obj_mailer = MAILER()
	obj_mailer.set_smtp_obj()
	obj_mailer.read_ls_to_address()
	obj_mailer.make_content(str_subject,body_html)
	obj_mailer.exec_send()
	return 

def get_blbrg_and_mail():
	obj_blbrg = BLOOMBERG()

	str_subject,obj_content = obj_blbrg.get_contents()
	# # save_pickle(obj_content,'content.pickle')
	# obj_content = read_pickle('content.pickle')
	body_html = obj_blbrg.make_html(obj_content)
	# save_pickle(body_html,'body.pickle')
	# str_subject = 'test'

	obj_mailer = MAILER()
	obj_mailer.set_smtp_obj()
	obj_mailer.read_ls_to_address()
	obj_mailer.make_content(str_subject,body_html)
	obj_mailer.exec_send()
	return 

# def make_newspicks_mail():
# 	body = add_system_credit(newspicks_access())
# 	time_p_str =  "朝" if now.time().hour < 15 else "夕"
# 	subject = "{0:%Y%m%d}".format(now)+'Newspicks('+time_p_str+')一覧'
# 	return (body,subject)

def save_pickle(obj,name):
	# name = folder + name
	with open(name,'wb') as f:
		pickle.dump(obj,f)

def read_pickle(name):
	# name = folder + name
	with open(name,'rb') as f:
		obj = pickle.load(f)
	return obj



if __name__ == '__main__':
	get_nikkei_and_mail()
	get_blbrg_and_mail()
	# (body,subject) = make_nikkei_mail()
	# send_action(subject,body)

	# (body,subject) = make_newspicks_mail()
	# send_action(subject,body)


NameError: name 'UTIL' is not defined