# Updating data - Kicktraq Web Crawling
-------------

## Setting
------------------
crawling 작업을 위해 아래의 python 라이브러리를 import한다. 여러 파이썬의 크롤링 라이브러리 중에 이 프로젝트는 'Selenium'을 사용한다.

In [None]:
import os
import re
import time
import datetime
import pandas as pd
from datetime import datetime
from selenium import webdriver as wd
from selenium.webdriver.chrome.options import Options
import SW4DS_django.database.db as dbt

데이터프레임을 통한 전처리 작업을 준비하고 Chrome driver를 지정하기 위해 option을 조정한다. 이 때 브라우저를 켜지 않고 자동화를 하기 위해 chrome_options.add_argument("--headless")를 추가하였다.

In [None]:
# Options
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

chrome_options = Options()
chrome_options.add_argument("--headless")

path = os.getcwd()

환율 전처리를 위해 사용할 변수를 정의한다.

In [None]:
# 2019-11-26 currency rate
currency_rate = {'AED': 3.6732, 'AFN': 78.813395, 'ALL': 111.387139, 'AMD': 477.766294, 'ANG': 1.71589, 'AOA': 475.1455, 
                 'ARS': 59.656064, 'AUD': 1.474668, 'AWG': 1.8, 'AZN': 1.7025, 'BAM': 1.77655, 'BBD': 2, 'BDT': 84.841984, 
                 'BGN': 1.7763, 'BHD': 0.377006, 'BIF': 1876.459218, 'BMD': 1, 'BND': 1.365996, 'BOB': 6.91879, 'BRL': 4.2272, 
                 'BSD': 1, 'BTC': 0.000138221454, 'BTN': 71.753547, 'BWP': 10.875574, 'BYN': 2.065464, 'BZD': 2.016777, 
                 'CAD': 1.331, 'CDF': 1665.879771, 'CHF': 0.99641, 'CLF': 0.024, 'CLP': 828.799556, 'CNH': 7.026137, 
                 'CNY': 7.0313, 'COP': 3426.198207, 'CRC': 572.689305, 'CUC': 1, 'CUP': 25.75, 'CVE': 100.375, 'CZK': 23.1504, 
                 'DJF': 178, 'DKK': 6.784255, 'DOP': 52.833199, 'DZD': 120.168016, 'EGP': 16.1341, 'ERN': 14.999746, 
                 'ETB': 30.230392, 'EUR': 0.907964, 'FJD': 2.1916, 'FKP': 0.775434, 'GBP': 0.775434, 'GEL': 2.975, 
                 'GGP': 0.775434, 'GHS': 5.573092, 'GIP': 0.775434, 'GMD': 51.1, 'GNF': 9535.428748, 'GTQ': 7.701775, 
                 'GYD': 208.708126, 'HKD': 7.82689, 'HNL': 24.633271, 'HRK': 6.752405, 'HTG': 97.302351, 'HUF': 305.07, 
                 'IDR': 14086.1, 'ILS': 3.46555, 'IMP': 0.775434, 'INR': 71.672507, 'IQD': 1194.465451, 'IRR': 42105, 
                 'ISK': 122.939982, 'JEP': 0.775434, 'JMD': 140.55937, 'JOD': 0.709, 'JPY': 108.982, 'KES': 102.1, 
                 'KGS': 69.670021, 'KHR': 4060.068987, 'KMF': 447.349843, 'KPW': 900, 'KRW': 1175.51, 'KWD': 0.303695, 
                 'KYD': 0.833757, 'KZT': 386.385911, 'LAK': 8865.709957, 'LBP': 1513.018384, 'LKR': 181.049492, 'LRD': 193.049961, 
                 'LSL': 14.727118, 'LYD': 1.408076, 'MAD': 9.659874, 'MDL': 17.40961, 'MGA': 3678.461869, 'MKD': 55.885332, 
                 'MMK': 1515.808652, 'MNT': 2688.420135, 'MOP': 8.066072, 'MRO': 357, 'MRU': 37.499262, 'MUR': 36.602138, 
                 'MVR': 15.4, 'MWK': 737.434911, 'MXN': 19.4476, 'MYR': 4.1805, 'MZN': 64.011999, 'NAD': 14.727319, 'NGN': 362.7, 
                 'NIO': 33.752513, 'NOK': 9.1757, 'NPR': 114.791337, 'NZD': 1.557297, 'OMR': 0.384985, 'PAB': 1, 'PEN': 3.390362, 
                 'PGK': 3.405952, 'PHP': 50.792825, 'PKR': 156.283679, 'PLN': 3.902061, 'PYG': 6468.632579, 'QAR': 3.643061, 
                 'RON': 4.3331, 'RSD': 106.775, 'RUB': 63.9287, 'RWF': 933.614923, 'SAR': 3.750187, 'SBD': 8.267992, 
                 'SCR': 13.699476, 'SDG': 45.136225, 'SEK': 9.63424, 'SGD': 1.36523, 'SHP': 0.775434, 'SLL': 7438.043346, 
                 'SOS': 578.775762, 'SRD': 7.458, 'SSP': 130.26, 'STD': 21560.79, 'STN': 22.4, 'SVC': 8.754496, 'SYP': 514.995156, 
                 'SZL': 14.722213, 'THB': 30.235, 'TJS': 9.692844, 'TMT': 3.5, 'TND': 2.846, 'TOP': 2.320065, 'TRY': 5.74277, 
                 'TTD': 6.760437, 'TWD': 30.505052, 'TZS': 2302.207123, 'UAH': 24.057369, 'UGX': 3702.021093, 'USD': 1, 
                 'UYU': 37.785223, 'UZS': 9515.331052, 'VEF': 248487.642241, 'VES': 22704, 'VND': 23201.13598, 'VUV': 116.36648, 
                 'WST': 2.641719, 'XAF': 595.585478, 'XAG': 0.05927682, 'XAU': 0.00068749, 'XCD': 2.70255, 'XDR': 0.728233, 
                 'XOF': 595.585478, 'XPD': 0.00055444, 'XPF': 108.348951, 'XPT': 0.0011136, 'YER': 250.349961, 'ZAR': 14.76184, 
                 'ZMW': 14.437728, 'ZWL': 322.000001}


success_currency = {'AU$': 'AUD', 'CA$': 'CAD', 'HK$': 'HKD', 'MX$': 'MXN', 'NZ$': 'NZD', 'US$': 'USD', 'S$': 'SGD'}

unsuccess_currency = {'AU':'AUD', 'Australia': 'AUD', 'Canada': 'CAD', 'Hong Kong': 'HKD', 'Mexico': 'MXN', 'NZ': 'NZD', 'Singapore': 'SGD'}

currfix = ['£','€','CHF','¥']

curr_change = {'£':'GBP', '€':'EUR', 'CHF':'CHF', '¥':'JPY'}

프로젝트 시작과 종료 날짜를 전처리 하기 위한 변수를 정의한다.

In [None]:
month_str = ['January', 'Feburary', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
month_int = [1,2,3,4,5,6,7,8,9,10,11,12]
month_dict = dict(zip(month_str, month_int))

Crawling을 통해 받아온 정보를 저장할 dataframe을 blurb_df로 정의한다.

In [None]:
# DataFrame for result
blurb_df = pd.DataFrame(columns = ['collected_date','name', 'blurb', 'state', 'category',
                                   'funding_rate', 'pledged', 'goal','currency_type', 'usd_pledged_real', 'usd_goal_real',
                                   'launched', 'deadline', 'term','term_bin', 'usd_goal_real_bin'])

## Explanation about Classes
------------------
### 1. KicktraqOpen
이 클래스는 Kicktraq 홈페이지를 Chrome driver를 통해 여는 역할을 한다.  
객체 생성 시 url을 parameter로 넣으면 해당 url로 Chrome driver를 연결해준다.

In [8]:
class KicktraqOpen:
    def __init__(self, url):
        self.url = url
        self.driver = wd.Chrome(path + '/chromedriver', options=chrome_options)
        self.driver.get(self.url)

### 2. WebcrawlClean(KicktraqOpen)
이 클래스는 Kicktraq에서 받아올 정보를 전처리하는 역할을 하며, 6개의 함수로 이루어져 있다.  

**1. clean_amount(amount)**  
목표 금액과 펀딩 받은 금액에 대한 정보는 숫자들 사이에 콤마(,)를 기준으로 나누어져있다. 밑에 있는 결과 테이블을 잠시 확인하면 pledged와 goal 컬럼의 숫자 사이에 콤마(,)가 존재함을 확인할 수 있다. 이러한 데이터를 전처리하여 오로지 숫자값을 갖도록 변형시켜준다.

In [None]:
blurb_df = pd.read_csv('blurb_df.csv')
blurb_df = blurb_df.drop(['Unnamed: 0'], axis = 1)

In [13]:
blurb_df.head()

Unnamed: 0,name,blurb,state,category,funding_rate,pledged,goal,launched,deadline
0,"STU, MY NAME IS STU - Season 2",Follow aspiring actor Stu's quirky life in the...,success,Film & Video,120,"£18,141","£15,000",2019 October 16th,2019 November 15th
1,Corgi Horoscope Cosplay Pins,"Tori's gone wild with her horoscope cosplays, ...",success,Art,352,"$17,610","$5,000",2019 October 16th,2019 November 15th
2,Sweet Snakes - Enamel Pin Series!,A decadent assortment of silly snakes and cakes!,success,Art,378,"$1,134",$300,2019 October 16th,2019 November 15th
3,Super Portable Towel 2.0 - Towel Made For Travel,Fits perfectly in your bag and your travel pla...,success,Design,666,"$33,342","$5,000",2019 October 16th,2019 November 15th
4,NIGHT OF THE MULLETS the Trade Paperback!,"For the first time in print, the collected 5-i...",success,Comics,116,"$8,164","$7,000",2019 October 16th,2019 November 15th


In [None]:
def clean_amount(self, amount):
    # take out ,
    clamount = amount.replace(',', '')
    result = re.findall('\d+', clamount)[0]
    result = int(result)
    return result

**2. conv_amount(amount, curr_t)**  
크롤링을 통해 받은 데이터에 저장된 화폐이름을 사용해서 USD로 환전해준다.

In [None]:
def conv_amount(self, amount, curr_t):
    # usd_pledged_real
    # usd_goal_real
    check_start = curr_t.split(' ')
    if len(check_start) == 2:
        curr_t = check_start[0]

    if curr_t != "USD" and curr_t != "USD (*)":
        curr_r = currency_rate[curr_t]
        new_amount = amount/curr_r
    else:
        new_amount = amount

    new_amount = round(new_amount,2)
    return new_amount

**3. get_amount_bin(amountx)**  
목표 금액에 대해 기존 데이터 형식과 똑같이 전처리 해주는 함수이다. 목표 금액을 해당하는 구간에 따라 binning해준다.

In [None]:
def get_amount_bin(self, amountx):
    # usd_goal_real_bin
    goal_bin = (lambda x: '1' if x <= 500 else '2' if x <= 1000 else '3' if x <= 3000 else '4' if x <= 5000 else '5' if x <= 10000 else '6' if x <= 50000 else '7' if x <= 100000 else '8')(amountx)
    return goal_bin

**4. conv_dt(datex)**  
펀딩 시작 및 종료 날짜에 대해 기존 데이터 형식과 똑같이 전처리 해주는 함수이다. 날짜 형식을 YYYY-MM-DD로 바꿔준다.

In [None]:
def conv_dt(self, datex):
    # converting date to YYYY-MM-DD format
    fmt = datex[-2:]
    convdtime = datetime.strptime(datex, '%Y %B %d' + fmt)
    convstime = convdtime.strftime("%Y-%m-%d")
    return convstime

**5. get_term(start, end)**  
펀딩 시작 날짜와 종료 날짜 사이의 기간을 구해준다.

In [None]:
def get_term(self, start, end):
    startdate = datetime.strptime(start, "%Y-%m-%d").date()
    enddate = datetime.strptime(end, "%Y-%m-%d").date()
    diffdays = (enddate - startdate).days
    return diffdays

**6. get_term_bin(term)**  
앞에서 구한 펀딩 기간에 대해 기존 데이터 형식과 똑같이 전처리 해주는 함수이다. 펀딩 기간을 해당하는 구간에 따라 binning해준다.

In [None]:
def get_term_bin(self, term):
    term_bin = (lambda x: '1' if x <= 10 else '2' if x <= 15 else '3' if x <= 21 else '4' if x <= 30 else '5' if x <= 45 else '6' if x <= 60 else '7')(term)
    return term_bin

### 3. KicktraqPage(WebcrawlClean)
이 클래스는 Kicktraq에서 정보를 받아오는 역할을 하며, 5개의 함수로 이루어져있다.   
위에서 언급한 바와 같이 성공한 프로젝트는 Day-1 Projects 카테고리에서, 실패한 프로젝트는 Archived Projects 카테고리에서 크롤링 할 것이다.

해당 클래스에 대한 객체를 만들면 생성자에 따라 kicktraq 홈페이지로 chrome driver가 열리게 된다. 

In [None]:
def __init__(self):
    super().__init__("https://www.kicktraq.com/projects/")

**1. getdayone()**  
성공한 프로젝트를 크롤링하기 위해 Day-1 Projects가 담긴 페이지의 링크를 저장해둔다.

In [None]:
def getdayone(self):
    self.dayonepage = "https://www.kicktraq.com/dayones/"
    return self.dayonepage

**2. getarchive()**  
실패한 프로젝트를 크롤링하기 위해 Archived Projects가 담긴 페이지의 링크를 저장해둔다.

In [None]:
def getarchive(self):
    self.archivepage = "https://www.kicktraq.com/archive/"
    return self.archivepage

**3. get_currloc_type(currency_synm, detail_url)**  
크롤링 시 화폐 단위 dollar에 대한 추가적인 작업을 수행하는 함수이다. HK dollar와 US dollar는 서로 다른 것임에도 불구하고 모두 똑같은 dollar sign으로 보여지고 있다. 따라서 그 $가 어느 나라의 화폐 단위인지를 확인하여 해당 나라의 화폐 이름과 나라 이름을 반환해준다.

In [None]:
def get_curloc_type(self, currency_symb, detail_url):
    global currency_type
    global country_name

    self.currency_symb = currency_symb
    self.detail_url = detail_url

    print("opening 2nd driver")
    self.driverx = wd.Chrome(path + '/chromedriver', options=chrome_options)
    self.driverx.get(self.detail_url)

    content = self.driverx.find_element_by_xpath("//div[@id='project-info-text']")
    content_lst = content.text.split('\n')

    funding_lst = [s for s in content_lst if "Funding:" in s]

    # TEMPORARY
    if len(funding_lst) == 0:
        funding_lst = [s for s in content_lst if "Funded:" in s]

    funding_goal = funding_lst[0].split(" of ")[1]

    # if the currency is in 'kr', get the country currency
    if currency_symb.isalpha():
        if currency_symb.lower() == 'k':
            currency_symb = funding_goal.split(" ")[1]  # get country for `kr` currency
        else:
            currency_symb = '$'


    ########################################
    # validate country for all currency
    ########################################
    content = self.driverx.find_element_by_id('button-backthis')
    prj_addr = content.get_attribute("href")
    print("opening 3rd driver")
    self.new_driver = wd.Chrome(path + '/chromedriver', options=chrome_options)
    self.new_driver.get(prj_addr)

    ########################
    # KICKSTARTER page
    ########################

    # check if content exists
    # check_content > 0 : project DOES NOT exist
    # check_content = 0 : project exist
    check_content = self.new_driver.find_elements_by_xpath("//div[@id='hidden_project']")

    if len(check_content) != 0:
        # project DOES NOT exist
        if currency_symb == '$':
            currency_type = 'USD (*)'
            country_name = ''
        elif currency_symb in cur_loc:
            currency_type = currency_symb + ' (*)'
            country_name = ''
    else:
        # project exist

        # check if project is canceled
        # prj_status > 0 : project CANCELED
        # prj_status = 0 : project ongoing
        prj_status = self.new_driver.find_elements_by_xpath("//div[(@class='normal type-18')]")

        if len(prj_status) != 0:
            # project canceled

            # location
            loc = self.new_driver.find_elements_by_xpath("//div[@class='py2 py3-lg flex items-center auto-scroll-x']/a['nowrap navy-700 flex items-center medium mr3 type-12 keyboard-focusable']/span[@class='ml1']")
            locx = len(loc) - 1
            location = loc[locx].text
            region = location.split(',')[1].strip()

            if region == 'AU':
                region = 'Australia'
            elif region in us_states:
                region = 'US'

            if region in us_states:
                region_cur = '$'
            elif region in country_cursym:
                region_cur = country_cursym[region]
            else:
                region_cur = '€'

            if currency_symb != region_cur:
                # if 2nd page currency and 3rd page is different
                if currency_symb == '$':  # assuming as USD as we do not know the country for $
                    currency_type = 'USD'
                else:
                    currency_type = curr_change[currency_symb]
            else:
                # if 2nd page currency and 3rd page is same
                if region in country_list:
                    currency_type = country_cursign[region]
                else:
                    currency_type = 'EUR'

            country_name = region

        else:
            # project is ongoing or successful

            # check if project is ongoing
            # check if project is ending soon
            prj_ongoing = self.new_driver.find_elements_by_xpath("//div[@class='ml5 ml0-lg']/div/div/span[@class='block type-16 type-28-md bold dark-grey-500']")
            prj_end_soon = self.new_driver.find_elements_by_xpath("//div[@class='ml5 ml0-lg']/div/div/span[@class='block type-16 type-28-md bold red-400']")

            # prj_ongoing  > 0 : project is running
            # prj_end_sonn > 0 : project is ending soon
            if len(prj_ongoing) != 0 or len(prj_end_soon) != 0:
                # ongoing project
                loc = self.new_driver.find_elements_by_xpath("//div[@class='py2 py3-lg flex items-center auto-scroll-x']/a['nowrap navy-700 flex items-center medium mr3 type-12 keyboard-focusable']/span[@class='ml1']")
                locx = len(loc) - 1
                location = loc[locx].text
                region = location.split(',')[1].strip()

                if region == 'AU':
                    region = 'Australia'
                elif region in us_states:
                    region = 'US'

                if region in us_states:
                    region_cur = '$'
                elif region in country_cursym:
                    region_cur = country_cursym[region]
                else:
                    region_cur = '€'

                if currency_symb != region_cur:
                    # if 2nd page currency and 3rd page is different
                    if currency_symb == '$':  # assuming as USD as we do not know the country for $
                        currency_type = 'USD'
                    else:
                        currency_type = curr_change[currency_symb]
                else:
                    # if 2nd page currency and 3rd page is same
                    if region in country_list:
                        currency_type = country_cursign[region]
                    else:
                        currency_type = 'EUR'

                country_name = region
            else:
                # project ended
                get_money = self.new_driver.find_element_by_xpath("//div[@class='mb3']/h3[@class='mb0']/span[@class='money']").text
                loc = self.new_driver.find_elements_by_xpath("//div[@class='NS_projects__category_location ratio-16-9 flex items-center']/a[@class='grey-dark mr3 nowrap type-12']")
                location = loc[0].text
                region = location.split(',')[1].strip()

                if region == 'AU':
                    region = 'Australia'
                elif region in us_states:
                    region = 'US'

                # get the first currency sign of money
                first_check = get_money[0]

                if first_check != '$':
                    # will either be euro, pound, sek, nok, dkk, ....
                    if first_check not in ['£', '€', '¥']:
                        second_check = get_money.split(' ')
                        second_check_sign = second_check[0]
                        currency_type = success_currency[second_check_sign]
                    else:
                        currency_type = curr_change[first_check]

                else:
                    currency_type = 'USD'

                country_name = region

        self.new_driver.close()
    res_dict = {'currency_type': currency_type, 'country': country_name}
    self.driverx.close()
    return res_dict

**4. getInfoSuccess(text, attrurl)**  
성공한 프로젝트들의 세부적인 정보(후원자 수, 펀딩 받은 금액, 목표 금액, 펀딩 시작 및 종료 날짜)를 가져온다.

![example](https://user-images.githubusercontent.com/31986977/70234469-10abee80-17a4-11ea-85ff-b78ee5727cc9.png)
위의 사진에서 보이는 바와 같이 우측 하단에 프로젝트에 대한 세부적인 정보가 있음을 확인할 수 있다. 이 때 년도의 경우 종료 시점 뒤에만 명시되어있다. 따라서 시작 시점의 년도는 다음과 같은 로직으로 정한다.  
예를 들어 종료 시점의 월이 시작 시점의 월보다 크거나 같다면, 즉 8월에 시작해서 10월에 끝난다면 시작 시점과 종료 시점은 같은 년도에 속하게 된다. 하지만 종료 시점의 월이 시작 시점의 월보다 작다면, 즉 11월에 시작해서 1월에 끝난다면 시작 시점의 년도는 종료 시점의 년도보다 하나 작게 될 것이다.

In [None]:
def getInfoSuccess(self, text, attrurl):
    backers = int(text.split("\n")[0].split(": ")[1])
    pledged = text.split('\n')[1].split(': ')[1].split(' of ')[0]
    goal = text.split('\n')[1].split(': ')[1].split(' of ')[1].split(' (')[0]
    year = text.split('\n')[3].split('(')[1][:-1]
    launched = text.split('\n')[3].split(": ")[1].split(' -> ')[0]
    deadline = text.split('\n')[3].split(": ")[1].split(' -> ')[1].split(' (')[0]

    month_start = month_dict.get(text.split('\n')[3].split(": ")[1].split(' -> ')[0].split(" ")[0])
    month_end = month_dict.get(text.split('\n')[3].split(": ")[1].split(' -> ')[1].split(' (')[0].split(" ")[0])

    if (month_end < month_start):
        start_year = str(int(year) - 1)
    else:
        start_year = year

    launched = start_year + ' ' + launched
    deadline = year + ' ' + deadline

    currency_t = goal[0]
    print("Currency at page 2:", currency_t)
    cur_loc_info = self.get_curloc_type(currency_t, attrurl)

    pledged = self.clean_amount(pledged)
    goal = self.clean_amount(goal)
    usd_pledged = self.conv_amount(pledged, cur_loc_info['currency_type'])
    usd_goal = self.conv_amount(goal, cur_loc_info['currency_type'])
    usd_goal_real_bin = self.get_amount_bin(usd_goal)
    launched = self.conv_dt(launched)
    deadline = self.conv_dt(deadline)

    term = self.get_term(launched, deadline)
    term_bin = self.get_term_bin(term)

    res_dct_pass = {'state': 'success',
                    'backers': backers,
                    'pledged': pledged,
                    'goal': goal,
                    'currency_type': cur_loc_info['currency_type'],
                    'country': cur_loc_info['country'],
                    'usd_pledged_real': usd_pledged,
                    'usd_goal_real': usd_goal,
                    'launched': launched,
                    'deadline': deadline,
                    'term': term,
                    'term_bin': term_bin,
                    'usd_goal_real_bin': usd_goal_real_bin
                    }

    return res_dct_pass

**5. getInfoFail(test, attrurl)**  
실패한 프로젝트들의 세부적인 정보(후원자 수, 펀딩 받은 금액, 목표 금액, 펀딩 시작 및 종료 날짜)를 가져온다. 펀딩 시작 날짜의 년도를 지정하는 로직은 성공한 프로젝트의 세부 정보를 저장 할 때와 동일하게 적용한다.

In [None]:
def getInfoFail(self, text, attrurl):
    backers = int(text.split("\n")[0].split(": ")[1])
    pledged = text.split('\n')[1].split(': ')[1].split(' of ')[0]
    goal = text.split('\n')[1].split(': ')[1].split(' of ')[1].split(' (')[0]
    year = text.split('\n')[2].split('(')[1][:-1]
    launched = text.split('\n')[2].split(": ")[1].split(' -> ')[0]
    deadline = text.split('\n')[2].split(": ")[1].split(' -> ')[1].split(' (')[0]

    month_start = month_dict.get(text.split('\n')[2].split(": ")[1].split(' -> ')[0].split(" ")[0])
    month_end = month_dict.get(text.split('\n')[2].split(": ")[1].split(' -> ')[1].split(' (')[0].split(" ")[0])

    if (month_end < month_start):
        start_year = str(int(year) - 1)
    else:
        start_year = year

    launched = start_year + ' ' + launched
    deadline = year + ' ' + deadline

    currency_t = goal[0]
    print("Currency at page 2:", currency_t)
    cur_loc_info = self.get_curloc_type(currency_t, attrurl)

    pledged = self.clean_amount(pledged)
    goal = self.clean_amount(goal)
    usd_pledged = self.conv_amount(pledged, cur_loc_info['currency_type'])
    usd_goal = self.conv_amount(goal, cur_loc_info['currency_type'])
    usd_goal_real_bin = self.get_amount_bin(usd_goal)
    launched = self.conv_dt(launched)
    deadline = self.conv_dt(deadline)
    term = self.get_term(launched, deadline)
    term_bin = self.get_term_bin(term)

    res_dct_fail = {'state': 'fail',
                    'backers': backers,
                    'pledged': pledged,
                    'goal': goal,
                    'currency_type': cur_loc_info['currency_type'],
                    'country': cur_loc_info['country'],
                    'usd_pledged_real': usd_pledged,
                    'usd_goal_real': usd_goal,
                    'launched': launched,
                    'deadline': deadline,
                    'term': term,
                    'term_bin': term_bin,
                    'usd_goal_real_bin': usd_goal_real_bin
                    }

    return res_dct_fail

### 4. KicktraqCrawl(KicktraqPage)
Kicktraq 홈페이지에서 정보를 받아오기 위해 Chrome driver를 제어하는 역할을 한다. 2개의 함수로 이루어져있다.  
이 클래스의 객체가 생성되면 생성자에 따라 chrome driver가 open 되었다는 문자를 print 해준다.

In [None]:
def __init__(self):
    print("opening 1st driver")
    super().__init__()
    print("1st driver opened")

**1. webcrawl(start, end, text)**  
지금까지 앞에서 다루어진 모든 함수들을 사용하여 데이터를 크롤링해오는 실질적인 작업을 하는 함수이다.  
함수 안에는 3개의 파라미터가 들어간다. 우선 kicktraq 홈페이지에서 보고자 하는 프로젝트 목록의 시작 페이지와 마지막 페이지를 각각 start, end에 넣어주고, text 자리에는 'dayone' 또는 'archive'를 넣어주면 된다.

또한 getInfoSuccess / getInfoFail에서는 넣어주지 않은 프로젝트의 세부 정보(이름, 카테고리, 한 줄 설명, 목표 금액 대비 후원 금액의 비율)를 추가적으로 저장한다.

모든 정보를 다 저장하고나면 데이터베이스에 접속하여 이를 insert해준다.

In [None]:
def webcrawl(self, start, end, text):
    print("Initiating crawling...")
    global collect

    self.timestart = time.time()
    self.start = start
    self.end = end
    self.text = text

    current_date = datetime.date(datetime.now())

    if self.text == "dayone":
        self.openpg = super().getdayone()
    elif self.text == "archive":
        self.openpg = super().getarchive()
    else:
        return "Either `dayone` or `archive` page available"

    for page in range(self.start, self.end + 1):
        print("==================================")
        print("Collecting page", page, "of",self.text,"......")
        print("==================================")

        self.driver.get('{}?page={}'.format(self.openpg, page))

        # Common
        prj_list = self.driver.find_elements_by_xpath("//div[@class='project-infobox']")
        prj_length = len(prj_list)

        for i in range(prj_length):
            print(">> Collecting project",i,"right now...")
            x = prj_list[i]
            p1 = x.find_element_by_xpath("h2/a")
            detail_url = p1.get_attribute("href")

            # Project Title
            name = p1.text
            name = name.replace("'", "")

            # Project Content
            blurb = x.find_element_by_xpath("div[not(@class)]").text
            blurb = blurb.replace("'", "")

            # if `blurb` is not empty
            if blurb != "":

                # Project Category
                cat = x.find_element_by_xpath("div[@class='project-cat']")
                cat_lst = cat.find_elements_by_tag_name("a")
                category = cat_lst[0].text

                # Project Information
                info = x.find_element_by_xpath("div[@class='project-infobits']/div[@class='project-details']")
                prj_info = info.text

                # funding rate
                rate_info = info.find_element_by_tag_name("span").text
                percent = int(re.findall('\d+', rate_info)[0])

                if percent >= 100 and self.text == 'dayone':
                    collect = super().getInfoSuccess(prj_info, detail_url)
                elif percent < 100 and self.text == 'archive':
                    collect = super().getInfoFail(prj_info, detail_url)
                else:
                    collect = {}

                if len(collect) != 0:
                    main_dct = {'collected_date': current_date,
                                'updated_date': current_date,
                                'name': name,
                                'blurb': blurb,
                                'category': category,
                                'funding_rate': percent}

                    if collect['country'] != '':
                        # merging results
                        main_dct.update(collect)
                        # print(main_dct)
                        print('Country:', main_dct['country'])
                        print('Currency type:', main_dct['currency_type'])

                        row_sql = dbt.DBcls.sqlselect(main_dct)
                        dbt.cur.execute(row_sql)
                        chk_row = dbt.cur.rowcount
                        get_row = dbt.cur.fetchone()
                        # print(chk_row)
                        if chk_row == 0:
                            insert_sql = dbt.DBcls.sqlinsert()
                            print("Query:", insert_sql)
                            dbt.cur.execute(insert_sql, main_dct)
                            print("Inserted in database\n")

                        else:
                            print("Data already exist. Updating to new information...")
                            get_id = get_row['id']
                            update_sql = dbt.DBcls.sqlupdate(get_id, main_dct)
                            print("Query:", update_sql, '\n')
                            dbt.cur.execute(update_sql)
                    else:
                        pass


    print("==================================")
    print("!COMPLETED!")
    print("==================================")
    print("Completed in {} seconds...".format(round(time.time() - self.timestart),2))

**2. quitWeb()**  
크롤링을 끝낸 chrome driver를 종료한다.

In [None]:
def quitWeb(self):
    qt = self.driver.close()
    return qt

## Example to use code
------------------

In [None]:
# Example
a = KicktraqCrawl()
a.webcrawl(1,1,"dayone")

# ...
# ...
a.quitWeb()
# dbt.DBcls.clcn()

## FULL CODES
------------------

In [None]:
#!/usr/bin/env python3

"""
Kicktraq Web Crawling
"""
import sys
import os
import re
import time
import datetime
import pandas as pd
from datetime import datetime
from selenium import webdriver as wd
from selenium.webdriver.chrome.options import Options

# Options
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

chrome_options = Options()
chrome_options.add_argument("--headless")

############
# LOCAL
############
import SW4DS_django.database.db as dbt
path = os.getcwd()



#########################
# Required Dictionaries
#########################
success_currency = {'AU$': 'AUD', 'CA$': 'CAD', 'HK$': 'HKD', 'MX$': 'MXN', 'NZ$': 'NZD', 'US$': 'USD', 'S$': 'SGD', 'SEK':'SEK','CHF':'CHF','NOK':'NOK','DKK':'DKK'}

curr_change = {'£':'GBP', '€':'EUR', 'CHF':'CHF', '¥':'JPY', 'SEK':'SEK','NOK':'NOK','DKK':'DKK'}

cur_loc = {'SEK': 'Sweden', 'NOK': 'Norway', 'DKK': 'Denmark', 'CHF': 'Switzerland', '£': 'UK', '¥': 'Japan', '€':'EU'}

country_list = ['AU','Australia', 'Canada', 'Denmark', 'Hong Kong', 'Japan', 'Mexico', 'New Zealand', 'Norway', 'UK', 'US', 'Sweden', 'Singapore', 'Switzerland']

us_states = {'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa', 'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'GU': 'Guam', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine', 'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri', 'MP': 'Northern Mariana Islands', 'MS': 'Mississippi', 'MT': 'Montana', 'NA': 'National', 'NC': 'North Carolina', 'ND': 'North Dakota', 'NE': 'Nebraska', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'PR': 'Puerto Rico', 'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VA': 'Virginia', 'VI': 'Virgin Islands', 'VT': 'Vermont', 'WA': 'Washington', 'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming'}

country_cursym = {'AU': '$', 'Australia': '$', 'Canada': '$', 'Denmark': 'DKK', 'Hong Kong': '$', 'Japan': '¥', 'Mexico': '$', 'New Zealand': '$', 'Norway': 'NOK', 'UK': '£', 'US': '$', 'Sweden': 'SEK', 'Singapore': '$', 'Switzerland': 'CHF'}

country_cursign = {'AU': 'AUD', 'Australia': 'AUD', 'Canada': 'CAD', 'Denmark': 'DKK', 'Hong Kong': 'HKD', 'Japan': 'JPY', 'Mexico': 'MXN', 'New Zealand': 'NZD', 'Norway': 'NOK', 'UK': 'GBP', 'US': 'USD', 'Sweden': 'SEK', 'Singapore': 'SGD', 'Switzerland': 'CHF'}


month_str = ['January', 'Feburary', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
month_int = [1,2,3,4,5,6,7,8,9,10,11,12]
month_dict = dict(zip(month_str, month_int))


# 2019-11-29 currency rate
# currency_rate = {'AED': 3.67294, 'AFN': 78.488715, 'ALL': 111.366429, 'AMD': 477.643925, 'ANG': 1.718723, 'AOA': 490.921, 'ARS': 59.7836, 'AUD': 1.475547, 'AWG': 1.8, 'AZN': 1.7025, 'BAM': 1.776316, 'BBD': 2, 'BDT': 84.831426, 'BGN': 1.77644, 'BHD': 0.376993, 'BIF': 1874.266469, 'BMD': 1, 'BND': 1.364788, 'BOB': 6.909796, 'BRL': 4.1894, 'BSD': 1, 'BTC': 0.000131988977, 'BTN': 71.559511, 'BWP': 10.87353, 'BYN': 2.107897, 'BZD': 2.014168, 'CAD': 1.32892, 'CDF': 1663.774438, 'CHF': 0.999624, 'CLF': 0.024, 'CLP': 836.299391, 'CNH': 7.024966, 'CNY': 7.0194, 'COP': 3504.68892, 'CRC': 561.219026, 'CUC': 1, 'CUP': 25.75, 'CVE': 100.7, 'CZK': 23.218009, 'DJF': 178, 'DKK': 6.788803, 'DOP': 52.800507, 'DZD': 120.205945, 'EGP': 16.1166, 'ERN': 14.999703, 'ETB': 30.536398, 'EUR': 0.908684, 'FJD': 2.1911, 'FKP': 0.775819, 'GBP': 0.775819, 'GEL': 2.97, 'GGP': 0.775819, 'GHS': 5.573357, 'GIP': 0.775819, 'GMD': 51.15, 'GNF': 9527.068875, 'GTQ': 7.694292, 'GYD': 208.499132, 'HKD': 7.82628, 'HNL': 24.601971, 'HRK': 6.761144, 'HTG': 97.060976, 'HUF': 304.124977, 'IDR': 14114.6, 'ILS': 3.4737, 'IMP': 0.775819, 'INR': 71.785006, 'IQD': 1192.923651, 'IRR': 42105, 'ISK': 122.499995, 'JEP': 0.775819, 'JMD': 140.55937, 'JOD': 0.709, 'JPY': 109.562, 'KES': 102.89, 'KGS': 69.670113, 'KHR': 4074.453665, 'KMF': 447.849773, 'KPW': 900, 'KRW': 1179.24, 'KWD': 0.304129, 'KYD': 0.832751, 'KZT': 385.985803, 'LAK': 8858.904662, 'LBP': 1511.104191, 'LKR': 180.616256, 'LRD': 193.000002, 'LSL': 14.736115, 'LYD': 1.407021, 'MAD': 9.642889, 'MDL': 17.494842, 'MGA': 3680.764949, 'MKD': 55.907583, 'MMK': 1505.383821, 'MNT': 2688.612672, 'MOP': 8.056128, 'MRO': 357, 'MRU': 37.452432, 'MUR': 36.671983, 'MVR': 15.41, 'MWK': 735.774765, 'MXN': 19.441454, 'MYR': 4.1765, 'MZN': 64.06, 'NAD': 14.736115, 'NGN': 362.65, 'NIO': 33.70964, 'NOK': 9.190995, 'NPR': 114.495256, 'NZD': 1.554032, 'OMR': 0.38502, 'PAB': 1, 'PEN': 3.381954, 'PGK': 3.401662, 'PHP': 50.845, 'PKR': 155.140125, 'PLN': 3.930198, 'PYG': 6468.75888, 'QAR': 3.639072, 'RON': 4.3467, 'RSD': 106.86, 'RUB': 64.095, 'RWF': 932.664695, 'SAR': 3.750013, 'SBD': 8.267992, 'SCR': 13.699962, 'SDG': 45.079153, 'SEK': 9.54568, 'SGD': 1.366362, 'SHP': 0.775819, 'SLL': 7438.043346, 'SOS': 578.084428, 'SRD': 7.458, 'SSP': 130.26, 'STD': 21560.79, 'STN': 22.35, 'SVC': 8.74438, 'SYP': 515.029856, 'SZL': 14.736116, 'THB': 30.21, 'TJS': 9.687026, 'TMT': 3.51, 'TND': 2.8485, 'TOP': 2.3211, 'TRY': 5.744156, 'TTD': 6.751737, 'TWD': 30.508998, 'TZS': 2300.301993, 'UAH': 24.007175, 'UGX': 3697.282408, 'USD': 1, 'UYU': 37.91418, 'UZS': 9487.711828, 'VEF': 248487.642241, 'VES': 22704, 'VND': 23196.03724, 'VUV': 116.342107, 'WST': 2.643078, 'XAF': 596.057688, 'XAG': 0.05915423, 'XAU': 0.00068667, 'XCD': 2.70255, 'XDR': 0.728518, 'XOF': 596.057688, 'XPD': 0.00054318, 'XPF': 108.434855, 'XPT': 0.00111608, 'YER': 250.400036, 'ZAR': 14.627867, 'ZMW': 14.614141, 'ZWL': 322.000001}

currency_rate = dbt.currency_lst
print("Currency rate called")

chrome_options = Options()
chrome_options.add_argument("--headless")

class KicktraqOpen:
    def __init__(self, url):
        self.url = url
        self.driver = wd.Chrome(path + '/chromedriver', options=chrome_options)
        self.driver.get(self.url)

class WebcrawlClean(KicktraqOpen):
    def clean_amount(self, amount):
        # take out ,
        clamount = amount.replace(',', '')
        result = re.findall('\d+', clamount)[0]
        result = int(result)
        return result

    def conv_amount(self, amount, curr_t):
        # usd_pledged_real
        # usd_goal_real
        check_start = curr_t.split(' ')
        if len(check_start) == 2:
            curr_t = check_start[0]

        if curr_t != "USD" and curr_t != "USD (*)":
            curr_r = currency_rate[curr_t]
            new_amount = amount/curr_r
        else:
            new_amount = amount

        new_amount = round(new_amount,2)
        return new_amount

    def get_amount_bin(self, amountx):
        # usd_goal_real_bin
        goal_bin = (lambda x: '1' if x <= 500 else '2' if x <= 1000 else '3' if x <= 3000 else '4' if x <= 5000 else '5' if x <= 10000 else '6' if x <= 50000 else '7' if x <= 100000 else '8')(amountx)
        return goal_bin

    def conv_dt(self, datex):
        # converting date to YYYY-MM-DD format
        fmt = datex[-2:]
        convdtime = datetime.strptime(datex, '%Y %B %d' + fmt)
        convstime = convdtime.strftime("%Y-%m-%d")
        return convstime

    def get_term(self, start, end):
        startdate = datetime.strptime(start, "%Y-%m-%d").date()
        enddate = datetime.strptime(end, "%Y-%m-%d").date()
        diffdays = (enddate - startdate).days
        return diffdays

    def get_term_bin(self, term):
        term_bin = (lambda x: '1' if x <= 10 else '2' if x <= 15 else '3' if x <= 21 else '4' if x <= 30 else '5' if x <= 45 else '6' if x <= 60 else '7')(term)
        return term_bin


class KicktraqPage(WebcrawlClean):
    def __init__(self):
        super().__init__("https://www.kicktraq.com/projects/")

    def getdayone(self):
        self.dayonepage = "https://www.kicktraq.com/dayones/"
        return self.dayonepage

    def getarchive(self):
        self.archivepage = "https://www.kicktraq.com/archive/"
        return self.archivepage

    def get_curloc_type(self, currency_symb, detail_url):
        global currency_type
        global country_name

        self.currency_symb = currency_symb
        self.detail_url = detail_url

        print("opening 2nd driver")
        self.driverx = wd.Chrome(path + '/chromedriver', options=chrome_options)
        self.driverx.get(self.detail_url)

        content = self.driverx.find_element_by_xpath("//div[@id='project-info-text']")
        content_lst = content.text.split('\n')

        funding_lst = [s for s in content_lst if "Funding:" in s]

        # TEMPORARY
        if len(funding_lst) == 0:
            funding_lst = [s for s in content_lst if "Funded:" in s]

        funding_goal = funding_lst[0].split(" of ")[1]

        # if the currency is in 'kr', get the country currency
        if currency_symb.isalpha():
            if currency_symb.lower() == 'k':
                currency_symb = funding_goal.split(" ")[1]  # get country for `kr` currency
            else:
                currency_symb = '$'


        ########################################
        # validate country for all currency
        ########################################
        content = self.driverx.find_element_by_id('button-backthis')
        prj_addr = content.get_attribute("href")
        print("opening 3rd driver")
        self.new_driver = wd.Chrome(path + '/chromedriver', options=chrome_options)
        self.new_driver.get(prj_addr)

        ########################
        # KICKSTARTER page
        ########################

        # check if content exists
        # check_content > 0 : project DOES NOT exist
        # check_content = 0 : project exist
        check_content = self.new_driver.find_elements_by_xpath("//div[@id='hidden_project']")

        if len(check_content) != 0:
            # project DOES NOT exist
            if currency_symb == '$':
                currency_type = 'USD (*)'
                country_name = ''
            elif currency_symb in cur_loc:
                currency_type = currency_symb + ' (*)'
                country_name = ''
        else:
            # project exist

            # check if project is canceled
            # prj_status > 0 : project CANCELED
            # prj_status = 0 : project ongoing
            prj_status = self.new_driver.find_elements_by_xpath("//div[(@class='normal type-18')]")

            if len(prj_status) != 0:
                # project canceled

                # location
                loc = self.new_driver.find_elements_by_xpath("//div[@class='py2 py3-lg flex items-center auto-scroll-x']/a['nowrap navy-700 flex items-center medium mr3 type-12 keyboard-focusable']/span[@class='ml1']")
                locx = len(loc) - 1
                location = loc[locx].text
                region = location.split(',')[1].strip()

                if region == 'AU':
                    region = 'Australia'
                elif region in us_states:
                    region = 'US'

                if region in us_states:
                    region_cur = '$'
                elif region in country_cursym:
                    region_cur = country_cursym[region]
                else:
                    region_cur = '€'

                if currency_symb != region_cur:
                    # if 2nd page currency and 3rd page is different
                    if currency_symb == '$':  # assuming as USD as we do not know the country for $
                        currency_type = 'USD'
                    else:
                        currency_type = curr_change[currency_symb]
                else:
                    # if 2nd page currency and 3rd page is same
                    if region in country_list:
                        currency_type = country_cursign[region]
                    else:
                        currency_type = 'EUR'

                country_name = region

            else:
                # project is ongoing or successful

                # check if project is ongoing
                # check if project is ending soon
                prj_ongoing = self.new_driver.find_elements_by_xpath("//div[@class='ml5 ml0-lg']/div/div/span[@class='block type-16 type-28-md bold dark-grey-500']")
                prj_end_soon = self.new_driver.find_elements_by_xpath("//div[@class='ml5 ml0-lg']/div/div/span[@class='block type-16 type-28-md bold red-400']")

                # prj_ongoing  > 0 : project is running
                # prj_end_sonn > 0 : project is ending soon
                if len(prj_ongoing) != 0 or len(prj_end_soon) != 0:
                    # ongoing project
                    loc = self.new_driver.find_elements_by_xpath("//div[@class='py2 py3-lg flex items-center auto-scroll-x']/a['nowrap navy-700 flex items-center medium mr3 type-12 keyboard-focusable']/span[@class='ml1']")
                    locx = len(loc) - 1
                    location = loc[locx].text
                    region = location.split(',')[1].strip()

                    if region == 'AU':
                        region = 'Australia'
                    elif region in us_states:
                        region = 'US'

                    if region in us_states:
                        region_cur = '$'
                    elif region in country_cursym:
                        region_cur = country_cursym[region]
                    else:
                        region_cur = '€'

                    if currency_symb != region_cur:
                        # if 2nd page currency and 3rd page is different
                        if currency_symb == '$':  # assuming as USD as we do not know the country for $
                            currency_type = 'USD'
                        else:
                            currency_type = curr_change[currency_symb]
                    else:
                        # if 2nd page currency and 3rd page is same
                        if region in country_list:
                            currency_type = country_cursign[region]
                        else:
                            currency_type = 'EUR'

                    country_name = region
                else:
                    # project ended
                    get_money = self.new_driver.find_element_by_xpath("//div[@class='mb3']/h3[@class='mb0']/span[@class='money']").text
                    loc = self.new_driver.find_elements_by_xpath("//div[@class='NS_projects__category_location ratio-16-9 flex items-center']/a[@class='grey-dark mr3 nowrap type-12']")
                    location = loc[0].text
                    region = location.split(',')[1].strip()

                    if region == 'AU':
                        region = 'Australia'
                    elif region in us_states:
                        region = 'US'

                    # get the first currency sign of money
                    first_check = get_money[0]

                    if first_check != '$':
                        # will either be euro, pound, sek, nok, dkk, ....
                        if first_check not in ['£', '€', '¥']:
                            second_check = get_money.split(' ')
                            second_check_sign = second_check[0]
                            currency_type = success_currency[second_check_sign]
                        else:
                            currency_type = curr_change[first_check]

                    else:
                        currency_type = 'USD'

                    country_name = region

            self.new_driver.close()
        res_dict = {'currency_type': currency_type, 'country': country_name}
        self.driverx.close()
        return res_dict


    def getInfoSuccess(self, text, attrurl):
        backers = int(text.split("\n")[0].split(": ")[1])
        pledged = text.split('\n')[1].split(': ')[1].split(' of ')[0]
        goal = text.split('\n')[1].split(': ')[1].split(' of ')[1].split(' (')[0]
        year = text.split('\n')[3].split('(')[1][:-1]
        launched = text.split('\n')[3].split(": ")[1].split(' -> ')[0]
        deadline = text.split('\n')[3].split(": ")[1].split(' -> ')[1].split(' (')[0]

        month_start = month_dict.get(text.split('\n')[3].split(": ")[1].split(' -> ')[0].split(" ")[0])
        month_end = month_dict.get(text.split('\n')[3].split(": ")[1].split(' -> ')[1].split(' (')[0].split(" ")[0])

        if (month_end < month_start):
            start_year = str(int(year) - 1)
        else:
            start_year = year

        launched = start_year + ' ' + launched
        deadline = year + ' ' + deadline

        currency_t = goal[0]
        print("Currency at page 2:", currency_t)
        cur_loc_info = self.get_curloc_type(currency_t, attrurl)

        pledged = self.clean_amount(pledged)
        goal = self.clean_amount(goal)
        usd_pledged = self.conv_amount(pledged, cur_loc_info['currency_type'])
        usd_goal = self.conv_amount(goal, cur_loc_info['currency_type'])
        usd_goal_real_bin = self.get_amount_bin(usd_goal)
        launched = self.conv_dt(launched)
        deadline = self.conv_dt(deadline)

        term = self.get_term(launched, deadline)
        term_bin = self.get_term_bin(term)

        res_dct_pass = {'state': 'success',
                        'backers': backers,
                        'pledged': pledged,
                        'goal': goal,
                        'currency_type': cur_loc_info['currency_type'],
                        'country': cur_loc_info['country'],
                        'usd_pledged_real': usd_pledged,
                        'usd_goal_real': usd_goal,
                        'launched': launched,
                        'deadline': deadline,
                        'term': term,
                        'term_bin': term_bin,
                        'usd_goal_real_bin': usd_goal_real_bin
                        }

        return res_dct_pass

    def getInfoFail(self, text, attrurl):
        backers = int(text.split("\n")[0].split(": ")[1])
        pledged = text.split('\n')[1].split(': ')[1].split(' of ')[0]
        goal = text.split('\n')[1].split(': ')[1].split(' of ')[1].split(' (')[0]
        year = text.split('\n')[2].split('(')[1][:-1]
        launched = text.split('\n')[2].split(": ")[1].split(' -> ')[0]
        deadline = text.split('\n')[2].split(": ")[1].split(' -> ')[1].split(' (')[0]

        month_start = month_dict.get(text.split('\n')[2].split(": ")[1].split(' -> ')[0].split(" ")[0])
        month_end = month_dict.get(text.split('\n')[2].split(": ")[1].split(' -> ')[1].split(' (')[0].split(" ")[0])

        if (month_end < month_start):
            start_year = str(int(year) - 1)
        else:
            start_year = year

        launched = start_year + ' ' + launched
        deadline = year + ' ' + deadline

        currency_t = goal[0]
        print("Currency at page 2:", currency_t)
        cur_loc_info = self.get_curloc_type(currency_t, attrurl)

        pledged = self.clean_amount(pledged)
        goal = self.clean_amount(goal)
        usd_pledged = self.conv_amount(pledged, cur_loc_info['currency_type'])
        usd_goal = self.conv_amount(goal, cur_loc_info['currency_type'])
        usd_goal_real_bin = self.get_amount_bin(usd_goal)
        launched = self.conv_dt(launched)
        deadline = self.conv_dt(deadline)
        term = self.get_term(launched, deadline)
        term_bin = self.get_term_bin(term)

        res_dct_fail = {'state': 'fail',
                        'backers': backers,
                        'pledged': pledged,
                        'goal': goal,
                        'currency_type': cur_loc_info['currency_type'],
                        'country': cur_loc_info['country'],
                        'usd_pledged_real': usd_pledged,
                        'usd_goal_real': usd_goal,
                        'launched': launched,
                        'deadline': deadline,
                        'term': term,
                        'term_bin': term_bin,
                        'usd_goal_real_bin': usd_goal_real_bin
                        }

        return res_dct_fail


class KicktraqCrawl(KicktraqPage):
    def __init__(self):
        print("opening 1st driver")
        super().__init__()
        print("1st driver opened")

    def webcrawl(self, start, end, text):
        print("Initiating crawling...")
        global collect

        self.timestart = time.time()
        self.start = start
        self.end = end
        self.text = text

        current_date = datetime.date(datetime.now())

        if self.text == "dayone":
            self.openpg = super().getdayone()
        elif self.text == "archive":
            self.openpg = super().getarchive()
        else:
            return "Either `dayone` or `archive` page available"

        for page in range(self.start, self.end + 1):
            print("==================================")
            print("Collecting page", page, "of",self.text,"......")
            print("==================================")

            self.driver.get('{}?page={}'.format(self.openpg, page))

            # Common
            prj_list = self.driver.find_elements_by_xpath("//div[@class='project-infobox']")
            prj_length = len(prj_list)

            for i in range(prj_length):
                print(">> Collecting project",i,"right now...")
                x = prj_list[i]
                p1 = x.find_element_by_xpath("h2/a")
                detail_url = p1.get_attribute("href")

                # Project Title
                name = p1.text
                name = name.replace("'", "")

                # Project Content
                blurb = x.find_element_by_xpath("div[not(@class)]").text
                blurb = blurb.replace("'", "")

                # if `blurb` is not empty
                if blurb != "":

                    # Project Category
                    cat = x.find_element_by_xpath("div[@class='project-cat']")
                    cat_lst = cat.find_elements_by_tag_name("a")
                    category = cat_lst[0].text

                    # Project Information
                    info = x.find_element_by_xpath("div[@class='project-infobits']/div[@class='project-details']")
                    prj_info = info.text

                    # funding rate
                    rate_info = info.find_element_by_tag_name("span").text
                    percent = int(re.findall('\d+', rate_info)[0])

                    if percent >= 100 and self.text == 'dayone':
                        collect = super().getInfoSuccess(prj_info, detail_url)
                    elif percent < 100 and self.text == 'archive':
                        collect = super().getInfoFail(prj_info, detail_url)
                    else:
                        collect = {}

                    if len(collect) != 0:
                        main_dct = {'collected_date': current_date,
                                    'updated_date': current_date,
                                    'name': name,
                                    'blurb': blurb,
                                    'category': category,
                                    'funding_rate': percent}

                        if collect['country'] != '':
                            # merging results
                            main_dct.update(collect)
                            # print(main_dct)
                            print('Country:', main_dct['country'])
                            print('Currency type:', main_dct['currency_type'])

                            row_sql = dbt.DBcls.sqlselect(main_dct)
                            dbt.cur.execute(row_sql)
                            chk_row = dbt.cur.rowcount
                            get_row = dbt.cur.fetchone()
                            # print(chk_row)
                            if chk_row == 0:
                                insert_sql = dbt.DBcls.sqlinsert()
                                print("Query:", insert_sql)
                                dbt.cur.execute(insert_sql, main_dct)
                                print("Inserted in database\n")

                            else:
                                print("Data already exist. Updating to new information...")
                                get_id = get_row['id']
                                update_sql = dbt.DBcls.sqlupdate(get_id, main_dct)
                                print("Query:", update_sql, '\n')
                                dbt.cur.execute(update_sql)
                        else:
                            pass


        print("==================================")
        print("!COMPLETED!")
        print("==================================")
        print("Completed in {} seconds...".format(round(time.time() - self.timestart),2))

    def quitWeb(self):
        qt = self.driver.close()
        return qt


# Example
a = KicktraqCrawl()
a.webcrawl(1,1,"dayone")

# ...
# ...
a.quitWeb()
# dbt.DBcls.clcn()