In [1]:
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
def open_data(filename):
    """Read ``filename`` and return its lines as a list of strings.

    Each entry is one line of the file with its trailing newline removed.
    Blank lines are kept as empty strings.
    """
    with open(filename, 'r') as handle:
        raw_lines = handle.readlines()

    # Keep only the text in front of the line terminator.
    return [raw.partition("\n")[0] for raw in raw_lines]

def wrap_str(text):
    """Return ``text`` enclosed in double quotes for use as a CSV field.

    Double quotes already present in ``text`` are doubled so the field stays
    parseable.
    """
    escaped = text.replace('"', '""')
    return ''.join(('"', escaped, '"'))

def clean_comma(text):
    """Return ``text`` with every comma removed."""
    return ''.join(text.split(','))

def clean_money(text):
    """Reduce a money string to its numeric part.

    Keeps only ASCII digits and ``.``; currency symbols, currency codes,
    spaces and thousands separators are all dropped, e.g. ``'CA$ 1,234'``
    becomes ``'1234'``.

    Parameters:
        text: the raw amount string scraped from the page.

    Returns:
        The amount as a string of digits (and ``.`` if present), or ``''``
        when ``text`` contains no digits.
    """
    # The previous pattern listed currency prefixes inside one character class,
    # where `|` is a literal and every letter stands alone. Anything not listed
    # (a correctly decoded euro or yen sign, lowercase codes, ...) leaked
    # through. Whitelisting the numeric characters covers every currency.
    return re.sub(r'[^0-9.]', '', text)


In [3]:
# Load the list of URLs to scrape, one per line; the path is relative to the notebook's working directory.
url_list = open_data('../Dataset/final_urls.txt')

In [4]:
# Hand-picked project URLs for spot checks. The variable names suggest one
# campaign per state (successful / live / failed) — TODO confirm the pages are
# still in those states before relying on them.
successful_url = 'https://www.kickstarter.com/projects/914635363/lace-anchors-20-a-simple-design-for-simple-people'
live_url = 'https://www.kickstarter.com/projects/487334780/living-in-a-dream'
fail_url = 'https://www.kickstarter.com/projects/1579473521/re-sail-bags-hand-crafted-in-redondo-beach-from-us'
testurl = [successful_url, live_url, fail_url]  # not referenced by the other cells shown in this notebook

In [5]:
def failed_project(soup):
    """Extract one CSV row of campaign data from a failed project's page.

    Parameters:
        soup: the parsed project page (a BeautifulSoup object).

    Returns:
        A list of ten strings: title, inventor, number_of_backers,
        total_pledged, goal, location, category, number_of_pledged_option,
        Pledge_Detail and the literal status ``'fail'``. Title, inventor and
        Pledge_Detail are already CSV-quoted with ``wrap_str``.

    Raises:
        ValueError: if the page title contains no standalone word "by".
        AttributeError, IndexError, KeyError, TypeError: if an element or
            attribute the scraper relies on is missing from the page.
    """
    # Assumes page titles look like "<project> by <creator> — Kickstarter"
    # (TODO confirm). Split on the first standalone word "by": a plain
    # split('by') also cut titles at a "by" inside a word such as "Baby".
    # A project title that itself contains " by " is still split too early.
    page_title = soup.find('title').text.strip()
    pieces = re.split(r'\s+by\s+', page_title, maxsplit=1)
    if len(pieces) != 2:
        raise ValueError('unexpected page title: %r' % page_title)
    title = pieces[0].strip()
    # Drop the site-name suffix and keep the whole creator name. The previous
    # character-class pattern also split on spaces and returned the first word.
    inventor = re.split(r'\s*—\s*Kickstarter', pieces[1], maxsplit=1)[0].strip()

    number_of_backers = soup.find_all('div', attrs={'id': 'backers_count'})[0].text.strip()
    # Both amounts are read from data attributes; the part after '.' is dropped.
    total_pledged = soup.find('div', attrs={'class': 'num nowrap'})['data-pledged'].split('.')[0]
    goal = soup.find(attrs={'class': 'num nowrap'})['data-goal'].split('.')[0]

    # The last matching link is used; ', ' becomes '-' so the unquoted field
    # cannot add a CSV column.
    place_links = soup.find_all('a', href=re.compile(r'/discover/places/'))
    location = place_links[-1].text.strip().replace(', ', '-')
    category_links = soup.find_all('a', href=re.compile(r'/discover/categories/'))
    category = category_links[-1].text.strip()

    # One <li> per pledge option; look them up once and reuse the result.
    # (Original author's note on this lookup: "not being scraped for project".)
    pledge_items = soup.find_all('li', class_='pledge-selectable-sidebar')
    number_of_pledged_option = str(len(pledge_items))
    Pledge_Detail = [(item.find('span', class_='pledge__backer-count').text.strip(),
                      item.find('span', class_='money').text.strip())
                     for item in pledge_items]
    Pledge_Detail_str = ', '.join(map(str, Pledge_Detail))

    return [wrap_str(title),
            # Quoted because a full creator name may contain a comma.
            wrap_str(inventor),
            clean_comma(number_of_backers),
            clean_money(total_pledged),
            clean_money(goal),  # clean_money already strips commas
            location,
            category,
            number_of_pledged_option,
            wrap_str(Pledge_Detail_str), 'fail']

In [6]:
def live_project(soup):
    """Extract one CSV row of campaign data from a live project's page.

    Parameters:
        soup: the parsed project page (a BeautifulSoup object).

    Returns:
        A list of ten strings: title, inventor, number_of_backers,
        total_pledged, goal, location, category, number_of_pledged_option,
        Pledge_Detail and the literal status ``'live'``. Title, inventor and
        Pledge_Detail are already CSV-quoted with ``wrap_str``.

    Raises:
        ValueError: if the page title contains no standalone word "by".
        AttributeError, IndexError, KeyError, TypeError: if an element or
            attribute the scraper relies on is missing from the page.
    """
    # Assumes page titles look like "<project> by <creator> — Kickstarter"
    # (TODO confirm). Split on the first standalone word "by": a plain
    # split('by') also cut titles at a "by" inside a word such as "Baby".
    # A project title that itself contains " by " is still split too early.
    page_title = soup.find('title').text.strip()
    pieces = re.split(r'\s+by\s+', page_title, maxsplit=1)
    if len(pieces) != 2:
        raise ValueError('unexpected page title: %r' % page_title)
    title = pieces[0].strip()
    # Drop the site-name suffix and keep the whole creator name. The previous
    # character-class pattern also split on spaces and returned the first word.
    inventor = re.split(r'\s*—\s*Kickstarter', pieces[1], maxsplit=1)[0].strip()

    number_of_backers = soup.find_all('div', attrs={'id': 'backers_count'})[0].text.strip()
    total_pledged = soup.find('div', attrs={'class': 'num nowrap'})['data-pledged'].split('.')[0]

    # Prefer the money figure inside the spotlight-stats block; fall back to
    # the regular stats block when the first block (or its money span) is
    # absent, which surfaces as AttributeError on the None returned by find().
    try:
        goal_text = soup.find("div", attrs={"class": "NS_campaigns__spotlight_stats"})\
                        .find('span', attrs={'class': 'money'}).text
    except AttributeError:
        goal_text = soup.find("div", attrs={"class": "NS_campaigns__stats"})\
                        .find('span', attrs={'class': 'money'}).text
    # `goal` was never assigned before, so the return below always raised
    # NameError and no live project could be scraped.
    goal = clean_money(goal_text)

    # The last matching link is used; ', ' becomes '-' so the unquoted field
    # cannot add a CSV column.
    place_links = soup.find_all('a', href=re.compile(r'/discover/places/'))
    location = place_links[-1].text.strip().replace(', ', '-')
    category_links = soup.find_all('a', href=re.compile(r'/discover/categories/'))
    category = category_links[-1].text.strip()

    # One <li> per pledge option; look them up once and reuse the result.
    # (Original author's note on this lookup: "not being scraped for project".)
    pledge_items = soup.find_all('li', class_='pledge-selectable-sidebar')
    number_of_pledged_option = str(len(pledge_items))
    # The first entry is skipped for the detail list, as before, while the
    # count above includes it — presumably a no-reward option; TODO confirm.
    Pledge_Detail = [(item.find('span', class_='pledge__backer-count').text.strip(),
                      item.find('span', class_='money').text.strip())
                     for item in pledge_items[1:]]
    Pledge_Detail_str = ', '.join(map(str, Pledge_Detail))

    return [wrap_str(title),
            # Quoted because a full creator name may contain a comma.
            wrap_str(inventor),
            clean_comma(number_of_backers),
            clean_money(total_pledged),
            goal,
            location,
            category,
            number_of_pledged_option,
            wrap_str(Pledge_Detail_str), 'live'
           ]

In [7]:
def success_project(soup):
    """Extract one CSV row of campaign data from a successful project's page.

    Parameters:
        soup: the parsed project page (a BeautifulSoup object).

    Returns:
        A list of ten strings: title, inventor, number_of_backers,
        total_pledged, goal, location, category, number_of_pledged_option,
        Pledge_Detail and the literal status ``'success'``. Title, inventor
        and Pledge_Detail are already CSV-quoted with ``wrap_str``.

    Raises:
        ValueError: if the page title contains no standalone word "by".
        AttributeError, IndexError: if an element the scraper relies on is
            missing from the page.
    """
    # Assumes page titles look like "<project> by <creator> — Kickstarter"
    # (TODO confirm). Split on the first standalone word "by": a plain
    # split('by') also cut titles at a "by" inside a word such as "Baby".
    # A project title that itself contains " by " is still split too early.
    page_title = soup.find('title').text.strip()
    pieces = re.split(r'\s+by\s+', page_title, maxsplit=1)
    if len(pieces) != 2:
        raise ValueError('unexpected page title: %r' % page_title)
    title = pieces[0].strip()
    # Drop the site-name suffix and keep the whole creator name. The previous
    # character-class pattern also split on spaces and returned the first word.
    inventor = re.split(r'\s*—\s*Kickstarter', pieces[1], maxsplit=1)[0].strip()

    # The first <b> on the page starts with the backer count; the element
    # right after it carries the pledged amount.
    first_bold = soup.find_all('b')[0]
    number_of_backers = first_bold.text.split(' ')[0]
    total_pledged = first_bold.find_next().text.replace('$', '').replace(',', '')

    # The goal is the second money span inside the description container.
    goal_text = (soup.find("div", attrs={"class": "NS_projects__description_section"})
                 .find("div", attrs={"class": "description-container"})
                 .find_all("span", attrs={"class": "money"})[1]
                 .text
                )
    goal = clean_money(goal_text)  # also strips commas, so no second pass is needed

    # The last matching link is used; ', ' becomes '-' so the unquoted field
    # cannot add a CSV column.
    place_links = soup.find_all('a', href=re.compile(r'/discover/places/'))
    location = place_links[-1].text.strip().replace(', ', '-')
    category_links = soup.find_all('a', href=re.compile(r'/discover/categories/'))
    category = category_links[-1].text.strip()

    # One <li> per pledge option; look them up once and reuse the result.
    # (Original author's note on this lookup: "not being scraped for project".)
    pledge_items = soup.find_all('li', class_='pledge-selectable-sidebar')
    number_of_pledged_option = str(len(pledge_items))
    Pledge_Detail = [(item.find('span', class_='pledge__backer-count').text.strip(),
                      item.find('span', class_='money').text.strip())
                     for item in pledge_items]
    Pledge_Detail_str = ', '.join(map(str, Pledge_Detail))

    return [wrap_str(title),
            # Quoted because a full creator name may contain a comma.
            wrap_str(inventor),
            clean_comma(number_of_backers),
            clean_money(total_pledged),
            goal,
            location,
            category,
            number_of_pledged_option,
            wrap_str(Pledge_Detail_str), 'success']

In [7]:
def get_data(url_list):
    """Scrape every URL in ``url_list`` sequentially into ``KS_data_test.csv``.

    Each page is fetched, parsed, and handed to ``success_project``,
    ``live_project`` or ``failed_project`` according to the campaign-state
    ``div`` found on it. One row is appended to the CSV per page. Any URL
    that cannot be scraped is appended to ``failed_links.csv`` as
    ``<index>,"<url>"`` and the loop moves on.

    Parameters:
        url_list: list of project page URLs.

    Returns:
        None. Results are written to the two CSV files.
    """
    output_path = 'KS_data_test.csv'
    header = ['title',
              'inventor',
              'number_of_backers',
              'total_pledged',
              'goal',
              'location',
              'category',
              'number_of_pledged_option',
              'Pledge_Detail',
              # Rows carry a tenth field (fail / live / success). Without a
              # matching header column, readers misalign every row.
              'status']

    # Write the header only for a new or empty file, so repeated calls do not
    # scatter header lines through the data.
    if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
        with open(output_path, 'a+') as data:
            data.write(','.join(header) + '\n')

    for i, url in enumerate(url_list):
        try:
            print("Link", i)
            response = requests.get(url).text
            soup = bs(response, 'html5lib')

            # Same precedence as before when several markers are present:
            # failed wins over live, which wins over successful.
            if soup.find('div', attrs={'class': 'Campaign-state-failed'}) is not None:
                result = failed_project(soup)
            elif soup.find('div', attrs={'class': 'Campaign-state-live'}) is not None:
                result = live_project(soup)
            elif soup.find('div', attrs={'class': 'Campaign-state-successful'}) is not None:
                result = success_project(soup)
            else:
                # Previously `result` kept the previous iteration's value here,
                # so the previous project's row was written a second time.
                raise ValueError('no recognised campaign state on page')

            with open(output_path, 'a+') as data:
                # Fields are already CSV-escaped by the scraper functions.
                data.write(','.join(result[:10]) + '\n')
        except Exception:
            # Best effort: record the failure and carry on. Unlike a bare
            # `except:`, this lets Ctrl-C / SystemExit stop the run.
            with open("failed_links.csv", "a+") as data:
                data.write(str(i) + "," + wrap_str(url) + "\n")

In [None]:
%%time
from multiprocessing import Pool

def get_single_data(url):
    """Scrape one project page and append its row to ``KickStarter_data.csv``.

    The page is fetched, parsed, and handed to ``success_project``,
    ``live_project`` or ``failed_project`` according to the campaign-state
    ``div`` found on it. If anything goes wrong the URL is appended to
    ``failed_links.csv`` as ``,"<url>"`` instead.

    Parameters:
        url: the project page URL.

    Returns:
        None. Results are written to the two CSV files.
    """
    try:
        response = requests.get(url).text
        soup = bs(response, "html5lib")

        # Same precedence as before when several markers are present:
        # failed wins over live, which wins over successful.
        if soup.find('div', attrs={'class': 'Campaign-state-failed'}) is not None:
            result = failed_project(soup)
        elif soup.find('div', attrs={'class': 'Campaign-state-live'}) is not None:
            result = live_project(soup)
        elif soup.find('div', attrs={'class': 'Campaign-state-successful'}) is not None:
            result = success_project(soup)
        else:
            # Explicit failure instead of relying on a NameError from an
            # unassigned `result`; the URL is still logged below.
            raise ValueError('no recognised campaign state on page')

        with open('KickStarter_data.csv', 'a+') as data:
            # Fields are already CSV-escaped by the scraper functions.
            data.write(','.join(result[:10]) + '\n')
    except Exception:
        # Best effort: record the failure and return. Unlike a bare `except:`,
        # this does not swallow KeyboardInterrupt / SystemExit.
        with open('failed_links.csv', 'a+') as data:
            data.write("," + wrap_str(url) + "\n")
        
if __name__ == "__main__":
    # Scrape all URLs with 20 worker processes. map() blocks until every URL
    # has been handled; the context manager then shuts the workers down
    # instead of leaving them alive after the cell finishes.
    with Pool(20) as pool:
        pool.map(get_single_data, url_list)

In [21]:
# Smoke test: scrape only the first URL.
# NOTE(review): get_single_data as defined in this notebook prints nothing, so
# the output recorded under this cell appears to come from an earlier version
# of the function — re-run to refresh it.
get_single_data(url_list[0])

<class 'bs4.element.Tag'> <class 'NoneType'> <class 'NoneType'>
['"Imaginary Drugs"', 'Michael', '650', '13609', '5500', 'Barnegat-NJ', 'Comics', '20', '"(\'13 backers\', \'$1\'), (\'164 backers\', \'$5\'), (\'278 backers\', \'$10\'), (\'40 backers\', \'$15\'), (\'78 backers\', \'$25\'), (\'2 backers\', \'$30\'), (\'37 backers\', \'$40\'), (\'6 backers\', \'$50\'), (\'11 backers\', \'$65\'), (\'1 backer\', \'$100\'), (\'3 backers\', \'$150\'), (\'5 backers\', \'$150\'), (\'0 backers\', \'$150\'), (\'3 backers\', \'$150\'), (\'0 backers\', \'$150\'), (\'1 backer\', \'$150\'), (\'1 backer\', \'$200\'), (\'1 backer\', \'$250\'), (\'1 backer\', \'$250\'), (\'1 backer\', \'$300\')"', 'success']


In [14]:
# Run the sequential scraper over the first 20 URLs; it prints "Link <i>" for each one.
get_data(url_list[0:20])

Link 0
Link 1
Link 2
Link 3
Link 4
Link 5
Link 6
Link 7
Link 8
Link 9
Link 10
Link 11
Link 12
Link 13
Link 14
Link 15
Link 16
Link 17
Link 18
Link 19
