#  KICKSTARTER.COM WEB SCRAPER
  
##  Written for Python 3.3


In [6]:
import urllib3
import bs4
import csv
import re
import datetime
# import os

#======================
# Initialized variables
#======================

search = 'cards'

BASE_URL = 'https://www.kickstarter.com'
first_url = 'https://www.kickstarter.com/projects/search?page=1&term=' + search
subcat = 'https://www.kickstarter.com/discover/categories/34'
headers = ['URL','Title','Category','Start Date', 'End Date', 'Goal','Funding Amount','Backers','# of Rewards','Reward Price','# Claimed']

search_results = [first_url]
to_crawl = []
crawled = []

In [8]:
#======================
# addUrl
# Check if URL is in either list
# Add an individual URL to specified list
#======================
def addUrl(url,in_list):
    if url not in search_results and url not in to_crawl and url not in crawled:
        in_list.append(url)

In [9]:
#======================
# soupify
# Turns an html page into soup for further manipulation
#======================
def soupify(url):
    doc = urllib3.PoolManager().request('GET',url)
    return bs4.BeautifulSoup(doc.data)

In [11]:
#======================
# numify
# Remove letters and characters from string and turn into integer
#======================
def numify(in_str):
    newNum = int(re.sub('[^0-9]', '', in_str))
    return newNum


In [12]:
#======================
# addSearchPages
# Run this once first at the beginning of search
# Start with first URL and add all the other pages from the search result
#======================
def addSearchPages(url):
    '''
    Takes in a URL. 
    Generates list of pages that needs to be crawled based on last page number.
    Returns nothing
    '''
    soup = soupify(url)
    lastPage = soup.find('div',{'class':'pagination'}).find_all('a')[-2].string
    for i in range(2,int(lastPage) + 1):
        newUrl = BASE_URL + "/projects/search?page=" + str(i) + "&term=" + search
        addUrl(newUrl, search_results)

In [13]:
#======================
# addProjectPages
# Check if URL is in either list
# Add an individual URL
#======================
def addProjectPages(url):
    '''
    Takes in one URL from search_results from the list
    Appends project page URLs to to_crawl
    Returns nothing
    '''
    soup = soupify(url)
    for projects in soup.find_all('h2',{'class':'bbcard_name'}):
        for link in projects.find_all('a'):
            newUrl = BASE_URL + link.get('href')[0:-11]
            addUrl(newUrl, to_crawl)

In [14]:
#======================
# extractData
# Identify key parts of each project
# Title, category, start date, end date, goal, funded or not, # of rewards, reward level, # of backers
#======================
def extractData(url):
    '''
    Takes in URL
    Returns title, category, goal, funding status, total backers, # or rewards, reward level, # of backers
    '''
    soup = soupify(url)
    title = soup.head.title.string[0:-14]
    title = str(re.sub(r'[^\x00-\x7f]', '', title))
    category = str(soup.find('li',{'class':'category'}).find('a').contents[1][1:-1])
    start = soup.find_all('time')[-2].string
    end = soup.find_all('time')[-1].string
    goal = float(soup.find(id='pledged')['data-goal'])
    funded = float(soup.find(id='pledged').find('data')['data-value'])
    backers = numify(soup.find(id='backers_count').find('data').contents[0])
    rewards = soup.find(id='what-you-get').find_all('li')
    reward_price = []
    reward_back = []
    for reward in rewards:
        reward_price.append(numify(reward.h5.span.string))
        reward_back.append(int(reward.find('span',{'class':'num-backers'}).string[1:].split(' ')[0]))
    return title, category, start, end, goal, funded, backers, len(rewards), reward_price, reward_back

In [15]:
#======================
# writeCSV
# Create new CSV file including
# Write URL, and all info from extractData(url)
#======================
def writeCSV(crawl_list,time,errors):
    '''
    Takes in a list of URLs to crawl and the time to create unique filename
    Writes important data to CSV
    Returns nothing
    '''
    newT = str(numify(str(time)))
    newF = open(search+'_'+newT+'.csv', 'w', newline='')
    writer = csv.writer(newF, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(headers)
    while len(crawl_list) > 0:
        project = crawl_list.pop()
        print(project)
        try:
            t, c, s,e, g, f, b, r, rp, rb = extractData(project)
            row = [project, t, c, s, e, g, f, b, r]
            for i in range(len(rp)):
                row.append(rp[i])
                row.append(rb[i])
            writer.writerow(row)
        except Exception as e:
            errors += 1
            row = [project, e]
            writer.writerow(row)
            print(e)
        crawled.append(project)
    newF.close()
    return

In [16]:
#======================
# startCrawl
# Start the crawl process
#======================
def startCrawl():
    errors = 0
    startTime = datetime.datetime.now()

    addSearchPages(search_results[0])

    while len(search_results) > 0:
        current = search_results.pop()
        print(current)
        try:
            addProjectPages(current)
        except Exception as e:
            errors += 1
            print(e)
            continue
        crawled.append(current)
    projects = len(to_crawl)

    writeCSV(to_crawl,startTime,errors)

    # try:
    #     writeCSV(to_crawl,startTime)
    # except Exception as e:
    #     errors += 1
    #     print(e)

    print('Number of crawled pages: ' + str(len(crawled)))
    print('Number of projects: ' + str(projects))
    print('Number of errors: ' + str(errors))
    print('Elapsed time: ' + str(datetime.datetime.now() - startTime))

In [17]:
startCrawl()



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


AttributeError: 'NoneType' object has no attribute 'find_all'

In [3]:
import json
import requests
import datetime

#data outputs to a CSV file in the current directory
csv_output = open("top-funded-sample.csv", "w")

end_page = 77;

#scan through pages 1 to end_page for data, 20 results per page
for page in range(1,end_page+1):
    r = requests.get('https://www.kickstarter.com/discover/advanced?ref=discovery_overlay.json' + str(page))
    #r = requests.get('https://www.kickstarter.com/discover/advanced.json?category_id=0&woe_id=0&sort=most_funded&page=' + str(page))
    data = r.json()
    for index in range(len(data["projects"])):
        #print "%s,%f,%s,%f" % (data["projects"][index]["name"], data["projects"][index]["goal"], data["projects"][index]["currency"], data["projects"][index]["pledged"])
        csv_output.write("\"%s\",%s,%.0f,%s,%.2f,%d,%s,%s,%s\n" % (data["projects"][index]["name"].encode('ascii', 'ignore'), 
            data["projects"][index]["category"]["slug"].split("/")[0],
            data["projects"][index]["goal"], 
            data["projects"][index]["currency"], 
            data["projects"][index]["pledged"],
            data["projects"][index]["backers_count"],
            str(datetime.datetime.fromtimestamp(data["projects"][index]["created_at"])),
            str(datetime.datetime.fromtimestamp(data["projects"][index]["launched_at"])),
            str(datetime.datetime.fromtimestamp(data["projects"][index]["deadline"]))))

csv_output.close()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)