# Scrape Data from Kickstarter Website

This script currently visits the "Projects We Love" page of the Kickstarter website, scrapes every listed project (the current URL filters limit the total to 21 projects), and writes the scraped variables to a CSV file (`beautifulsoups_info370_preliminary_rawData.csv`).

In [107]:
import requests 
from bs4 import BeautifulSoup
# import pandas as pd 
import json
import csv

#### main()
This is the main function; it calls the functions needed to scrape the Kickstarter projects (from the "Projects We Love" filtered page).

In [111]:
def main():
    """Scrape every configured Kickstarter listing into a single CSV file.

    Project URLs are gathered from all base listing URLs first and scraped in
    one pass: scrape_pages() rewrites the same output file on every call, so
    calling it once per base URL would keep only the last listing's rows.
    """
    # Each listing URL ends where get_pages() appends the page number.
    base_urls = ['https://www.kickstarter.com/discover/advanced?state=live&woe_id=0&staff_picks=1&raised=1&sort=end_date&seed=2568554&page=']
    project_urls = []
    for url in base_urls:
        project_urls.extend(get_pages(url))
    scrape_pages(project_urls)
    

#### get_pages()
@param base_url : the base url for the overview page that lists all projects of that category (to append page numbers to iterate through pages) <br>
@return page_urls : the list of all project urls from each page of the base url

In [112]:
def get_pages(base_url, timeout=30):
    """Collect the project URLs listed on every page of a listing.

    Pages are requested as base_url + "1", base_url + "2", ... and the crawl
    stops at the first page that contains no project cards.

    @param base_url : listing URL ending where the page number is appended
    @param timeout : seconds requests waits on the server for each page
                     before raising, instead of blocking indefinitely
    @return page_urls : list of project URLs in the order they were listed
    """
    card_class = 'js-react-proj-card col-full col-sm-12-24 col-lg-8-24'
    page_urls = []
    page = 1
    while True:
        r = requests.get(base_url + str(page), timeout=timeout)
        category_soup = BeautifulSoup(r.text, 'html.parser')
        results = category_soup.find_all('div', attrs={'class': card_class})
        if not results:
            # A page without project cards ends the crawl.
            break
        for result in results:
            # Each card stores its project metadata as JSON in data-project.
            data_project_json = json.loads(str(result['data-project']))
            page_urls.append(data_project_json['urls']['web']['project'])
        page += 1
    return page_urls

#### scrape_pages()
@param urls : list of all project urls to scrape from <br>
Writes all of the scraped data into a csv file

In [113]:
def scrape_pages(urls, timeout=30):
    """Scrape every project page and write all rows to the raw-data CSV.

    @param urls : list of project URLs to scrape
    @param timeout : seconds requests waits on the server for each page
                     before raising, instead of blocking indefinitely

    Each row is the list from get_attributes() followed by the update count,
    the number of reward levels, the reward levels themselves, and the
    backers for each reward level.
    """
    pages = []
    for url in urls:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        # get_attributes() reads the project JSON from this element's
        # data-initial attribute.
        top_portion = soup.find('div', attrs={'class': 'bg-grey-100'})
        attributes = get_attributes(top_portion, url)
        attributes.append(get_update_count(soup))
        reward_levels, backers_by_levels = get_reward_levels(soup)
        attributes.append(len(reward_levels))
        attributes.append(reward_levels)  # reward levels
        attributes.append(backers_by_levels)  # backers for each reward level
        pages.append(attributes)
    write_to_csv('beautifulsoups_info370_preliminary_rawData', pages)

#### get_attributes()
@param intro_soup : the BeautifulSoup object of the introductory portion of the project page <br>
@param url : url of the current page being scraped <br>
@return records : the list of all scraped variables for the current project page

In [118]:
def get_attributes(intro_soup, url):
    """Build the leading CSV values for one project page.

    @param intro_soup : element whose 'data-initial' attribute holds the
                        project's JSON description
    @param url : url of the project page being scraped
    @return records : list of 14 values read from the JSON's 'project'
                      entry, with "X" in the two positions that are not
                      scraped
    """
    project = json.loads(str(intro_soup['data-initial']))['project']
    not_scraped = "X"
    return [
        project['pid'],
        project['name'],
        url,
        not_scraped,  # parent category is not read from the JSON
        project['category']['name'],
        project['location']['displayableName'],
        project['state'],
        project['goal']['amount'],
        project['pledged']['amount'],
        project['percentFunded'],
        project['backersCount'],
        not_scraped,  # lines up with the 'funded date' CSV column
        project['commentsCount'],
        project['duration'],
    ]

#### write_to_csv()
@param file_name : the name of the csv file to store the scraped project data in <br>
@param values : the scraped values to be stored in the csv file <br>
Stores the given data into a csv file

In [119]:
def write_to_csv(file_name, values):
    """Write the header row followed by the scraped rows to file_name + '.csv'.

    @param file_name : output path without the '.csv' extension
    @param values : iterable of rows, each a sequence of column values

    An existing file with the same name is overwritten.
    """
    col_names = ['project id', 'name', 'url', 'category', 'subcategory', 'location', 'status', 'goal', 'pledged', 'funded percent', 'backers',
                 'funded date', 'comments', 'duration', 'updates', 'levels', 'reward levels', 'backers per reward level']
    # newline='' lets the csv module control line endings (otherwise Windows
    # output gains a blank line after every row); UTF-8 keeps non-ASCII
    # project names writable regardless of the platform's default encoding.
    # The with-block closes the file even if a row fails to serialize.
    with open(file_name + '.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(col_names)
        writer.writerows(values)

#### get_reward_levels()
@param soup : the BeautifulSoup object of the project page <br>
@return reward_levels : the list of pledge amounts, one per reward level <br>
@return backers_by_levels : the list of (pledge amount, number of backers) pairs, one per reward level

In [120]:
def get_reward_levels(soup):
    """Read the pledge amount and backer count of every reward level.

    Levels still available are listed first, followed by levels that are
    all gone.

    @param soup : the BeautifulSoup object of the project page
    @return (reward_levels, backers_by_levels) : reward_levels is the list
            of pledge-amount strings; backers_by_levels is the list of
            (pledge amount, backer count) string pairs in the same order
    """
    level_classes = (
        'hover-group js-reward-available pledge--available pledge-selectable-sidebar',
        'hover-group pledge--all-gone pledge-selectable-sidebar',
    )
    reward_levels = []
    backers_by_levels = []
    for level_class in level_classes:
        for level in soup.find_all('li', attrs={'class': level_class}):
            price = level.find('span', attrs={'class': 'money'}).text
            backer_text = level.find('span', attrs={'class': 'pledge__backer-count'}).text
            # Drop the label in both plural and singular form so that
            # "1 backer" yields "1" just as "12 backers" yields "12".
            backers = backer_text.replace('backers', '').replace('backer', '').strip()
            reward_levels.append(price)
            backers_by_levels.append((price, backers))
    return (reward_levels, backers_by_levels)

#### get_update_count()
@param soup : the BeautifulSoup object of the project page <br>
@return updates : the number of updates posted for the project, returned as the text scraped from the page

In [121]:
def get_update_count(soup):
    """Return the text of the counter inside the project's updates link.

    @param soup : the BeautifulSoup object of the project page
    @return : text of the first span inside the updates navigation link
    """
    updates_link_class = 'js-load-project-content js-load-project-updates mx3 project-nav__link--updates tabbed-nav__link type-14'
    updates_link = soup.find('a', attrs={'class': updates_link_class})
    counter = updates_link.find('span')
    return counter.text

In [122]:
# Start the scrape only when this code is executed directly, so importing
# the module does not trigger network requests or overwrite the CSV.
if __name__ == '__main__':
    main()