# Imports

In [1]:
import scrapy
from scrapy.selector import HtmlXPathSelector
import csv
import requests
import re
import numpy as np
import operator

# Scraping Craigslist 

In [2]:
class CraigslistItem(scrapy.Item):
    """Scrapy item holding the fields scraped from one Craigslist posting."""
    # Text of the element whose id is "titletextonly" on the posting page
    # (set in CraigsSpider.parse_dir_contents).
    title = scrapy.Field()
    # Bold text of the first attribute span on the posting page; "comp" is
    # presumably short for compensation -- the later salary filtering looks
    # for "$" amounts in the exported data.
    comp = scrapy.Field()


In [3]:
class CraigsSpider(scrapy.Spider):
    """Spider that follows the posting links on a Craigslist search page.

    Starts from the SF Bay "npo" search listing, requests every linked
    posting and emits one CraigslistItem per posting.
    """

    name = "craigsBase"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    def parse(self, response):
        """Yield a request for every posting link found on the search page."""
        link_selectors = response.css("span.pl > a::attr('href')")
        for link in link_selectors:
            # Links may be relative, so resolve them against the page URL.
            posting_url = response.urljoin(link.extract())
            print(posting_url)
            yield scrapy.Request(posting_url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield an item with the title and compensation text of a posting."""
        title_text = response.xpath('//*[@id="titletextonly"]/text()').extract()
        comp_text = response.xpath('//*[@id="pagecontainer"]/section/section/div[1]/p/span[1]/b/text()').extract()

        item = CraigslistItem()
        item['title'] = title_text
        item['comp'] = comp_text
        yield item


# Filtering out all the salaries

In [4]:
# Matches "$", optional digits, exactly ONE arbitrary separator character and
# one to four more digits. The loose separator is what lets a single pattern
# capture plain amounts ("$404"), decimals ("$22.94"), thousands ("$5,600")
# and ranges ("$18-20").
SALARY_PATTERN = re.compile(r'\$\d*.\d{1,4}')

salaries = []        # raw amount strings pulled out of the CSV
clean_salaries = []  # the same amounts converted to floats
with open("craigfile.csv") as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        # csv.reader returns an empty list for blank lines; only the first
        # column is searched for dollar amounts.
        if not row or "$" not in row[0]:
            continue
        for token in SALARY_PATTERN.findall(row[0]):
            # A range such as "$18-20" contributes both endpoints; a token
            # without "-" splits into just itself.
            salaries.extend(token.split("-"))

# The file is no longer needed here, so the conversion runs after it is closed.
for raw_amount in salaries:
    try:
        clean_salaries.append(float(raw_amount.strip('$').replace(',', '')))
    except ValueError:
        # The wildcard separator can capture text that is not a number
        # (e.g. "$15 40" or a bare "$"); such tokens are skipped.
        continue
print(clean_salaries)

[404.0, 552.0, 18.0, 20.0, 50.0, 15.0, 22.94, 85.0, 16.0, 18.0, 13.0, 17.0, 5600.0, 8300.0, 21.0, 50.0, 55.0, 400.0, 650.0, 15000.0, 5730.0, 15000.0, 5730.0, 11.0, 14.0, 450.0, 600.0, 16.72, 13.0, 14.0, 13.5, 13.5, 15.07, 15.0, 12.0, 13.5, 24.04, 12.0, 13.5, 18.0, 19.0, 12.0, 13.0, 17.0, 11.74, 11.74, 11.74, 11.74, 12.0, 20.0, 75000.0, 14.5, 22.5, 30.0, 20.0, 55.0, 58.0, 24.0, 15.0, 17.0, 10.5, 15.0, 15.0, 17.0, 13.0, 17.0, 5600.0, 8300.0, 1500.0, 2000.0, 19.23, 17.0, 20.0, 42.0, 19.23, 404.0, 552.0, 404.0, 552.0]


# Dividing up the salaries based on amount

In [7]:
# Amounts strictly above 6000 are treated as annual salaries.
annual = [amount for amount in clean_salaries if amount > 6000]
print(annual)

[8300.0, 15000.0, 15000.0, 75000.0, 8300.0]


In [8]:
# Amounts strictly between 500 and 6000 are treated as monthly salaries.
# Values between 500 and 1000 also pass the weekly filter.
month = [amount for amount in clean_salaries if 500 < amount < 6000]
print(month)

[552.0, 5600.0, 650.0, 5730.0, 5730.0, 600.0, 5600.0, 1500.0, 2000.0, 552.0, 552.0]


In [9]:
# Amounts strictly between 50 and 1000 are treated as weekly salaries.
week = [amount for amount in clean_salaries if 50 < amount < 1000]
print(week)

[404.0, 552.0, 85.0, 55.0, 400.0, 650.0, 450.0, 600.0, 55.0, 58.0, 404.0, 552.0, 404.0, 552.0]


In [10]:
# Amounts strictly below 50 are treated as hourly wages (exactly 50 is
# excluded here and by the weekly filter).
hour = [amount for amount in clean_salaries if amount < 50]
print(hour)

[18.0, 20.0, 15.0, 22.94, 16.0, 18.0, 13.0, 17.0, 21.0, 11.0, 14.0, 16.72, 13.0, 14.0, 13.5, 13.5, 15.07, 15.0, 12.0, 13.5, 24.04, 12.0, 13.5, 18.0, 19.0, 12.0, 13.0, 17.0, 11.74, 11.74, 11.74, 11.74, 12.0, 20.0, 14.5, 22.5, 30.0, 20.0, 24.0, 15.0, 17.0, 10.5, 15.0, 15.0, 17.0, 13.0, 17.0, 19.23, 17.0, 20.0, 42.0, 19.23]


# Average annual salary

In [11]:
# Arithmetic mean of the amounts classified as annual salaries.
print(np.mean(annual))

24320.0


In [12]:
# Show the full list of parsed amounts again for reference.
print(clean_salaries)

[404.0, 552.0, 18.0, 20.0, 50.0, 15.0, 22.94, 85.0, 16.0, 18.0, 13.0, 17.0, 5600.0, 8300.0, 21.0, 50.0, 55.0, 400.0, 650.0, 15000.0, 5730.0, 15000.0, 5730.0, 11.0, 14.0, 450.0, 600.0, 16.72, 13.0, 14.0, 13.5, 13.5, 15.07, 15.0, 12.0, 13.5, 24.04, 12.0, 13.5, 18.0, 19.0, 12.0, 13.0, 17.0, 11.74, 11.74, 11.74, 11.74, 12.0, 20.0, 75000.0, 14.5, 22.5, 30.0, 20.0, 55.0, 58.0, 24.0, 15.0, 17.0, 10.5, 15.0, 15.0, 17.0, 13.0, 17.0, 5600.0, 8300.0, 1500.0, 2000.0, 19.23, 17.0, 20.0, 42.0, 19.23, 404.0, 552.0, 404.0, 552.0]


# Average monthly salary

In [13]:
# Arithmetic mean of the amounts classified as monthly salaries.
print(np.mean(month))

2642.36363636


# Average weekly salary

In [14]:
# Arithmetic mean of the amounts classified as weekly salaries.
print(np.mean(week))

372.928571429


# Average hourly salary

In [15]:
# Arithmetic mean of the amounts classified as hourly wages.
print(np.mean(hour))

16.6863461538


In [16]:
# Show the full list of parsed amounts once more before the alternative approach.
print(clean_salaries)

[404.0, 552.0, 18.0, 20.0, 50.0, 15.0, 22.94, 85.0, 16.0, 18.0, 13.0, 17.0, 5600.0, 8300.0, 21.0, 50.0, 55.0, 400.0, 650.0, 15000.0, 5730.0, 15000.0, 5730.0, 11.0, 14.0, 450.0, 600.0, 16.72, 13.0, 14.0, 13.5, 13.5, 15.07, 15.0, 12.0, 13.5, 24.04, 12.0, 13.5, 18.0, 19.0, 12.0, 13.0, 17.0, 11.74, 11.74, 11.74, 11.74, 12.0, 20.0, 75000.0, 14.5, 22.5, 30.0, 20.0, 55.0, 58.0, 24.0, 15.0, 17.0, 10.5, 15.0, 15.0, 17.0, 13.0, 17.0, 5600.0, 8300.0, 1500.0, 2000.0, 19.23, 17.0, 20.0, 42.0, 19.23, 404.0, 552.0, 404.0, 552.0]


# Alternative

In [341]:
def divide_salaries(array):
    """Group salary amounts into pay-period buckets by magnitude.

    Each amount lands in exactly one bucket:

    * ``'hour'``   -- below 50
    * ``'week'``   -- from 50 up to (not including) 500
    * ``'month'``  -- from 500 up to and including 5000
    * ``'annual'`` -- above 5000

    Args:
        array: iterable of numeric salary amounts.

    Returns:
        A dict with the keys ``'annual'``, ``'month'``, ``'week'`` and
        ``'hour'``, each mapping to a list of the amounts in that bucket in
        their original order. Buckets with no amounts are empty lists.
    """
    # Fresh lists per call: the result must not depend on, or append to, any
    # module-level lists that happen to share these names.
    salaries = {'annual': [], 'month': [], 'week': [], 'hour': []}

    for amount in array:
        if amount > 5000:
            salaries['annual'].append(amount)
        elif amount < 50:
            salaries['hour'].append(amount)
        elif amount < 500:
            # Covers 50 <= amount < 500, so exactly 50 is not pushed into the
            # monthly bucket.
            salaries['week'].append(amount)
        else:
            salaries['month'].append(amount)
    return salaries