Permalink
Browse files

added all states and ages

  • Loading branch information...
wesleyzhao committed Sep 15, 2011
1 parent 7af7d46 commit c59ca6e72fbf2bdc1bb79b369947920b24bb1e9c
Showing with 32 additions and 20 deletions.
  1. +10 −1 README
  2. +22 −19 eteamz_scraper.py
View
11 README
@@ -1 +1,10 @@
-GOAL: get as many eteamz sites as possible
+GOAL: get as many eteamz sites as possible
+
+2,952 entries with:
+ state: Washington
+ gender: Male,Female,Coed
+
+10,320 entries with:
+ state: Washington
+ gender: Male,Female,Coed
+ ages: ALL OF THEM
View
@@ -4,7 +4,7 @@
import MySQLdb
def get_cursor():
    """Open a new MySQL connection to the ``eteamz`` database.

    The account name and password are read from the ``ETEAMZ_DB_USER`` and
    ``ETEAMZ_DB_PASSWD`` environment variables rather than being written
    into the source file.
    NOTE(review): these variable names are introduced here; confirm they
    match however the deployment provides the credentials.

    Returns:
        dict: ``{'cursor': cursor, 'conn': conn}`` -- a fresh cursor and the
        connection it belongs to.

    Raises:
        KeyError: if either environment variable is not set.
    """
    # Local import: only part of the file's import block is visible here.
    import os

    conn = MySQLdb.connect(
        host='50.16.213.18',
        user=os.environ['ETEAMZ_DB_USER'],
        passwd=os.environ['ETEAMZ_DB_PASSWD'],
        db='eteamz'
    )
    cursor = conn.cursor()
    return {'cursor': cursor, 'conn': conn}
@@ -30,39 +30,42 @@ def make_url(page, search_box = "", city = "", state = "", sport = "", skill = "
params_url = urllib.urlencode(params)
return base_url + "?" + params_url
def insert_links(links, state, gender, age):
    """Insert one row per link into the ``teams`` table.

    Every row stores the link in ``url`` together with the same ``state``,
    ``gender`` and ``age`` values.

    Args:
        links: iterable of URL strings to store.
        state: value written to the ``state`` column of every row.
        gender: value written to the ``gender`` column of every row.
        age: value written to the ``age`` column of every row.

    Returns:
        The cursor's ``rowcount`` after the insert, or 0 when ``links`` is
        empty (no connection is opened in that case).
    """
    rows = [(link, state, gender, age) for link in links]
    if not rows:
        # An INSERT with an empty VALUES list is not valid SQL.
        return 0

    mysql = get_cursor()
    conn = mysql['conn']
    cur = mysql['cursor']
    try:
        # Placeholders let the driver escape each value, so a quote inside a
        # scraped URL can neither break nor alter the statement.
        cur.executemany(
            "INSERT INTO teams (url, state, gender, age) "
            "VALUES (%s, %s, %s, %s)",
            rows)
        conn.commit()
        return cur.rowcount
    finally:
        # get_cursor() opens a new connection on every call; release it.
        conn.close()
def mark_stop(page_num, state, gender, age):
    """Record in the ``stops`` table the page number a crawl stopped at.

    Args:
        page_num: page number written to the ``page_number`` column.
        state: value written to the ``state`` column.
        gender: value written to the ``gender`` column.
        age: value written to the ``age`` column.

    Returns:
        The cursor's ``rowcount`` after the insert.
    """
    mysql = get_cursor()
    conn = mysql['conn']
    cur = mysql['cursor']
    try:
        # Parameterized so the driver handles quoting of every value.
        cur.execute(
            "INSERT INTO stops (page_number, state, gender, age) "
            "VALUES (%s, %s, %s, %s)",
            (page_num, state, gender, age))
        conn.commit()
        return cur.rowcount
    finally:
        # get_cursor() opens a new connection on every call; release it.
        conn.close()
def main():
    """Crawl every state / gender / age combination and store the links.

    For each combination, result pages are fetched starting at page 1.
    Links found on a page are stored with insert_links(). A page with fewer
    than 10 links ends that combination, and the last page fetched is
    recorded with mark_stop().
    """
    # Washington is left out: it was already done in an earlier run.
    states = [
        'Alabama', 'Alaska', 'Alberta', 'Arizona', 'Arkansas',
        'British Columbia', 'California', 'Colorado', 'Connecticut',
        'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii',
        'Idaho', 'Illinois', 'Indiana', 'International', 'Iowa', 'Kansas',
        'Kentucky', 'Louisiana', 'Maine', 'Manitoba', 'Maryland',
        'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
        'Montana', 'Nebraska', 'Nevada', 'New Brunswick', 'New Hampshire',
        'New Jersey', 'New Mexico', 'New York', 'Newfoundland',
        'North Carolina', 'North Dakota', 'Northwest Territories',
        'Nova Scotia', 'Ohio', 'Oklahoma', 'Ontario', 'Oregon',
        'Pennsylvania', 'Prince Edward Island', 'Puerto Rico', 'Quebec',
        'Rhode Island', 'Saskatchewan', 'South Carolina', 'South Dakota',
        'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'West Virginia',
        'Wisconsin', 'Wyoming', 'Yukon',
    ]
    genders = ['Female', 'Male', 'Coed']
    ages = [
        'All Ages', 'Adult', 'Freshman', 'Jr Varsity', 'Senior', 'Sophomore',
        'Varsity', 'Youth', 'Youth 10u', 'Youth 11u', 'Youth 12u',
        'Youth 13u', 'Youth 14u', 'Youth 15u', 'Youth 16u', 'Youth 17u',
        'Youth 18u', 'Youth 6u', 'Youth 7u', 'Youth 8u', 'Youth 9u',
    ]

    for state in states:
        for gender in genders:
            for age in ages:
                current_page = 0
                while True:
                    current_page += 1
                    url = make_url(page=current_page, gender=gender,
                                   state=state, age=age)
                    links = get_links(url)
                    found = len(links)
                    if links:
                        insert_links(links, state, gender, age)
                        print('inserted -- url: %s, gender: %s, # of links %s, age: %s' % (url, gender, str(found), age))
                    if found < 10:
                        # Fewer than 10 links: stop paging this combination.
                        break
                mark_stop(current_page, state, gender, age)
                print('Stopped at page # %s, state: %s, gender: %s, age: %s' % (str(current_page), state, gender, age))

0 comments on commit c59ca6e

Please sign in to comment.