Commit
added socket_basic
utsavsabharwal committed Apr 10, 2012
1 parent 9d43a1b commit b493ff5
Showing 14 changed files with 357 additions and 1 deletion.
Empty file added penelope/111.uss
Empty file.
Binary file added penelope/config.pyc
Binary file not shown.
3 changes: 2 additions & 1 deletion penelope/crawler.py
@@ -10,7 +10,8 @@
 invalid_domains=[]
 thread_started=[]
 #urls=["https://www.acuprice.com/Products/Overview/?id=M005926853:::247:::247"]
-urls=open("urls").readlines()
+urls=["com.singlefeed.reporting\t111\thttp://reporting.singlefeed.com/r/?cvsfa=2586&cvsfe=6&cvsfhu=343131303836&cvsfurl=http%3A%2F%2Fwww.gandermountain.com%2Fmodperl%2Ftrack.cgi%3FGSHOP_411086%2B%2Fmodperl%2Fproduct%2Fdetails.cgi%3Fpdesc%3DSlumberjack_Big_Cot%26i%3D411086%26r%3Dview%26aID%3D505B3\t123"]*200
+#urls=open("urls").readlines()
 #urls=urls[:10000]
 class Crawler(Thread):
     def __init__(self, id):
117 changes: 117 additions & 0 deletions penelope/crawler.py~
@@ -0,0 +1,117 @@
import pycurl
import thread
import time
import config as cc
from threading import Thread

success=[]          #list of uids with urls for which we need to set last_fetch=now() and next_fetch="2012-12-12"
failure=[]          #list of uids with urls for which we need to set last_fetch=now() and is_disabled=1
update=[]           #list of uids and final urls separated by :::
insert=[]           #list of urls with product id to be inserted
invalid_domains=[]  #domains that failed to resolve; later records for them are skipped
thread_started=[]   #crude counter of crawls currently in flight

#urls=["https://www.acuprice.com/Products/Overview/?id=M005926853:::247:::247"]
urls=["com.singlefeed.reporting\t111\thttp://reporting.singlefeed.com/r/?cvsfa=2586&cvsfe=6&cvsfhu=343131303836&cvsfurl=http%3A%2F%2Fwww.gandermountain.com%2Fmodperl%2Ftrack.cgi%3FGSHOP_411086%2B%2Fmodperl%2Fproduct%2Fdetails.cgi%3Fpdesc%3DSlumberjack_Big_Cot%26i%3D411086%26r%3Dview%26aID%3D505B3\t123"]
#urls=open("urls").readlines()
#urls=urls[:10000]

class Crawler(Thread):
    def __init__(self, id):
        #print id
        Thread.__init__(self)

    def run(self):
        #invoked by Thread.start(); performs a single fetch
        self.crawl()

    def crawl(self):
        try:
            thread_started.append("ok")
            filename = None
            try:
                #required record format: domain \t filename(int) \t url \t product_id(int)
                domain, filename, url, product_id = urls.pop().split("\t")
                domain = domain.strip()
                if domain not in invalid_domains:
                    filename = int(filename.strip())
                    url = str(url.strip())
                    product_id = int(product_id.strip())
                    fname = str(filename) + ".uss"
                    c = pycurl.Curl()
                    c.fp = open(fname, "a+")
                    c.setopt(pycurl.FOLLOWLOCATION, 1)
                    c.setopt(pycurl.MAXREDIRS, 5)
                    c.setopt(pycurl.CONNECTTIMEOUT, 30)
                    c.setopt(pycurl.TIMEOUT, 300)
                    c.setopt(pycurl.NOSIGNAL, 1)
                    c.setopt(pycurl.URL, url)
                    c.setopt(pycurl.WRITEDATA, c.fp)
                    c.perform()
                    original_url = str(url)
                    final_url = str(c.getinfo(pycurl.EFFECTIVE_URL))
                    c.fp.close()
                    if not c.errstr():
                        if original_url == final_url:
                            success.append(str(filename))
                        else:
                            #the fetch was redirected: record the final url and re-queue the original
                            update.append(str(filename) + ":::" + final_url)
                            insert.append(str(original_url) + ":::" + str(product_id))
                    else:
                        response_code = str(c.getinfo(pycurl.HTTP_CODE))
                        pattern = str(filename) + ":::" + response_code
                        print "failure", pattern
                        failure.append(pattern)
                else:
                    failure.append(str(filename))

            except Exception, ex:
                if filename is not None:
                    failure.append(str(filename))
                #pycurl.error args are (curl_errno, message); errno 6 means the host could not be resolved
                if isinstance(ex, pycurl.error) and ex[0] == 6:
                    invalid_domains.append(domain)
                print "Error:", ex

            try:
                thread_started.pop()
            except Exception, ex:
                print "Error:", ex
        except Exception, ex:
            print ex

def run(pid, *args):
    print "Core thread", pid
    while True:
        t = Crawler(cc.id)
        t.start()
        t.join()
        cc.id += 1

x = 0
while x < 100:
    th = "thread no:" + str(x)
    thread.start_new_thread(run, (th, 2))
    x += 1

#wait for the worker threads to drain most of the queue before flushing the logs
while len(urls) > 1000:
    time.sleep(10)
print "Got out of the loop", len(urls)

s = open("success.log", "w+")
f = open("failure.log", "w+")
i = open("insert.log", "w+")
u = open("update.log", "a+")
invalid = open("invalid_domains.log", "a+")

while len(success) > 0:
    s.write(success.pop() + chr(10))
s.close()
while len(failure) > 0:
    f.write(failure.pop() + chr(10))
f.close()
while len(insert) > 0:
    i.write(insert.pop() + chr(10))
i.close()
while len(update) > 0:
    u.write(update.pop() + chr(10))
u.close()
while len(invalid_domains) > 0:
    invalid.write(invalid_domains.pop() + chr(10))
invalid.close()
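
The except block above blacklists a whole domain when curl reports error code 6, i.e. the host name could not be resolved. A minimal sketch of how that error surfaces from pycurl, using a deliberately unresolvable hostname purely for illustration:

import pycurl

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://no-such-host.invalid/")   #hypothetical unresolvable host
c.setopt(pycurl.CONNECTTIMEOUT, 10)
c.setopt(pycurl.WRITEFUNCTION, lambda data: None)      #discard any body
try:
    c.perform()
except pycurl.error, ex:
    #pycurl.error args are (curl_errno, message); 6 == pycurl.E_COULDNT_RESOLVE_HOST
    if ex[0] == pycurl.E_COULDNT_RESOLVE_HOST:
        print "would append the domain to invalid_domains:", ex[1]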
Empty file added penelope/failure.log
Empty file.
Empty file added penelope/insert.log
Empty file.
Empty file added penelope/invalid_domains.log
Empty file.
Empty file added penelope/output
Empty file.
Empty file added penelope/success.log
Empty file.
Empty file added penelope/update.log
Empty file.
117 changes: 117 additions & 0 deletions penelope/urls

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions penelope/urls~
@@ -0,0 +1 @@
com.singlefeed.reporting\t111\thttp://reporting.singlefeed.com/r/?cvsfa=2586&cvsfe=6&cvsfhu=343131303836&cvsfurl=http%3A%2F%2Fwww.gandermountain.com%2Fmodperl%2Ftrack.cgi%3FGSHOP_411086%2B%2Fmodperl%2Fproduct%2Fdetails.cgi%3Fpdesc%3DSlumberjack_Big_Cot%26i%3D411086%26r%3Dview%26aID%3D505B3\t123
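
Each record in the urls file carries four tab-separated fields (domain key, output-file id, url, product id), which crawler.py unpacks in a single split. A minimal sketch, with the url shortened to keep this hypothetical record readable:

#fields: domain \t filename \t url \t product_id
record = "com.singlefeed.reporting\t111\thttp://reporting.singlefeed.com/r/?cvsfa=2586\t123"
domain, filename, url, product_id = record.split("\t")
print domain, int(filename), url, int(product_id)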
60 changes: 60 additions & 0 deletions sock/crawler.py
@@ -0,0 +1,60 @@
import socket
from urlparse import urlparse

class abc:
    def __init__(self):
        urls = ['http://docs.python.org/release/3.1.3/library/urllib.parse.html']
        self.run(urls)

    def create_socket(self):
        return socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    def remove_socket(self, sock):
        sock.close()
        del sock

    def run(self, urls):
        for url in urls:
            scheme, self.host, path, params, query, fragment = urlparse(url)
            #everything after the host is used as the request target
            self.uri = url[url.find(self.host)+len(self.host):]
            self.send_request(self.create_socket())

    def send_request(self, sock):
        l = ''
        print self.uri, self.host
        line1 = "GET %s HTTP/1.1" % (self.uri)
        line2 = "Host: %s" % (self.host)
        line3 = "Connection: close"
        for line in (line1, line2, line3):
            print "--", line
            l += (line + "\r\n")
        sock.connect((self.host, 80))   #the socket must be connected before send()
        sock.send(l)
        sock.send("\r\n")               #blank line terminates the request headers

a = abc()


'''
sock = create_socket()
print "Connecting"
sock.connect( ('en.wikipedia.org', 80) )
print "Sending Request"
import socket
sock = socket.socket()
sock.connect(('en.wikipedia.org', 80))
for line in (
"GET /wiki/List_of_HTTP_header_fields HTTP/1.1",
"Host: en.wikipedia.org",
"Connection: close",
):
sock.send(line + "\r\n")
sock.send("\r\n")
while True:
content = sock.recv(1024)
if content:
print content
else:
break'''
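
Because the request sends Connection: close, the server closes the socket once the response is complete, so a recv loop like the commented-out one above ends on an empty read. A self-contained sketch of the full exchange against an assumed example host, splitting the status line off the raw response:

import socket

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect(('example.com', 80))               #assumed host, plain HTTP on port 80
sock.send("GET / HTTP/1.1\r\nHost: example.com\r\nConnection: close\r\n\r\n")
chunks = []
while True:
    data = sock.recv(1024)
    if not data:                                #empty read: the server closed the connection
        break
    chunks.append(data)
sock.close()
response = "".join(chunks)
print response.split("\r\n", 1)[0]              #status line, e.g. "HTTP/1.1 200 OK"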
60 changes: 60 additions & 0 deletions sock/crawler.py~
@@ -0,0 +1,60 @@
import socket
from urlparse import urlparse

class abc:
    def __init__(self):
        urls = ['http://docs.python.org/release/3.1.3/library/urllib.parse.html']
        self.run(urls)

    def create_socket(self):
        return socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    def remove_socket(self, sock):
        sock.close()
        del sock

    def run(self, urls):
        for url in urls:
            scheme, self.host, path, params, query, fragment = urlparse(url)
            #everything after the host is used as the request target
            self.uri = url[url.find(self.host)+len(self.host):]
            self.send_request(self.create_socket())

    def send_request(self, sock):
        l = []
        print self.uri, self.host
        line1 = "GET %s HTTP/1.1" % (self.uri)
        line2 = "Host: %s" % (self.host)
        line3 = "Connection: close"
        for line in (line1, line2, line3):
            print "--", line
            l.append(line + "\r\n")
        sock.connect((self.host, 80))   #the socket must be connected before send()
        sock.send("".join(l))           #send() needs a string, not a list
        sock.send("\r\n")               #blank line terminates the request headers

a = abc()


'''
sock = create_socket()
print "Connecting"
sock.connect( ('en.wikipedia.org', 80) )
print "Sending Request"

import socket
sock = socket.socket()
sock.connect(('en.wikipedia.org', 80))

for line in (
"GET /wiki/List_of_HTTP_header_fields HTTP/1.1",
"Host: en.wikipedia.org",
"Connection: close",
):
sock.send(line + "\r\n")
sock.send("\r\n")

while True:
content = sock.recv(1024)
if content:
print content
else:
break'''
