In [4]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

def _fit_column(values, length):
    """Return *values* truncated or NaN-padded to exactly *length* items."""
    values = values[:length]
    return values + [np.nan] * (length - len(values))


def parse_url(url="http://www.datatau.com", data=False, expected_length=30,
              timeout=30):
    """Scrape a listing page and every page reachable through its "More" link.

    Args:
        url: Address of the first listing page to fetch.
        data: Optional DataFrame of previously scraped rows. Any bool (the
            default ``False``) or ``None`` means "no previous data".
        expected_length: Maximum number of story rows kept per page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame with the columns ``titles``, ``links``, ``points``,
        ``domains``, ``authors`` and ``comments``, plus the ``index`` column
        produced by ``reset_index()`` (each row's position within its page).
        Pages are stacked most-recently-fetched first, followed by ``data``
        when it was supplied.
    """
    # One XPath per output column; insertion order is the column order.
    column_queries = {
        "titles": "//td[@class='title']/a/text()",
        "links": "//td[@class='title']/a/@href",
        "points": "//td[@class='subtext']/span/text()",
        "domains": "//td[@class='title']/span/text()",
        "authors": "//td[@class='subtext']/a[contains(@href, 'user')]/text()",
        "comments": "//td[@class='subtext']/a[contains(@href, 'item')]/text()",
    }

    pages = []
    visited = {url}

    # Iterate instead of recursing so a long chain of "More" links cannot
    # exhaust the interpreter's recursion limit.
    while True:
        response = requests.get(url, timeout=timeout)
        # Parse the document once and reuse it for every query.
        page = Selector(text=response.text)

        extracted = {
            name: page.xpath(query).extract()
            for name, query in column_queries.items()
        }

        # The title anchors also match the trailing "More" link, hence the
        # cap. A short (e.g. final) page yields fewer rows instead of a
        # length mismatch between the columns.
        n_rows = min(
            expected_length,
            max(len(extracted["titles"]), len(extracted["links"])),
        )

        # NOTE(review): missing values are filled in at the END of a column,
        # so a story that lacks e.g. points in the middle of a page shifts the
        # values below it up by one row -- confirm whether that matters.
        pages.append(pd.DataFrame({
            name: _fit_column(values, n_rows)
            for name, values in extracted.items()
        }))

        more_anchor = page.xpath("//a[text() = 'More']/@href").extract()
        if not more_anchor:
            break

        # Resolve against the current page so relative and absolute hrefs
        # both work, whatever site ``url`` points at.
        more_url = urljoin(url, more_anchor[0])
        if more_url in visited:
            # A "More" link that leads back to a seen page would loop forever.
            break
        visited.add(more_url)

        print("Fetching %s..." % more_url)
        url = more_url

    # Keep the historical ordering: newest page first, then the caller's data.
    frames = pages[::-1]
    if data is not None and not isinstance(data, bool):
        frames.append(data)

    # A single concat replaces the removed DataFrame.append and avoids
    # re-copying the accumulated rows once per page.
    return pd.concat(frames).reset_index()
       
        
# Crawl the listing starting at the front page and keep the combined table.
df = parse_url(url="http://www.datatau.com")
# Bare expression so the notebook renders the DataFrame as the cell result.
df

Fetching http://www.datatau.com/x?fnid=QgU5dGpWWc...
Fetching http://www.datatau.com/x?fnid=rWM204lRWL...
Fetching http://www.datatau.com/x?fnid=PYitJ4tluQ...
Fetching http://www.datatau.com/x?fnid=1BM6qJ9FrL...
Fetching http://www.datatau.com/x?fnid=AdN0BlqqsS...
Fetching http://www.datatau.com/x?fnid=Dm76tPUUtZ...


Unnamed: 0,index,authors,comments,domains,links,points,titles
0,0,shashankg22,discuss,(paralleldots.com),https://blog.paralleldots.com/product/webinar-...,4 points,Webinar on Survey Analysis using Machine Learning
1,1,iheartai,discuss,(aiworkbox.com),https://www.aiworkbox.com/lessons/create-tenso...,3 points,Create TensorFlow Name Scopes For TensorBoard
2,2,m_ly,discuss,(medium.com),https://medium.com/inside-machine-learning/ai-...,3 points,AI and Data Science trends (ahead of DataWorks...
3,3,tagmobile56007,discuss,(tagmobile.com),https://www.tagmobile.com/blog/how-to-get-a-fr...,3 points,How to get a free Lifeline Phone service in Maine
4,4,tbugaevskii,discuss,(medium.com),https://medium.com/activewizards-machine-learn...,25 points,Hadoop 3: Comparison with Hadoop 2 and Spark
5,5,hiimtomi,discuss,(data36.com),https://data36.com/python-import-built-in-modu...,16 points,Python Import Statement & Built-in Modules for...
6,6,stijntonk,discuss,(godatadriven.com),https://blog.godatadriven.com/rod-elitist-shuffle,15 points,Elitist shuffle for recommendation systems
7,7,andrewxhill,discuss,(hackernoon.com),https://hackernoon.com/a-short-trip-to-jupyter...,3 points,"Getting Started with IPFS, Python, and Jupyter..."
8,8,ddinterview,discuss,(interviewqs.com),https://www.interviewqs.com/blog/python_sheets,5 points,Reading and writing to Google Spreadsheets usi...
9,9,vimarshk,discuss,(medium.com),https://medium.com/acing-ai/in-conversation-wi...,3 points,"An Interview with Jesse Steinweg-Woods — Ph.D,..."
