In [None]:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
import re
import json
from scrapy.http import Request, FormRequest
from collections import namedtuple
import re

class CricInfoScrapeSpider(scrapy.Spider):
    """Scrape first-class batting statistics from ESPNcricinfo player pages.

    The crawl runs in three stages:

    1. ``parse`` reads the country player index in ``start_urls`` and
       follows the links found in its A-Z index block.
    2. ``parse_player_names`` reads each listing page and follows the
       player profile links on it.
    3. ``parse_player_details`` reads one profile page and yields a single
       item describing that player.
    """

    name = 'cricinfo-spider'
    start_urls = ['http://www.espncricinfo.com/ci/content/player/country.html?country=6;alpha=A']
    # Column names of the batting table, in page order. Only ``_fields`` is
    # used by the spider; nothing is ever assigned onto this type.
    first_class_batting_stats = namedtuple("first_batting_stats", 'matches, inns, not_outs, runs, highest, average, balls_faced, strike_rate, hundreds, fifties, boundaries, sixes, catches_taken, stumpings_made')

    # Host prefixed to the site-relative hrefs extracted from the pages.
    _BASE_URL = "http://www.espncricinfo.com"
    # Biography line lookup: ``{}`` is the bold label preceding the value.
    _INFO_XPATH = '//p[@class="ciPlayerinformationtxt"]/b[contains(text(), "{}")]/following-sibling::span/text()'
    # Batting table lookup: ``{}`` is the 1-based ``td`` index within row 1.
    _STATS_CELL_XPATH = '//*[@id="ciHomeContentlhs"]/div[4]/table[1]/tbody/tr[1]/td[{}]/text()'
    # Stat fields whose key in the emitted item differs from the field name.
    _STAT_KEY_OVERRIDES = {"stumpings_made": "stumps"}
    _WHITESPACE = re.compile(r'\s+')

    @staticmethod
    def _first_text(sel, xpath):
        """Return the first match of *xpath* in *sel*, or ``""`` if none."""
        return sel.xpath(xpath).get(default="")

    def parse(self, response):
        """Follow the links in the A-Z index block of the country page.

        Yields one request per href found under ``div#ciPlayerbyCharAtoZ``,
        handled by ``parse_player_names``. The first and last hrefs are
        skipped. NOTE(review): presumably those are not letter pages - confirm.
        """
        sel = Selector(text=response.text, type="html")
        hrefs = sel.xpath(
            '//div[@id="ciPlayerbyCharAtoZ"]//ul//li//a/@href'
        ).getall()
        for href in hrefs[1:-1]:
            yield scrapy.Request(self._BASE_URL + href, self.parse_player_names)

    def parse_player_names(self, response):
        """Follow player profile links found on a listing page.

        Only the hrefs at positions 1..19 of each page are followed, so the
        first link is skipped and at most 19 profiles are requested per page.
        The absolute profile URL is stored in ``request.meta['url']`` so the
        details callback can include it in the item.
        """
        sel = Selector(text=response.text, type="html")
        hrefs = sel.xpath(
            '//td[@class="ciPlayernames"]//a/@href'
        ).getall()
        for href in hrefs[1:20]:
            url = self._BASE_URL + href
            yield scrapy.Request(url, self.parse_player_details, meta={'url': url})

    def parse_player_details(self, response):
        """Yield one item with a player's biography and batting statistics.

        The item has the keys ``name``, ``url``, ``born``, ``teams`` and
        ``first_class``; the last is a dict of whitespace-stripped strings
        with one entry per field of ``first_class_batting_stats``.

        Fields that are absent from the page are emitted as empty strings
        rather than raising ``IndexError``. A page with no player name at
        all is logged and skipped, since it yields nothing useful.
        """
        sel = Selector(text=response.text, type="html")

        name = self._first_text(
            sel, '//div[@class="ciPlayernametxt"]/div/h1/text()'
        )
        if not name:
            self.logger.warning("No player name found on %s; skipping", response.url)
            return

        born = self._first_text(sel, self._INFO_XPATH.format("Born"))
        teams = self._first_text(sel, self._INFO_XPATH.format("Major teams"))

        # The first table cell is skipped, so the stat values start at td[2]
        # and follow the namedtuple's field order.
        json_stats = {}
        for column, field in enumerate(self.first_class_batting_stats._fields, start=2):
            raw = self._first_text(sel, self._STATS_CELL_XPATH.format(column))
            key = self._STAT_KEY_OVERRIDES.get(field, field)
            json_stats[key] = self._WHITESPACE.sub('', raw)

        yield {
            "name": name.replace('\n', ''),
            # Fall back to the response URL if the request carried no meta.
            "url": response.meta.get('url', response.url).replace('\n', ''),
            "born": born.replace('\n', ''),
            "teams": teams.replace('\n', ''),
            "first_class": json_stats,
        }

# Save the code above as cricinfo.py, then run it from a shell with the following command:
scrapy runspider -o players.csv cricinfo.py -t csv


In [2]:
import pandas as pd
import json

In [4]:
# Load the CSV written by the `scrapy runspider -o players.csv` command.
players_scraped = pd.read_csv("players.csv")

In [5]:
import ast

# The 'first_class' column holds Python dict reprs (single-quoted), not JSON.
# Swapping quote characters and calling json.loads breaks as soon as a value
# contains an apostrophe or a double quote; ast.literal_eval parses the repr
# directly and only ever evaluates literals.
players_scraped['first_class'] = players_scraped['first_class'].apply(ast.literal_eval)

In [6]:
# Expand the per-player stat dicts into one column per stat, and prefix each
# column label with 'first_class.'.
df1 = pd.DataFrame(players_scraped['first_class'].tolist()).add_prefix('first_class.')

In [7]:
players_scraped

Unnamed: 0,name,url,born,teams,first_class
0,Ravi Pandey,http://www.espncricinfo.com/ci/content/player/...,"December 21, 1986, Allahabad",Services,"{'matches': '2', 'inns': '2', 'not_outs': '0',..."
1,Lalith Mohan,http://www.espncricinfo.com/ci/content/player/...,"March 19, 1990, Hyderabad",Hyderabad (India),"{'matches': '25', 'inns': '28', 'not_outs': '1..."
2,Tajinder Singh,http://www.espncricinfo.com/ci/content/player/...,"May 25, 1992, Dholpur, Rajasthan","Mumbai Indians,","{'matches': '11', 'inns': '19', 'not_outs': '3..."
3,Pranay Sharma,http://www.espncricinfo.com/ci/content/player/...,"March 16, 1988, Jaipur",Rajasthan,"{'matches': '14', 'inns': '26', 'not_outs': '1..."
4,Majeed Shashavali,http://www.espncricinfo.com/ci/content/player/...,"July 25, 1987, Vijawada",Andhra,"{'matches': '3', 'inns': '3', 'not_outs': '0',..."
5,Kamran Shaikh,http://www.espncricinfo.com/ci/content/player/...,"November 4, 1988, Baroda, Gujarat",Baroda,"{'matches': '1', 'inns': '1', 'not_outs': '0',..."
6,Narinder Sharma,http://www.espncricinfo.com/ci/content/player/...,"December 31, 1989, Bhiwani",Services,"{'matches': '2', 'inns': '2', 'not_outs': '0',..."
7,Aditya Shanware,http://www.espncricinfo.com/ci/content/player/...,"August 18, 1991, Nagpur, Maharashtra","Vidarbha,","{'matches': '10', 'inns': '16', 'not_outs': '1..."
8,Mayank Sharma,http://www.espncricinfo.com/ci/content/player/...,"September 13, 1985, Guragaon, Haryana",Haryana,"{'matches': '1', 'inns': '1', 'not_outs': '0',..."
9,Mohammad Sanuth,http://www.espncricinfo.com/ci/content/player/...,"July 25, 1989, Kowdiar, Trivandrum","Kerala,","{'matches': '3', 'inns': '4', 'not_outs': '1',..."


In [8]:
# Replace the raw 'first_class' dict column with the expanded stat columns,
# placed alongside the remaining player fields.
final_df = pd.concat(
    [players_scraped.drop("first_class", axis="columns"), df1],
    axis="columns",
)

In [9]:
final_df

Unnamed: 0,name,url,born,teams,first_class.average,first_class.balls_faced,first_class.boundaries,first_class.catches_taken,first_class.fifties,first_class.highest,first_class.hundreds,first_class.inns,first_class.matches,first_class.not_outs,first_class.runs,first_class.sixes,first_class.strike_rate,first_class.stumps
0,Ravi Pandey,http://www.espncricinfo.com/ci/content/player/...,"December 21, 1986, Allahabad",Services,23.50,104,7,0,0,40,0,2,2,0,47,0,45.19,0
1,Lalith Mohan,http://www.espncricinfo.com/ci/content/player/...,"March 19, 1990, Hyderabad",Hyderabad (India),8.58,368,21,2,0,17*,0,28,25,11,146,0,39.67,0
2,Tajinder Singh,http://www.espncricinfo.com/ci/content/player/...,"May 25, 1992, Dholpur, Rajasthan","Mumbai Indians,",32.18,843,70,4,0,134,1,19,11,3,515,4,61.09,0
3,Pranay Sharma,http://www.espncricinfo.com/ci/content/player/...,"March 16, 1988, Jaipur",Rajasthan,27.04,1332,108,3,1,140,2,26,14,1,676,2,50.75,0
4,Majeed Shashavali,http://www.espncricinfo.com/ci/content/player/...,"July 25, 1987, Vijawada",Andhra,12.66,107,2,1,0,22,0,3,3,0,38,0,35.51,0
5,Kamran Shaikh,http://www.espncricinfo.com/ci/content/player/...,"November 4, 1988, Baroda, Gujarat",Baroda,0.00,2,0,0,0,0,0,1,1,0,0,0,0.00,0
6,Narinder Sharma,http://www.espncricinfo.com/ci/content/player/...,"December 31, 1989, Bhiwani",Services,1.50,20,0,0,0,2,0,2,2,0,3,0,15.00,0
7,Aditya Shanware,http://www.espncricinfo.com/ci/content/player/...,"August 18, 1991, Nagpur, Maharashtra","Vidarbha,",24.00,979,44,9,2,119,1,16,10,1,360,1,36.77,0
8,Mayank Sharma,http://www.espncricinfo.com/ci/content/player/...,"September 13, 1985, Guragaon, Haryana",Haryana,10.00,26,1,0,0,10,0,1,1,0,10,0,38.46,0
9,Mohammad Sanuth,http://www.espncricinfo.com/ci/content/player/...,"July 25, 1989, Kowdiar, Trivandrum","Kerala,",7.66,72,2,0,0,12*,0,4,3,1,23,0,31.94,0
