# Data Scraping

In [9]:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
import re
import json
from scrapy.http import Request, FormRequest
from collections import namedtuple
import re

class CricInfoScrapeSpider(scrapy.Spider):
    """Crawl ESPNcricinfo player listings and yield one record per player.

    The crawl runs in three stages:

    1. ``parse`` reads the A-Z index on the start page and requests each
       letter page.
    2. ``parse_player_names`` requests player profile pages linked from a
       letter page.
    3. ``parse_player_details`` extracts the name, birth details, teams and
       a row of batting/fielding figures from a profile page.
    """

    name = 'cricinfo-spider'
    start_urls = ['http://www.espncricinfo.com/ci/content/player/country.html?country=6;alpha=A']

    # Kept as a public class attribute for backward compatibility.  It is no
    # longer used as scratch storage: assigning scraped values onto this
    # shared type leaked state from one response into the next.
    first_class_batting_stats = namedtuple("first_batting_stats", 'matches, inns, not_outs, runs, highest, average, balls_faced, strike_rate, hundreds, fifties, boundaries, sixes, catches_taken, stumpings_made')

    # Prefix for the site-relative hrefs found on the listing pages.
    _BASE_URL = "http://www.espncricinfo.com"

    # XPath of one statistics cell; ``%d`` is the 1-based ``td`` position.
    _STAT_CELL_XPATH = (
        '//*[@id="ciHomeContentlhs"]/div[4]/table[1]/tbody/tr[1]/td[%d]/text()'
    )

    # Output keys in column order.  The first key is read from td[2], the
    # last from td[15].
    _STAT_KEYS = (
        "matches",
        "inns",
        "not_outs",
        "runs",
        "highest",
        "average",
        "balls_faced",
        "strike_rate",
        "hundreds",
        "fifties",
        "boundaries",
        "sixes",
        "catches_taken",
        "stumps",
    )

    @staticmethod
    def _first_text(selector, xpath):
        """Return the first match of *xpath*, or ``''`` when nothing matches."""
        return selector.xpath(xpath).extract_first(default='')

    def parse(self, response):
        """Yield a request for each letter page linked from the A-Z index."""
        sel = Selector(text=response.text, type="html")
        letter_links = sel.xpath(
            '//div[@id="ciPlayerbyCharAtoZ"]//ul//li//a/@href'
        ).extract()
        # The first and last links in the list are skipped.
        # NOTE(review): presumably these are not letter pages - confirm
        # against the live markup.
        for href in letter_links[1:-1]:
            yield scrapy.Request(self._BASE_URL + href, self.parse_player_names)

    def parse_player_names(self, response):
        """Yield a request for player profile pages linked from a letter page."""
        sel = Selector(text=response.text, type="html")
        player_links = sel.xpath(
            '//td[@class="ciPlayernames"]//a/@href'
        ).extract()
        # Only the links at positions 1-9 are followed.
        # NOTE(review): this skips the first player on every page and caps
        # the crawl at nine players per letter - confirm this is intended.
        for href in player_links[1:10]:
            url = self._BASE_URL + href
            request = scrapy.Request(url, self.parse_player_details)
            # Carry the requested URL along so it can be written to the record.
            request.meta['url'] = url
            yield request

    def parse_player_details(self, response):
        """Yield a dict describing the player shown on a profile page.

        The record has the keys ``name``, ``url``, ``born``, ``teams`` and
        ``first_class`` (a dict of statistics keyed by ``_STAT_KEYS``).
        Fields whose node is missing from the page are emitted as empty
        strings instead of raising ``IndexError``.  A page with no player
        name at all is logged and skipped.
        """
        sel = Selector(text=response.text, type="html")

        name = self._first_text(
            sel, '//div[@class="ciPlayernametxt"]/div/h1/text()'
        )
        if not name:
            self.logger.warning(
                "No player name found on %s; skipping page", response.url
            )
            return

        dob_place = self._first_text(
            sel,
            '//p[@class="ciPlayerinformationtxt"]/b[contains(text(), "Born")]/following-sibling::span/text()',
        )
        teams = self._first_text(
            sel,
            '//p[@class="ciPlayerinformationtxt"]/b[contains(text(), "Major teams")]/following-sibling::span/text()',
        )

        # Statistic cells start at td[2]; all whitespace inside a cell is
        # removed so values such as "\n 23.75 " become "23.75".
        json_stats = {
            key: re.sub(
                r'\s+', '', self._first_text(sel, self._STAT_CELL_XPATH % column)
            )
            for column, key in enumerate(self._STAT_KEYS, start=2)
        }

        yield {
            "name": name.replace('\n', ''),
            # Fall back to the response URL if the request was not created
            # by parse_player_names and therefore carries no 'url' meta key.
            "url": response.meta.get('url', response.url).replace('\n', ''),
            "born": dob_place.replace('\n', ''),
            "teams": teams.replace('\n', ''),
            "first_class": json_stats,
        }

# Save the code above as cricinfo.py, then run the spider with the following command:
scrapy runspider -o players.csv cricinfo.py -t csv


In [11]:
import pandas as pd
import json

In [12]:
# Load the scraped player records into a DataFrame.
# NOTE(review): the run command earlier in this file writes "players.csv",
# while this cell reads "playerLinks.csv" - confirm which file is intended.
players_scraped = pd.read_csv("playerLinks.csv")

In [13]:
import ast

# Each 'first_class' cell holds the text of a Python dict literal with
# single-quoted keys and values.  Parse it with ast.literal_eval instead of
# swapping quote characters and calling json.loads: the quote swap corrupts
# any value that itself contains an apostrophe or a double quote.
players_scraped['first_class'] = players_scraped['first_class'].apply(ast.literal_eval)

In [14]:
# Expand the per-player statistics dicts into one column per statistic and
# prefix every column name with "first_class.".
first_class_records = players_scraped['first_class'].values.tolist()
df1 = pd.DataFrame(first_class_records)
df1.columns = ['first_class.' + column for column in df1.columns]

In [15]:
players_scraped

Unnamed: 0,name,url,born,teams,first_class
0,Abhisek Banerjee,http://www.espncricinfo.com/ci/content/player/...,"May 18, 1984, Durgapur","Bengal,","{'matches': '4', 'inns': '4', 'not_outs': '0',..."
1,Reagan Pinto,http://www.espncricinfo.com/ci/content/player/...,"September 21, 1991, Bombay (now Mumbai), Mahar...",Goa,"{'matches': '31', 'inns': '47', 'not_outs': '4..."
2,Anupam Patel,http://www.espncricinfo.com/ci/content/player/...,"May 1, 1985, Jamshedpur, Bihar",Gujarat,"{'matches': '2', 'inns': '1', 'not_outs': '1',..."
3,Utkarsh Patel,http://www.espncricinfo.com/ci/content/player/...,"October 11, 1987, Paradi, Gujarat",Baroda,"{'matches': '7', 'inns': '10', 'not_outs': '2'..."
4,Penta Rao,http://www.espncricinfo.com/ci/content/player/...,"May 10, 1985, Vishakapatnam",Services,"{'matches': '3', 'inns': '5', 'not_outs': '1',..."
5,Pratharesh Parmar,http://www.espncricinfo.com/ci/content/player/...,"May 25, 1988, Gandhinagar, Gujarat",Gujarat,"{'matches': '12', 'inns': '19', 'not_outs': '4..."
6,Mehul Patel,http://www.espncricinfo.com/ci/content/player/...,"January 18, 1989, Gaviyar, Gujarat",Gujarat Cricket Association XI,"{'matches': '18', 'inns': '23', 'not_outs': '6..."
7,Adithya B Sagar,http://www.espncricinfo.com/ci/content/player/...,"October 21, 1988, Hubli, Karnataka",Karnataka,"{'matches': '2', 'inns': '3', 'not_outs': '1',..."
8,Jagannathan Kaushik,http://www.espncricinfo.com/ci/content/player/...,"October 25, 1985, Chennai","Ruby Trichy Warriors,","{'matches': '16', 'inns': '16', 'not_outs': '4..."
9,Mohammed Khader,http://www.espncricinfo.com/ci/content/player/...,"November 6, 1987, Hyderabad",Hyderabad (India),"{'matches': '16', 'inns': '19', 'not_outs': '6..."


In [16]:
# Replace the nested 'first_class' column with the flattened statistics
# columns by placing the two frames side by side.
player_info = players_scraped.drop(columns="first_class")
final_df = pd.concat([player_info, df1], axis="columns")

In [17]:
final_df

Unnamed: 0,name,url,born,teams,first_class.average,first_class.balls_faced,first_class.boundaries,first_class.catches_taken,first_class.fifties,first_class.highest,first_class.hundreds,first_class.inns,first_class.matches,first_class.not_outs,first_class.runs,first_class.sixes,first_class.strike_rate,first_class.stumps
0,Abhisek Banerjee,http://www.espncricinfo.com/ci/content/player/...,"May 18, 1984, Durgapur","Bengal,",23.75,122,14,2,1,68,0,4,4,0,95,0,77.86,0
1,Reagan Pinto,http://www.espncricinfo.com/ci/content/player/...,"September 21, 1991, Bombay (now Mumbai), Mahar...",Goa,32.55,2772,159,11,7,154,3,47,31,4,1400,10,50.50,0
2,Anupam Patel,http://www.espncricinfo.com/ci/content/player/...,"May 1, 1985, Jamshedpur, Bihar",Gujarat,-,0,0,0,0,0*,0,1,2,1,0,0,-,0
3,Utkarsh Patel,http://www.espncricinfo.com/ci/content/player/...,"October 11, 1987, Paradi, Gujarat",Baroda,20.75,321,21,3,1,53,0,10,7,2,166,2,51.71,0
4,Penta Rao,http://www.espncricinfo.com/ci/content/player/...,"May 10, 1985, Vishakapatnam",Services,34.50,314,14,0,1,89,0,5,3,1,138,0,43.94,0
5,Pratharesh Parmar,http://www.espncricinfo.com/ci/content/player/...,"May 25, 1988, Gandhinagar, Gujarat",Gujarat,38.33,1546,53,12,5,67,0,19,12,4,575,4,37.19,0
6,Mehul Patel,http://www.espncricinfo.com/ci/content/player/...,"January 18, 1989, Gaviyar, Gujarat",Gujarat Cricket Association XI,16.11,703,33,5,1,62,0,23,18,6,274,8,38.97,0
7,Adithya B Sagar,http://www.espncricinfo.com/ci/content/player/...,"October 21, 1988, Hubli, Karnataka",Karnataka,24.50,140,7,1,0,29,0,3,2,1,49,0,35.00,0
8,Jagannathan Kaushik,http://www.espncricinfo.com/ci/content/player/...,"October 25, 1985, Chennai","Ruby Trichy Warriors,",7.00,312,6,2,0,20,0,16,16,4,84,2,26.92,0
9,Mohammed Khader,http://www.espncricinfo.com/ci/content/player/...,"November 6, 1987, Hyderabad",Hyderabad (India),19.30,691,36,6,0,35*,0,19,16,6,251,1,36.32,0
