# Data Scraping

## We will scrape player stats from Cricinfo.com

In [None]:
# import required libraries
import scrapy
import json
import re
from collections import namedtuple

# custom class derived from class scrapy.Spider 
class CricInfoScrapeSpider(scrapy.Spider):
    """Crawl ESPNcricinfo player pages and yield one item per player.

    The crawl is a chain of three callbacks: ``parse`` follows the alphabet
    index links found on the start page, ``parse_player_names`` follows
    player links found on each alphabet page, and ``parse_player_details``
    extracts the name, birth details, teams and batting figures from a
    player page.
    """

    # (required) programmer defined name of the crawler
    name = 'cricinfo-spider'

    # (required) starting url for the request scheduler
    start_urls = [
        'http://www.espncricinfo.com/ci/content/player/country.html?country=6;alpha=A'
    ]

    # site root prepended to the hrefs scraped from the index pages
    base_url = "http://www.espncricinfo.com"

    # how many player pages to request from each alphabet page; override on
    # a subclass or from the command line (``-a players_per_letter=5``)
    players_per_letter = 1

    # record type describing the batting figures, in table-column order
    first_class_batting_stats = namedtuple(
        "first_batting_stats",
        'matches, inns, not_outs, runs, highest, average, balls_faced, '
        'strike_rate, hundreds, fifties, boundaries, sixes, catches_taken, '
        'stumpings_made')

    # XPath of one cell in the first row of the first stats table.
    # NOTE(review): the path contains ``tbody``; confirm the served HTML
    # really has that element and it is not only added by the browser.
    _stats_cell_xpath = (
        '//*[@id="ciHomeContentlhs"]/div[4]/table[1]/tbody/tr[1]'
        '/td[{column}]/text()'
    )

    # the first batting figure is read from td[2]; later fields follow on
    _first_stats_column = 2

    # XPath of the text that follows a bold label in the player info box
    _info_xpath = (
        '//p[@class="ciPlayerinformationtxt"]'
        '/b[contains(text(), "{label}")]/following-sibling::span/text()'
    )

    # output keys that differ from the namedtuple field names
    _stats_key_overrides = {"stumpings_made": "stumps"}

    _whitespace = re.compile(r'\s+')

    # (required) parser for each url
    def parse(self, response):
        """Yield a request for each alphabet index page linked from *response*."""
        letter_links = response.xpath(
            '//div[@id="ciPlayerbyCharAtoZ"]//ul//li//a/@href'
        ).getall()

        # The first and the last link are deliberately left out.
        # NOTE(review): confirm which letters those two links correspond to.
        for href in letter_links[1:-1]:
            # chain request to parse player name pages
            yield scrapy.Request(self.base_url + href,
                                 self.parse_player_names)

    # parser for player names
    def parse_player_names(self, response):
        """Yield requests for player pages linked from an alphabet page.

        At most ``players_per_letter`` requests are produced. The requested
        URL travels in ``request.meta['url']`` so the details callback can
        report it.
        """
        player_links = response.xpath(
            '//td[@class="ciPlayernames"]//a/@href'
        ).getall()

        # Command-line spider arguments arrive as strings, hence int().
        wanted = int(self.players_per_letter)

        # Selection starts at index 1, so the first link is never requested.
        # NOTE(review): confirm that skipping index 0 is intended.
        for href in player_links[1:wanted + 1]:
            url = self.base_url + href
            # chain request to parse player details
            yield scrapy.Request(url,
                                 self.parse_player_details,
                                 meta={'url': url})

    # parser for player details and stats
    def parse_player_details(self, response):
        """Yield one dict with a player's name, url, birth info, teams and stats.

        A value whose node is missing from the page is emitted as an empty
        string, so one absent cell does not raise and discard the player.
        """
        def first_text(query):
            # first matching text node, or '' when nothing matches
            return response.xpath(query).get(default='')

        fields = self.first_class_batting_stats._fields

        # Build an instance of the record type for this response only;
        # nothing is stored on the spider or on the namedtuple class.
        stats = self.first_class_batting_stats(*(
            self._whitespace.sub(
                '', first_text(self._stats_cell_xpath.format(column=column)))
            for column, _ in enumerate(fields, start=self._first_stats_column)
        ))

        # combine all player stats into a single plain dict (a plain dict
        # keeps the exported text form unchanged for downstream parsing)
        json_stats = {
            self._stats_key_overrides.get(field, field): value
            for field, value in zip(fields, stats)
        }

        name = first_text('//div[@class="ciPlayernametxt"]/div/h1/text()')
        dob_place = first_text(self._info_xpath.format(label="Born"))
        teams = first_text(self._info_xpath.format(label="Major teams"))

        # return full player info
        yield {
            "name": name.replace('\n', ''),
            "url": response.meta['url'].replace('\n', ''),
            "born": dob_place.replace('\n', ''),
            "teams": teams.replace('\n', ''),
            "first_class": json_stats,
        }


## Save the above code in cricinfo.py then run the following command

In [None]:
# clear the existing players.csv
!rm -f players.csv

# the file cricinfo.py is already created and is present in the current folder
!scrapy runspider -o players.csv cricinfo.py -t csv

## Have a look at the data

### Import required libraries

In [None]:
import pandas as pd
import json

### Load data from generated CSV file

In [None]:
# read the scraper's CSV export into a DataFrame
players_scraped = pd.read_csv(filepath_or_buffer="players.csv")

In [None]:
# show the raw scraped table; as the cell's last expression it is rendered
# as the cell output
players_scraped

### Transform data from semi-structured to structured 

In [None]:
# Turn each 'first_class' cell from text back into a dict.
# The cells hold dict-literal text with single-quoted strings, which is not
# JSON. ast.literal_eval reads that form directly and, unlike swapping the
# quotes and calling json.loads, still works when a value itself contains an
# apostrophe or a double quote.
# NOTE(review): assumes every cell is a dict literal of plain strings, as
# produced by the spider; confirm if the CSV comes from another source.
import ast

players_scraped['first_class'] = players_scraped['first_class'].apply(ast.literal_eval)

In [None]:
# expand the per-row stats dicts into a table with one column per stat,
# naming every column 'first_class.<stat>'
df1 = pd.DataFrame(players_scraped['first_class'].tolist()).add_prefix('first_class.')

In [None]:
# swap the single 'first_class' column for the expanded per-stat columns,
# placed side by side with the remaining player columns
final_df = pd.concat(
    objs=[players_scraped.drop("first_class", axis="columns"), df1],
    axis="columns",
)

In [None]:
# display the flattened table; as the cell's last expression it is rendered
# as the cell output
final_df