In [395]:
import re
from io import StringIO
from urllib.request import urlopen, Request

import pandas as pd
from bs4 import BeautifulSoup

# Browser-like User-Agent header passed along with HTTP requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
}

In [396]:
url = 'https://en.wikipedia.org/wiki/Indian_Premier_League'

request = Request(url, headers=headers)

# Open the response in a `with` block so the HTTP connection is closed
# as soon as the body has been read (the original never closed it).
with urlopen(request) as response:
    html = response.read()

html_soup = BeautifulSoup(html, 'html.parser')

paras = html_soup.find_all('p')  # list of every <p> element found on the page

In [397]:
len(paras)

46

# Extracting 'Background' & 'Foundation' sections 

### Background

In [398]:
# NOTE(review): paras[4] is a hard-coded position. It held the 'Background'
# paragraph when this cell was run (see the output below), but the index may
# shift if the article is edited -- verify the output after re-running.
background = paras[4].text #Extracting 'Background' section from the 'paras' list
background

'The Indian Cricket League (ICL) was founded in 2007, with funding provided by Zee Entertainment Enterprises.[13] The ICL was not recognised by the Board of Control for Cricket in India (BCCI) or the International Cricket Council (ICC) and the BCCI were not pleased with its committee members joining the ICL executive board.[14] To prevent players from joining the ICL, the BCCI increased the prize money in their own domestic tournaments and also imposed lifetime bans on players joining the ICL, which was considered a rebel league by the board.[15][16]\n'

In [399]:
# Text clean-up for the 'Background' paragraph:
#   1. drop newline characters,
#   2. strip bracketed citation markers such as [13].
background = re.sub(r'\[\d+\]', '', background.replace('\n', ''))
background

'The Indian Cricket League (ICL) was founded in 2007, with funding provided by Zee Entertainment Enterprises. The ICL was not recognised by the Board of Control for Cricket in India (BCCI) or the International Cricket Council (ICC) and the BCCI were not pleased with its committee members joining the ICL executive board. To prevent players from joining the ICL, the BCCI increased the prize money in their own domestic tournaments and also imposed lifetime bans on players joining the ICL, which was considered a rebel league by the board.'

### Foundation

In [400]:
# NOTE(review): paras[7] is a hard-coded position. It held the 'Foundation'
# paragraph when this cell was run (see the output below), but the index may
# shift if the article is edited -- verify the output after re-running.
foundation = paras[7].text #Extracting 'Foundation' section
foundation

'On 13 September 2007,[17] on the back of India\'s victory at the 2007 T20 World Cup,[18] BCCI announced a franchise-based Twenty20 cricket competition called Indian Premier League. The first season was slated to start in April 2008, in a "high-profile ceremony" in New Delhi. BCCI vice-president Lalit Modi, who spearheaded the IPL effort, spelled out the details of the tournament including its format, the prize money, franchise revenue system and squad composition rules. It was also revealed that the IPL would be run by a seven-man governing council composed of former India players and BCCI officials and that the top two teams of the IPL would qualify for that year\'s Champions League Twenty20. Modi also clarified that they had been working on the idea for two years and that the IPL was not started as a "knee-jerk reaction" to the ICL.[17] The league\'s format was similar to that of the Premier League of England and the NBA in the United States.[16]\n'

In [401]:
# Text clean-up for the 'Foundation' paragraph:
#   1. drop newline characters,
#   2. strip bracketed citation markers such as [17].
foundation = re.sub(r'\[\d+\]', '', foundation.replace('\n', ''))
foundation

'On 13 September 2007, on the back of India\'s victory at the 2007 T20 World Cup, BCCI announced a franchise-based Twenty20 cricket competition called Indian Premier League. The first season was slated to start in April 2008, in a "high-profile ceremony" in New Delhi. BCCI vice-president Lalit Modi, who spearheaded the IPL effort, spelled out the details of the tournament including its format, the prize money, franchise revenue system and squad composition rules. It was also revealed that the IPL would be run by a seven-man governing council composed of former India players and BCCI officials and that the top two teams of the IPL would qualify for that year\'s Champions League Twenty20. Modi also clarified that they had been working on the idea for two years and that the IPL was not started as a "knee-jerk reaction" to the ICL. The league\'s format was similar to that of the Premier League of England and the NBA in the United States.'

# No. of Words

In [402]:
#creating a method to count the no. of words

def no_of_words(title):
    """Return how many whitespace-separated words ``title`` contains."""
    return len(title.split())

In [403]:
no_of_words(background) #No. of words for Background section is 89

89

In [404]:
no_of_words(foundation) #No. of words for Foundation section is 161

161

# No. of Sentences

In [405]:
#creating a method to count the no. of sentences

def no_of_sentences(title):
    """Return the number of sentences in ``title``.

    A sentence end is a run of '.', '!' or '?' (optionally followed by a
    closing quote or bracket) that is followed by whitespace or by the end
    of the text. Counting raw '.' characters, as this function used to do,
    treats the decimal point in "19.4" as a sentence end and ignores
    sentences ending in '?' or '!'.

    Limitation: an abbreviation followed by a space (e.g. "M. A.") is still
    counted as a sentence end.

    Parameters
    ----------
    title : str
        The text to analyse.

    Returns
    -------
    int
        Number of sentence terminators found; 0 for an empty string.
    """
    # The look-ahead keeps periods inside tokens (decimals, "e.g.x") out of the count.
    sentence_end = r'[.!?]+["\'”’)\]]*(?=\s|$)'
    return len(re.findall(sentence_end, title))

In [406]:
no_of_sentences(background) #No. of sentences for Background section is 3

3

In [407]:
no_of_sentences(foundation) #No. of sentences for Foundation section is 6

6

###### .

# Extracting Winning Team Names 

In [408]:
# Find every <table> whose class attribute is 'wikitable plainrowheaders'.
season_table = html_soup.find_all('table', class_ = 'wikitable plainrowheaders')
if not season_table:
    # Fail with a clear message instead of an opaque parser error.
    raise ValueError("No 'wikitable plainrowheaders' table found on the page")

# Parse only the first matched table. The markup is wrapped in StringIO because
# passing literal HTML strings to read_html is deprecated since pandas 2.1.
# read_html returns a list of DataFrames, hence the trailing [0].
df = pd.read_html(StringIO(str(season_table[0])))[0]

df.head()

Unnamed: 0_level_0,Season,Final,Final,Final,Final,Final,Final venue,No. ofteams,Player of the series
Unnamed: 0_level_1,Season,Winner,Winner.1,Winning margin,Runner-up,Runner-up.1,Final venue,No. ofteams,Player of the series
0,2008Details,,Rajasthan Royals[50]164/7 (20 overs),Won by 3 wickets(Scorecard),,Chennai Super Kings[50]163/5 (20 overs),"DY Patil Stadium, Navi Mumbai[50]",8[51],Shane Watson(Rajasthan Royals)[50]
1,2009Details,,Deccan Chargers[52]143/6 (20 overs),Won by 6 runs(Scorecard),,Royal Challengers Bangalore[52]137/9 (20 overs),"Wanderers Stadium, Johannesburg[52](South Africa)",8[53],Adam Gilchrist (Deccan Chargers)[52]
2,2010Details,,Chennai Super Kings[54]168/5 (20 overs),Won by 22 runs(Scorecard),,Mumbai Indians[54]146/9 (20 overs),"DY Patil Stadium, Navi Mumbai[54]",8[55],Sachin Tendulkar (Mumbai Indians)[54]
3,2011Details,,Chennai Super Kings[56]205/5 (20 overs),Won by 58 runs(Scorecard),,Royal Challengers Bangalore[56]147/8 (20 overs),"M. A. Chidambaram Stadium, Chennai[56]",10[57],Chris Gayle (Royal Challengers Bangalore)[56]
4,2012Details,,Kolkata Knight Riders[58]192/5 (19.4 overs),Won by 5 wickets(Scorecard),,Chennai Super Kings[58]190/3 (20 overs),"M. A. Chidambaram Stadium, Chennai[58]",9[59],Sunil Narine(Kolkata Knight Riders)[58]


In [409]:
df.columns # The column labels form a two-level MultiIndex (see the output below)

MultiIndex([(              'Season',               'Season'),
            (               'Final',               'Winner'),
            (               'Final',             'Winner.1'),
            (               'Final',       'Winning margin'),
            (               'Final',            'Runner-up'),
            (               'Final',          'Runner-up.1'),
            (         'Final venue',          'Final venue'),
            (         'No. ofteams',          'No. ofteams'),
            ('Player of the series', 'Player of the series')],
           )

In [410]:
# Flatten the two-level header by keeping the second level as the column names.
# The MultiIndex check makes this cell safe to re-run: once flattened, the
# columns are a plain Index and get_level_values(1) would raise.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(1)

In [411]:
df.head()

Unnamed: 0,Season,Winner,Winner.1,Winning margin,Runner-up,Runner-up.1,Final venue,No. ofteams,Player of the series
0,2008Details,,Rajasthan Royals[50]164/7 (20 overs),Won by 3 wickets(Scorecard),,Chennai Super Kings[50]163/5 (20 overs),"DY Patil Stadium, Navi Mumbai[50]",8[51],Shane Watson(Rajasthan Royals)[50]
1,2009Details,,Deccan Chargers[52]143/6 (20 overs),Won by 6 runs(Scorecard),,Royal Challengers Bangalore[52]137/9 (20 overs),"Wanderers Stadium, Johannesburg[52](South Africa)",8[53],Adam Gilchrist (Deccan Chargers)[52]
2,2010Details,,Chennai Super Kings[54]168/5 (20 overs),Won by 22 runs(Scorecard),,Mumbai Indians[54]146/9 (20 overs),"DY Patil Stadium, Navi Mumbai[54]",8[55],Sachin Tendulkar (Mumbai Indians)[54]
3,2011Details,,Chennai Super Kings[56]205/5 (20 overs),Won by 58 runs(Scorecard),,Royal Challengers Bangalore[56]147/8 (20 overs),"M. A. Chidambaram Stadium, Chennai[56]",10[57],Chris Gayle (Royal Challengers Bangalore)[56]
4,2012Details,,Kolkata Knight Riders[58]192/5 (19.4 overs),Won by 5 wickets(Scorecard),,Chennai Super Kings[58]190/3 (20 overs),"M. A. Chidambaram Stadium, Chennai[58]",9[59],Sunil Narine(Kolkata Knight Riders)[58]


In [412]:
# Keep only the season and the two team columns, then give the team columns
# their plain names ('Winner', 'Runner-up').
df = (
    df[['Season', 'Winner.1', 'Runner-up.1']]
    .copy()
    .rename(columns={'Winner.1': 'Winner', 'Runner-up.1': 'Runner-up'})
)

df.head()

Unnamed: 0,Season,Winner,Runner-up
0,2008Details,Rajasthan Royals[50]164/7 (20 overs),Chennai Super Kings[50]163/5 (20 overs)
1,2009Details,Deccan Chargers[52]143/6 (20 overs),Royal Challengers Bangalore[52]137/9 (20 overs)
2,2010Details,Chennai Super Kings[54]168/5 (20 overs),Mumbai Indians[54]146/9 (20 overs)
3,2011Details,Chennai Super Kings[56]205/5 (20 overs),Royal Challengers Bangalore[56]147/8 (20 overs)
4,2012Details,Kolkata Knight Riders[58]192/5 (19.4 overs),Chennai Super Kings[58]190/3 (20 overs)


### Text Pre-processing

In [413]:
df['Season'] = df['Season'].replace(r'\D','', regex=True) #removes every non-digit character (not only letters), e.g. '2008Details' -> '2008'

In [414]:
def name_cleaner(team_name):
    """Return ``team_name`` cut off at its first '[' character.

    Used to drop the citation marker and score that follow a team name,
    e.g. 'Rajasthan Royals[50]164/7 (20 overs)' -> 'Rajasthan Royals'.

    Parameters
    ----------
    team_name : str or any
        Cell value to clean. Non-string values (such as NaN for an empty
        cell) are returned unchanged instead of raising.

    Returns
    -------
    str or any
        The text before the first '['; the whole string when it contains
        no '['; the input itself when it is not a string.
    """
    if not isinstance(team_name, str):
        return team_name
    # split() returns the whole string when '[' is absent, so there is no
    # failed-match case to handle (the regex version crashed on it).
    return team_name.split('[', 1)[0]

# Run name_cleaner over every value of the two team-name columns.
for team_column in ('Winner', 'Runner-up'):
    df[team_column] = df[team_column].apply(name_cleaner)

In [415]:
df

Unnamed: 0,Season,Winner,Runner-up
0,2008,Rajasthan Royals,Chennai Super Kings
1,2009,Deccan Chargers,Royal Challengers Bangalore
2,2010,Chennai Super Kings,Mumbai Indians
3,2011,Chennai Super Kings,Royal Challengers Bangalore
4,2012,Kolkata Knight Riders,Chennai Super Kings
5,2013,Mumbai Indians,Chennai Super Kings
6,2014,Kolkata Knight Riders,Kings XI Punjab
7,2015,Mumbai Indians,Chennai Super Kings
8,2016,Sunrisers Hyderabad,Royal Challengers Bangalore
9,2017,Mumbai Indians,Rising Pune Supergiant


In [416]:
df.to_csv('IPL_Winning_Teams.csv', index = False) #Writes the table to 'IPL_Winning_Teams.csv' (relative path), leaving out the index column