In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Virat-Kohli-International-Cricket-Centuries.csv')
df

Unnamed: 0,No.,Runs,Against,Position,Innings,Venue,Ground,Date,Result
0,1,107,Sri Lanka,4,2,"Eden Gardens, Kolkata",Home,24-Dec-09,Won
1,2,102*,Bangladesh,3,2,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,11-Jan-10,Won
2,3,118,Australia,3,2,"APCA-VDCA Stadium, Visakhapatnam",Home,20-Oct-10,Won
3,4,105,New Zealand,3,1,"Nehru Stadium, Guwahati",Home,28-Nov-10,Won
4,5,100*,Bangladesh,4,1,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,19-Feb-11,Won
...,...,...,...,...,...,...,...,...,...
74,75,186,Australia,4,2,"Narendra Modi Stadium, Ahmedabad",Home,9-Mar-23,Drawn
75,76,121,West Indies,4,1,"Queen's Park Oval, Port of Spain",Away,20-Jul-23,Drawn
76,77,122*,Pakistan,3,1,"R. Premadasa Stadium, Colombo",Neutral,11-Sep-23,Won
77,78,103*,Bangladesh,3,2,"Maharashtra Cricket Association Stadium, Pune",Home,19-Oct-23,Won


#### Reformatting columns for uniform access

In [3]:
df.columns = [x.lower() for x in df.columns]
df.columns

Index(['no.', 'runs', 'against', 'position', 'innings', 'venue', 'ground',
       'date', 'result'],
      dtype='object')

In [4]:
df.rename(columns={'no.': 'no'}, inplace=True)

In [5]:
df

Unnamed: 0,no,runs,against,position,innings,venue,ground,date,result
0,1,107,Sri Lanka,4,2,"Eden Gardens, Kolkata",Home,24-Dec-09,Won
1,2,102*,Bangladesh,3,2,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,11-Jan-10,Won
2,3,118,Australia,3,2,"APCA-VDCA Stadium, Visakhapatnam",Home,20-Oct-10,Won
3,4,105,New Zealand,3,1,"Nehru Stadium, Guwahati",Home,28-Nov-10,Won
4,5,100*,Bangladesh,4,1,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,19-Feb-11,Won
...,...,...,...,...,...,...,...,...,...
74,75,186,Australia,4,2,"Narendra Modi Stadium, Ahmedabad",Home,9-Mar-23,Drawn
75,76,121,West Indies,4,1,"Queen's Park Oval, Port of Spain",Away,20-Jul-23,Drawn
76,77,122*,Pakistan,3,1,"R. Premadasa Stadium, Colombo",Neutral,11-Sep-23,Won
77,78,103*,Bangladesh,3,2,"Maharashtra Cricket Association Stadium, Pune",Home,19-Oct-23,Won


#### Inspect dtypes of all columns

In [6]:
df.dtypes

no           int64
runs        object
against     object
position     int64
innings      int64
venue       object
ground      object
date        object
result      object
dtype: object

## Analysis:

#### Runs column is of 'object' datatype which can be converted to int by removing the * (not out)

In [7]:
df['dismissed'] = ~df['runs'].str.contains('\*')

In [8]:
df['runs'] = df['runs'].apply(lambda x: x.strip()[0: -1] if x.strip()[-1] == '*' else x)

In [9]:
df

Unnamed: 0,no,runs,against,position,innings,venue,ground,date,result,dismissed
0,1,107,Sri Lanka,4,2,"Eden Gardens, Kolkata",Home,24-Dec-09,Won,True
1,2,102,Bangladesh,3,2,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,11-Jan-10,Won,False
2,3,118,Australia,3,2,"APCA-VDCA Stadium, Visakhapatnam",Home,20-Oct-10,Won,True
3,4,105,New Zealand,3,1,"Nehru Stadium, Guwahati",Home,28-Nov-10,Won,True
4,5,100,Bangladesh,4,1,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,19-Feb-11,Won,False
...,...,...,...,...,...,...,...,...,...,...
74,75,186,Australia,4,2,"Narendra Modi Stadium, Ahmedabad",Home,9-Mar-23,Drawn,True
75,76,121,West Indies,4,1,"Queen's Park Oval, Port of Spain",Away,20-Jul-23,Drawn,True
76,77,122,Pakistan,3,1,"R. Premadasa Stadium, Colombo",Neutral,11-Sep-23,Won,False
77,78,103,Bangladesh,3,2,"Maharashtra Cricket Association Stadium, Pune",Home,19-Oct-23,Won,False


In [10]:
df

Unnamed: 0,no,runs,against,position,innings,venue,ground,date,result,dismissed
0,1,107,Sri Lanka,4,2,"Eden Gardens, Kolkata",Home,24-Dec-09,Won,True
1,2,102,Bangladesh,3,2,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,11-Jan-10,Won,False
2,3,118,Australia,3,2,"APCA-VDCA Stadium, Visakhapatnam",Home,20-Oct-10,Won,True
3,4,105,New Zealand,3,1,"Nehru Stadium, Guwahati",Home,28-Nov-10,Won,True
4,5,100,Bangladesh,4,1,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,19-Feb-11,Won,False
...,...,...,...,...,...,...,...,...,...,...
74,75,186,Australia,4,2,"Narendra Modi Stadium, Ahmedabad",Home,9-Mar-23,Drawn,True
75,76,121,West Indies,4,1,"Queen's Park Oval, Port of Spain",Away,20-Jul-23,Drawn,True
76,77,122,Pakistan,3,1,"R. Premadasa Stadium, Colombo",Neutral,11-Sep-23,Won,False
77,78,103,Bangladesh,3,2,"Maharashtra Cricket Association Stadium, Pune",Home,19-Oct-23,Won,False


#### Ground is 'object' datatype Home/Away/Neutral which can be converted to categorical data

In [11]:
df['ground'].value_counts()

Home       37
Away       36
Neutral     6
Name: ground, dtype: int64

In [12]:
df['ground'].astype('category')

0        Home
1        Away
2        Home
3        Home
4        Away
       ...   
74       Home
75       Away
76    Neutral
77       Home
78       Home
Name: ground, Length: 79, dtype: category
Categories (3, object): ['Away', 'Home', 'Neutral']

In [13]:
df['ground'] = df['ground'].astype('category')

In [14]:
df.dtypes

no              int64
runs           object
against        object
position        int64
innings         int64
venue          object
ground       category
date           object
result         object
dismissed        bool
dtype: object

#### Date is an 'object' datatype which can be converted to datetime format

In [15]:
df.loc[df['date'] == '5-Nov', 'date'] = '5-Nov-23'

In [16]:
df['date'] = pd.to_datetime(df['date'], format="%d-%b-%y")

In [17]:
df

Unnamed: 0,no,runs,against,position,innings,venue,ground,date,result,dismissed
0,1,107,Sri Lanka,4,2,"Eden Gardens, Kolkata",Home,2009-12-24,Won,True
1,2,102,Bangladesh,3,2,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,2010-01-11,Won,False
2,3,118,Australia,3,2,"APCA-VDCA Stadium, Visakhapatnam",Home,2010-10-20,Won,True
3,4,105,New Zealand,3,1,"Nehru Stadium, Guwahati",Home,2010-11-28,Won,True
4,5,100,Bangladesh,4,1,"Sher-e-Bangla Cricket Stadium, Dhaka",Away,2011-02-19,Won,False
...,...,...,...,...,...,...,...,...,...,...
74,75,186,Australia,4,2,"Narendra Modi Stadium, Ahmedabad",Home,2023-03-09,Drawn,True
75,76,121,West Indies,4,1,"Queen's Park Oval, Port of Spain",Away,2023-07-20,Drawn,True
76,77,122,Pakistan,3,1,"R. Premadasa Stadium, Colombo",Neutral,2023-09-11,Won,False
77,78,103,Bangladesh,3,2,"Maharashtra Cricket Association Stadium, Pune",Home,2023-10-19,Won,False


# Summary of changes to be made to existing df

### No.           
#### Can be used as an Index
<hr>

### Runs
#### 1. Contains values where the batsman was 'not out' at the end of the innings. 
#### 2. A new column 'Dismissed: {Yes, No}' can be added and the * can be removed from runs column and converted into an integer
<hr>

### Against
#### No changes to be made so far
<hr>

### Position
#### No changes to be made so far
<hr>

### Innings
#### If the innings is greater than 2, it is a test match. Create a new column 'Test: {Yes, No}' 
<hr>

### Venue
#### Create a new column 'City' by splitting the string at the ','
<hr>

### Ground
#### No changes to be made so far
<hr>

### Date
#### Create a new column 'Date'
#### Create a new column 'Month'
#### Create a new column 'Year'
<hr>

### Result
#### Create new column with only 3 values - {Won, Lost, Drawn}
<hr>

# Data Pre-processing

#### In which year did he score the most number of centuries

In which month did he score the most number of centuries?

What was his highest score among all the matches that India won?

What was his highest score among all the matches that India lost?

How many times did India win when he scored a century?

Against which team did he score the most number of centuries?

Against which team did he score the least number of centuries?

At which venue did he score the most number of centuries?

At which batting position did he score the most/least number of centuries?

How many times did he score more than 200 runs?

How many centuries did he score at Home, Away, Neutral?