In [41]:
import numpy as np
import pandas as pd



In [42]:
# lets first have a quick look at series
# series is like an array but is indexed

s = pd.Series([11,12,31,41,15,61,17,80])
s

0    11
1    12
2    31
3    41
4    15
5    61
6    17
7    80
dtype: int64

In [43]:
#we can get the values
s.values

array([11, 12, 31, 41, 15, 61, 17, 80], dtype=int64)

In [44]:
#we can get the indexs
s.index

RangeIndex(start=0, stop=8, step=1)

In [45]:
#we can determine the type of data
s.dtype

dtype('int64')

In [46]:
#we can determine the size of data
s.size

8

In [47]:
#we can get the shape of the data
s.shape

(8,)

In [48]:
#we can get the number of bytes used to store the data 
s.nbytes

64

In [49]:
# we can use the index to access the values 
s[5]

61

In [50]:
# we can also get values based on conditions
s[s > 40]

3    41
5    61
7    80
dtype: int64

In [51]:
# we can use our own indexes rather than using the default index values
# here is the number of cases in million and respective countries as index
cd = pd.Series([10.3, 8.64, 5.7, 1.83, 1.82], index=['usa','ind','bra','frc','rus'])
cd

usa    10.30
ind     8.64
bra     5.70
frc     1.83
rus     1.82
dtype: float64

In [52]:
# we can name our series 
cd.name = 'covid_cases'
# we can also name the index
cd.index.name = 'countries'
cd

countries
usa    10.30
ind     8.64
bra     5.70
frc     1.83
rus     1.82
Name: covid_cases, dtype: float64

In [53]:
index_p = cd.index
index_p

Index(['usa', 'ind', 'bra', 'frc', 'rus'], dtype='object', name='countries')

In [54]:
# we can check if a key exist in series like we do in dict
'ind' in cd

True

In [55]:
# same is applicable for user defined index
cd['ind']

8.64

In [56]:
#we can get indexes
index_p[2]

'bra'

In [57]:
index_p[:2]

Index(['usa', 'ind'], dtype='object', name='countries')

In [58]:
# index are immitable and can't be changed directly 
# we need to use reindex if we want to change them 
# if new index is added we shall NaN
ncd = cd.reindex(['usa','ind','frc', 'rus', 'esp'])
ncd

countries
usa    10.30
ind     8.64
frc     1.83
rus     1.82
esp      NaN
Name: covid_cases, dtype: float64

In [59]:
# we can use fill_value to specify the default falue if index is not already present
ncd = cd.reindex(['usa','ind','frc', 'rus', 'esp'], fill_value='0')
ncd

countries
usa    10.3
ind    8.64
frc    1.83
rus    1.82
esp       0
Name: covid_cases, dtype: object

In [60]:
#we can convert the series to a python dict 
pcd = cd.to_dict()
pcd

{'usa': 10.3, 'ind': 8.64, 'bra': 5.7, 'frc': 1.83, 'rus': 1.82}

In [61]:
#we can pass a dict to series to create a series
bcd = pd.Series(pcd)
pcd

{'usa': 10.3, 'ind': 8.64, 'bra': 5.7, 'frc': 1.83, 'rus': 1.82}

In [62]:
# we can also specify our own index/keys while passing the dict
nbcd = pd.Series(pcd, index=['usa','ind','bra','frc','rus','esp'])
nbcd

usa    10.30
ind     8.64
bra     5.70
frc     1.83
rus     1.82
esp      NaN
dtype: float64

In [63]:
# as esp or spain was not part of the dict already we got a new key with NaN value
# we can check if we have null values in our series
pd.isnull(nbcd)

usa    False
ind    False
bra    False
frc    False
rus    False
esp     True
dtype: bool

In [64]:
# we can also use notnull to check what all values are not null in the series
pd.notnull(nbcd)

usa     True
ind     True
bra     True
frc     True
rus     True
esp    False
dtype: bool

In [65]:
# lets first have a quick look at dataframe
rank_dict = { \
             'year' : [2019,2018,2017,2016,2015], \
             'batsman':['virat kohli', 'virat kohli', 'steve smith', 'steve smith', 'steve smith' ],\
             'batsman country': ['india', 'india', 'australia', 'australia', 'australia'],\
             'bowler': ['Pat Cummins', 'Kagiso Rabada', 'James Anderson', 'James Anderson', 'Ravichandran Ashwin' ],\
            'bowler country':['australia', 'South Africa', 'England', 'India', 'India' ]\
            }
# we can pass a dict that contains key as the coloum name and list as the respective values 
icc_ranking = pd.DataFrame(rank_dict)
icc_ranking

Unnamed: 0,year,batsman,batsman country,bowler,bowler country
0,2019,virat kohli,india,Pat Cummins,australia
1,2018,virat kohli,india,Kagiso Rabada,South Africa
2,2017,steve smith,australia,James Anderson,England
3,2016,steve smith,australia,James Anderson,India
4,2015,steve smith,australia,Ravichandran Ashwin,India


In [66]:
# pandas automatically adds the indexes as shown above
# we can get the coloum names of the dataframe 
icc_ranking.columns

Index(['year', 'batsman', 'batsman country', 'bowler', 'bowler country'], dtype='object')

In [67]:
# we can use the coloum names to get the data 
icc_ranking.year

0    2019
1    2018
2    2017
3    2016
4    2015
Name: year, dtype: int64

In [68]:
icc_ranking.batsman

0    virat kohli
1    virat kohli
2    steve smith
3    steve smith
4    steve smith
Name: batsman, dtype: object

In [69]:
# we can also use the index approach to get the index
icc_ranking['bowler']

0            Pat Cummins
1          Kagiso Rabada
2         James Anderson
3         James Anderson
4    Ravichandran Ashwin
Name: bowler, dtype: object

In [70]:
# for column name that has the spaces in middle we have to use the index approach 
icc_ranking['bowler country']

0       australia
1    South Africa
2         England
3           India
4           India
Name: bowler country, dtype: object

In [71]:
icc_ranking['batsman country']

0        india
1        india
2    australia
3    australia
4    australia
Name: batsman country, dtype: object

In [72]:
# we can also retrive some values from dataframe 
# in this case 2 values from the start
icc_ranking.head(2)

Unnamed: 0,year,batsman,batsman country,bowler,bowler country
0,2019,virat kohli,india,Pat Cummins,australia
1,2018,virat kohli,india,Kagiso Rabada,South Africa


In [73]:
# in this case 2 values from the start
icc_ranking.tail(2)

Unnamed: 0,year,batsman,batsman country,bowler,bowler country
3,2016,steve smith,australia,James Anderson,India
4,2015,steve smith,australia,Ravichandran Ashwin,India


In [74]:
# we can also extract data at particular index with the help of iloc
icc_ranking.iloc[3]

year                         2016
batsman               steve smith
batsman country         australia
bowler             James Anderson
bowler country              India
Name: 3, dtype: object

In [75]:
# we can also select multiple index
icc_ranking.iloc[['0','2','4']]

Unnamed: 0,year,batsman,batsman country,bowler,bowler country
0,2019,virat kohli,india,Pat Cummins,australia
2,2017,steve smith,australia,James Anderson,England
4,2015,steve smith,australia,Ravichandran Ashwin,India


In [76]:
# we can also extract multiple columns from the data frame
# this basically creates a new dataframe with selected coloums
pd.DataFrame(icc_ranking, columns=['year', 'batsman', 'batsman country'])

Unnamed: 0,year,batsman,batsman country
0,2019,virat kohli,india
1,2018,virat kohli,india
2,2017,steve smith,australia
3,2016,steve smith,australia
4,2015,steve smith,australia


In [77]:
# we can also extract multiple columns from the data frame
# if we pass a coloum name that does not exists we shall get NaN values
pd.DataFrame(icc_ranking, columns=['year', 'bowler', 'bowler country', 'notdefined'])

Unnamed: 0,year,bowler,bowler country,notdefined
0,2019,Pat Cummins,australia,
1,2018,Kagiso Rabada,South Africa,
2,2017,James Anderson,England,
3,2016,James Anderson,India,
4,2015,Ravichandran Ashwin,India,


In [78]:
# let us create a Dataframe from numbers and provide user defined index and coloum names
df = pd.DataFrame(np.random.randn(20).reshape(4,5), index = ['first','second', 'third','fourth'], columns=['cl1', 'cl2', 'cl3','cl4','cl5'])
df

Unnamed: 0,cl1,cl2,cl3,cl4,cl5
first,0.47577,-0.098268,-0.07342,-0.739632,1.147998
second,0.942677,1.454482,-1.246868,0.153964,-1.109041
third,-0.031052,0.017519,-0.07745,-1.663458,1.430449
fourth,-0.787432,0.738778,0.903089,-0.365768,0.789636


In [81]:
# we can reindex the DataFrame
# and can use the fill_value option 
ndf = pd.DataFrame.reindex(df, index = ['first','second', 'third','fourth', 'fifth'], fill_value=10)
ndf

Unnamed: 0,cl1,cl2,cl3,cl4,cl5
first,0.47577,-0.098268,-0.07342,-0.739632,1.147998
second,0.942677,1.454482,-1.246868,0.153964,-1.109041
third,-0.031052,0.017519,-0.07745,-1.663458,1.430449
fourth,-0.787432,0.738778,0.903089,-0.365768,0.789636
fifth,10.0,10.0,10.0,10.0,10.0


In [82]:
# we can reindex by using the dataframe object too
# we can also change the coloums 
ndf = df.reindex(index = ['first','second', 'third','fourth', 'fifth'], columns = ['cl1', 'cl2', 'cl3','cl4','cl5', 'cl6'], fill_value=10)
ndf

Unnamed: 0,cl1,cl2,cl3,cl4,cl5,cl6
first,0.47577,-0.098268,-0.07342,-0.739632,1.147998,10.0
second,0.942677,1.454482,-1.246868,0.153964,-1.109041,10.0
third,-0.031052,0.017519,-0.07745,-1.663458,1.430449,10.0
fourth,-0.787432,0.738778,0.903089,-0.365768,0.789636,10.0
fifth,10.0,10.0,10.0,10.0,10.0,10.0
