In [16]:
import numpy as np
import pandas as pd

In [17]:
# Display the installed pandas version string.
pd.__version__

'2.3.3'

## Series

In [18]:
# Build a Series from a plain Python list; no index is given, so pandas
# supplies the default integer labels. The values are stored as float64.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [19]:
# Underlying data as a NumPy array. to_numpy() is the accessor the pandas
# documentation recommends over the older .values attribute.
s.to_numpy()

array([ 1.,  3.,  5., nan,  6.,  8.])

In [20]:
# The index holds the labels; since none were supplied for `s`, it is a RangeIndex.
s.index

RangeIndex(start=0, stop=6, step=1)

#### pandas Series vs. NumPy Array
A pandas Series has an explicitly defined index (a label) associated with each value, which is what distinguishes it from a NumPy array.


In [21]:
# Attach explicit string labels to the values instead of the default integer index.
data = pd.Series(data=[1, 4, 6.3, 10], index=list('ABCD'))
data

A     1.0
B     4.0
C     6.3
D    10.0
dtype: float64

The above Series may look similar to a regular Python dictionary, but unlike a dictionary, a pandas Series can be sliced by label.
 

In [22]:
# Slice by label: unlike positional slicing, the end label 'C' is included in the result.
data['A': 'C']

A    1.0
B    4.0
C    6.3
dtype: float64

## DataFrame
It is analogous to a 2D NumPy array, but with both flexible row indices and flexible column names.

In [23]:
# A single column named "letter", with custom integer row labels.
df = pd.DataFrame({"letter": ['A', 'B', 'C']}, index=[10, 20, 30])
df

Unnamed: 0,letter
10,A
20,B
30,C


In [24]:
# Build the frame from a dict of equal-length lists: each key becomes a column,
# and the rows get the default integer index.
stateDf = pd.DataFrame(
    data={
        'state': ['Ohio'] * 3 + ['Nevada'] * 3,
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
    }
)
stateDf

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [25]:
# Two Series keyed by state name, built from dicts so the keys become the index.
area = pd.Series(
    data={
        'California': 423967,
        'Texas': 695662,
        'New York': 141297,
        'Florida': 170312,
        'Illinois': 149995,
    }
)
population = pd.Series(
    data={
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
        'Florida': 19552860,
        'Illinois': 12882135,
    }
)

# Each Series becomes one column of the combined frame; the shared state
# labels become the row index.
stateDf = pd.DataFrame(data={"area": area, "population": population})
stateDf

Unnamed: 0,area,population
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [26]:
# Row labels and column labels are both pandas Index objects.
print(stateDf.index, stateDf.columns, sep="\n")

# An Index can be converted to a plain Python list or to a NumPy array.
print(stateDf.columns.to_list(), type(stateDf.columns.to_list()))
print(stateDf.columns.to_numpy(), type(stateDf.columns.to_numpy()))

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['area', 'population'], dtype='object')
['area', 'population'] <class 'list'>
['area' 'population'] <class 'numpy.ndarray'>


A DataFrame can also be created from a NumPy structured array.

In [27]:
# Structured array of five zero-filled records, each with an integer
# field 'A' and a float field 'B'.
a = np.zeros(shape=(5,), dtype=np.dtype([('A', 'int'), ('B', 'float')]))
a

array([(0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.)],
      dtype=[('A', '<i8'), ('B', '<f8')])

In [28]:
# Each named field of the structured array ('A', 'B') becomes a DataFrame column.
pd.DataFrame(a)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


A DataFrame's index is itself an immutable object. It can be created separately and shared between multiple DataFrames.

In [29]:
# A standalone Index, created independently of any Series or DataFrame.
ind = pd.Index(data=['A', 'B', 'C'])
ind

Index(['A', 'B', 'C'], dtype='object')

An Index is immutable; its elements cannot be modified in place.

In [32]:
# Item assignment on an Index raises TypeError because Index objects are immutable.
# The error is the point of this cell, so catch it and print the message instead
# of letting the exception stop a "Restart & Run All".
try:
    ind[2] = 'G'
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Index does not support mutable operations

In [33]:
# The same Index object labels the rows of two otherwise unrelated frames.
print(
    pd.DataFrame(data=["Maths", "English", "Science"], index=ind),
    pd.DataFrame(data=["Burger", "Apple", "Candy"], index=ind),
    sep="\n",
)


         0
A    Maths
B  English
C  Science
        0
A  Burger
B   Apple
C   Candy


In [34]:
# Labelled Series used by the dictionary-style examples that follow.
data = pd.Series(data=[1, 4, 6.3, 10], index=list('ABCD'))
data

A     1.0
B     4.0
C     6.3
D    10.0
dtype: float64

In [35]:
# Look up a single value by its index label, dictionary-style.
data['B']

np.float64(4.0)

In [36]:
# `in` tests membership against the index labels (like dict keys), not the values.
'A' in data

True

In [37]:
# keys() returns the index labels; unpack them into a plain list for comparison.
print(data.keys())
print([*data.keys()])

Index(['A', 'B', 'C', 'D'], dtype='object')
['A', 'B', 'C', 'D']


In [38]:
# Materialise the (label, value) pairs yielded by items() into a list.
[*data.items()]

[('A', 1.0), ('B', 4.0), ('C', 6.3), ('D', 10.0)]

In [39]:
# masking: keep only the entries whose value is greater than 1.5
data[data.gt(1.5)]

B     4.0
C     6.3
D    10.0
dtype: float64

### loc and iloc (`ix` was removed in pandas 1.0)

In [40]:
data = pd.Series(data=[1, 43.5, 6.3, 10, 11], index=list('ABCDE'))
print(data)

# .loc selects by label; a label slice includes its end point ('D').
print(data.loc['A'])
print(data.loc['A':'D'])

# .iloc selects by integer position, regardless of the labels.
print(data.iloc[0])
print(data.iloc[1])

A     1.0
B    43.5
C     6.3
D    10.0
E    11.0
dtype: float64
1.0
A     1.0
B    43.5
C     6.3
D    10.0
dtype: float64
1.0
43.5


In [41]:
# Re-create the area/population frame (the same construction appears earlier in
# this notebook), so `stateDf` here has only the 'area' and 'population' columns.
area = pd.Series({'California': 423967, 'Texas': 695662, 'New York': 141297,
                  'Florida': 170312, 'Illinois': 149995})
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127, 'Florida': 19552860,
                        'Illinois': 12882135})

# Each Series becomes a column; the shared state names become the row index.
stateDf = pd.DataFrame({"area": area, "population": population})
stateDf

Unnamed: 0,area,population
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [42]:
# The frame's data as a 2D NumPy array (one row per state). to_numpy() is the
# accessor the pandas documentation recommends over the older .values attribute.
stateDf.to_numpy()

array([[  423967, 38332521],
       [  695662, 26448193],
       [  141297, 19651127],
       [  170312, 19552860],
       [  149995, 12882135]])

In [43]:
# Transpose: the state labels become the columns and the column names become the rows.
stateDf.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967,695662,141297,170312,149995
population,38332521,26448193,19651127,19552860,12882135


In [44]:
# Add a derived column: element-wise population divided by area, aligned on the state index.
stateDf['density'] = stateDf['population'].div(stateDf['area'])
stateDf

Unnamed: 0,area,population,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [45]:
# Select one row by label. The row comes back as a Series; its dtype is float64
# here because the columns mix integers and floats.
stateDf.loc['Texas']

area          6.956620e+05
population    2.644819e+07
density       3.801874e+01
Name: Texas, dtype: float64

In [46]:
# Select the same row by integer position (position 1 is 'Texas').
stateDf.iloc[1]

area          6.956620e+05
population    2.644819e+07
density       3.801874e+01
Name: Texas, dtype: float64

In [47]:
# Label slices on both axes: rows from 'Texas' to the end, columns 'area'
# through 'population' (both end labels included).
stateDf.loc['Texas': , 'area': 'population']

Unnamed: 0,area,population
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [48]:
# A row label slice combined with an explicit list of column labels.
stateDf.loc['Texas': , ['population', 'density']]

Unnamed: 0,population,density
Texas,26448193,38.01874
New York,19651127,139.076746
Florida,19552860,114.806121
Illinois,12882135,85.883763


In [49]:
# Single-bracket column access returns that column as a Series.
stateDf['density']

California     90.413926
Texas          38.018740
New York      139.076746
Florida       114.806121
Illinois       85.883763
Name: density, dtype: float64

In [50]:
# Boolean mask: keep only the rows whose density exceeds 100.
stateDf.loc[stateDf['density'] > 100]

Unnamed: 0,area,population,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [51]:
# All rows (`:`), but only the listed columns.
stateDf.loc[:, ['area', 'density']]

Unnamed: 0,area,density
California,423967,90.413926
Texas,695662,38.01874
New York,141297,139.076746
Florida,170312,114.806121
Illinois,149995,85.883763
