In [3]:
import numpy as np
import pandas as pd

In [4]:
pd.__version__

'2.3.3'

## Series

In [5]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
s.values

array([ 1.,  3.,  5., nan,  6.,  8.])

In [7]:
s.index

RangeIndex(start=0, stop=6, step=1)

#### Panda Series vs. Numpy Array
Panda series have explicitly defined index (or label) associated with the values, which makes Panda series different from Numpy array


In [8]:
data = pd.Series([1, 4, 6.3, 10], index=['A', 'B', 'C', 'D'])
data

A     1.0
B     4.0
C     6.3
D    10.0
dtype: float64

The above data series may look similar to a regular python dictionay, but unlike dictionary, panda series can be sliced
 

In [9]:
data['A': 'C']

A    1.0
B    4.0
C    6.3
dtype: float64

## DataFrame
It is analogous to 2D array in numpy, but with both flexible row indices and flexible column names

In [10]:
df = pd.DataFrame(['A', 'B', 'C'], index=[10, 20, 30], columns=["letter"])
df

Unnamed: 0,letter
10,A
20,B
30,C


In [11]:
stateDf = pd.DataFrame({'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]})
stateDf

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [12]:
area = pd.Series({'California': 423967, 'Texas': 695662, 'New York': 141297,
                  'Florida': 170312, 'Illinois': 149995})
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127, 'Florida': 19552860,
                        'Illinois': 12882135})

stateDf = pd.DataFrame({"area": area, "population": population})
stateDf

Unnamed: 0,area,population
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [13]:
print(stateDf.index)
print(stateDf.columns)

print(stateDf.columns.to_list(), type(stateDf.columns.to_list()))
print(stateDf.columns.to_numpy(), type(stateDf.columns.to_numpy()))

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['area', 'population'], dtype='object')
['area', 'population'] <class 'list'>
['area' 'population'] <class 'numpy.ndarray'>


DataFrame can be created using Numpy structured array

In [14]:
a = np.zeros([5], dtype=[('A', 'int'), ('B', 'float')])
a

array([(0, 0.), (0, 0.), (0, 0.), (0, 0.), (0, 0.)],
      dtype=[('A', '<i8'), ('B', '<f8')])

In [15]:
pd.DataFrame(a)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


DataFrame index is an immutable object in itself. It can be created separately, and shared between multiple Dataframes

In [16]:
ind = pd.Index(['A', 'B', 'C'])
ind

Index(['A', 'B', 'C'], dtype='object')

Index is immutable; cannot be modified

In [17]:
ind[2] = 'G'

TypeError: Index does not support mutable operations

In [18]:
print(pd.DataFrame(["Maths", "English", "Science"], index=ind))
print(pd.DataFrame(["Burger", "Apple", "Candy"], index=ind))


         0
A    Maths
B  English
C  Science
        0
A  Burger
B   Apple
C   Candy


In [19]:
data = pd.Series([1, 4, 6.3, 10], index=['A', 'B', 'C', 'D'])
data

A     1.0
B     4.0
C     6.3
D    10.0
dtype: float64

In [20]:
data['B']

np.float64(4.0)

In [21]:
'A' in data

True

In [22]:
print(data.keys())
print(list(data.keys()))

Index(['A', 'B', 'C', 'D'], dtype='object')
['A', 'B', 'C', 'D']


In [23]:
list(data.items())

[('A', 1.0), ('B', 4.0), ('C', 6.3), ('D', 10.0)]

In [24]:
# masking
data[data > 1.5]

B     4.0
C     6.3
D    10.0
dtype: float64

### loc, iloc and ix

In [25]:
data = pd.Series([1, 43.5, 6.3, 10, 11], index=['A', 'B', 'C', 'D', 'E'])
print(data)
print(data.loc['A'])
print(data.loc['A': 'D'])
print(data.iloc[0])
print(data.iloc[1])

A     1.0
B    43.5
C     6.3
D    10.0
E    11.0
dtype: float64
1.0
A     1.0
B    43.5
C     6.3
D    10.0
dtype: float64
1.0
43.5


In [26]:
area = pd.Series({'California': 423967, 'Texas': 695662, 'New York': 141297,
                  'Florida': 170312, 'Illinois': 149995})
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127, 'Florida': 19552860,
                        'Illinois': 12882135})

stateDf = pd.DataFrame({"area": area, "population": population})
stateDf

Unnamed: 0,area,population
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [27]:
stateDf.values

array([[  423967, 38332521],
       [  695662, 26448193],
       [  141297, 19651127],
       [  170312, 19552860],
       [  149995, 12882135]])

In [28]:
stateDf.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967,695662,141297,170312,149995
population,38332521,26448193,19651127,19552860,12882135


In [29]:
stateDf['density'] = stateDf['population'] / stateDf['area']
stateDf

Unnamed: 0,area,population,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [30]:
stateDf.loc['Texas']

area          6.956620e+05
population    2.644819e+07
density       3.801874e+01
Name: Texas, dtype: float64

In [31]:
stateDf.iloc[1]

area          6.956620e+05
population    2.644819e+07
density       3.801874e+01
Name: Texas, dtype: float64

In [32]:
stateDf.loc['Texas': , 'area': 'population']

Unnamed: 0,area,population
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [33]:
stateDf.loc['Texas': , ['population', 'density']]

Unnamed: 0,population,density
Texas,26448193,38.01874
New York,19651127,139.076746
Florida,19552860,114.806121
Illinois,12882135,85.883763


In [34]:
stateDf['density']

California     90.413926
Texas          38.018740
New York      139.076746
Florida       114.806121
Illinois       85.883763
Name: density, dtype: float64

In [35]:
stateDf[stateDf['density'] > 100]

Unnamed: 0,area,population,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [36]:
stateDf.loc[:, ['area', 'density']]

Unnamed: 0,area,density
California,423967,90.413926
Texas,695662,38.01874
New York,141297,139.076746
Florida,170312,114.806121
Illinois,149995,85.883763


In [37]:
points = pd.Series([4, 8, 15, 16, 23, 42])
points = points / 4
points

0     1.00
1     2.00
2     3.75
3     4.00
4     5.75
5    10.50
dtype: float64

Any item which is not present in one of the two series is assigned NaN after the operation

In [38]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127}, name='population')

population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

### Handling missing data

In [39]:
a = pd.Series(np.random.randint(0, 10, 4), index=list("ABCD"))
b = pd.Series(np.random.randint(0, 10, 4), index=list("ABCE"))
print("a =\n", a)
print("b =\n", b)
c = a+b
print("c =\n", c)

a =
 A    2
B    6
C    6
D    5
dtype: int32
b =
 A    3
B    8
C    2
E    3
dtype: int32
c =
 A     5.0
B    14.0
C     8.0
D     NaN
E     NaN
dtype: float64


### Fill the missing values with 0 

In [40]:
# fills the missing indices with 0 and then add the series
a.add(b, fill_value=0)

A     5.0
B    14.0
C     8.0
D     5.0
E     3.0
dtype: float64

In [41]:
mean = a.mean()
print("Mean of series a =", mean)
a.add(b, fill_value=mean)

Mean of series a = 4.75


A     5.00
B    14.00
C     8.00
D     9.75
E     7.75
dtype: float64

In case of DataFrame, we need to stack the dataframe 'A' in order to calculate its mean

In [42]:
A = pd.DataFrame(np.random.randint(0, 10, [2, 3]), index=list("XY"))
print("A =", "\n", A)
B = pd.DataFrame(np.random.randint(0, 10, [2, 3]), index=list("XZ"))
print("B =", "\n", B)


A = 
    0  1  2
X  6  4  7
Y  3  8  9
B = 
    0  1  2
X  0  6  2
Z  1  0  1


In [43]:
C = A + B
C

Unnamed: 0,0,1,2
X,6.0,10.0,9.0
Y,,,
Z,,,


### Filling the missing values with Mean value

In [44]:
meanVal = A.stack().mean()
print("Mean of A =", meanVal)
A.add(B, fill_value=meanVal)


Mean of A = 6.166666666666667


Unnamed: 0,0,1,2
X,6.0,10.0,9.0
Y,9.166667,14.166667,15.166667
Z,7.166667,6.166667,7.166667


In [45]:
A = pd.DataFrame([[1, 2, 4], [8, 9, 0]], index=list("XY"))
print("A =", "\n", A)

A = 
    0  1  2
X  1  2  4
Y  8  9  0


In [46]:
print("A.index =", A.index.to_list())
print("A.columns =", A.columns.to_list())
print("A.loc['X'].index", A.loc['X'].index.to_list())

A.index = ['X', 'Y']
A.columns = [0, 1, 2]
A.loc['X'].index [0, 1, 2]


In [47]:
A.subtract(A.loc['X'], axis=0)

Unnamed: 0,0,1,2
X,,,
Y,,,
0,,,
1,,,
2,,,


axis=0 means aligning index of Series A.loc['X'] with **index of DataFrame A** \
Index of A = [X, Y] \
Index of A.loc['X'] = [0, 1, 2] \
Because of no overlap the above result is calculated

In [None]:
A = pd.DataFrame([[1, 2, 4], [8, 9, 0]], index=list("XY"))
print("A =", "\n", A)

A = 
    0  1  2
X  1  2  4
Y  8  9  0


In [None]:
A.subtract(A.loc['X'], axis=1)

Unnamed: 0,0,1,2
X,0,0,0
Y,7,7,-4


axis=1 means aligning index of Series A.loc['X'] with the **columns of DataFrame A** \
Index of A.loc['X'] = [0, 1, 2] \
Column of A = [0, 1, 2]

In [53]:
A = [[1, 2, 4], [8, 9, np.nan]]