In [136]:
import numpy as np
import pandas as pd

### Operation between Seris and DataFrame

In [137]:
rng = np.random.RandomState(42)
A = rng.randint(10, size=(3,4))
print(A)

[[6 3 7 4]
 [6 9 2 6]
 [7 4 3 7]]


In [138]:
print(A[0])

[6 3 7 4]


In [139]:
A - A[0]

array([[ 0,  0,  0,  0],
       [ 0,  6, -5,  2],
       [ 1,  1, -4,  3]])

In [140]:
# Create dataframe
df = pd.DataFrame(A, columns=list('QRST'))
print(df)

   Q  R  S  T
0  6  3  7  4
1  6  9  2  6
2  7  4  3  7


In [141]:
df.iloc[0]

Q    6
R    3
S    7
T    4
Name: 0, dtype: int32

In [142]:
# Default the operation is row-wise
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


In [143]:
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [144]:
# Subtract 1 of the column elements to rest of the column elements
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,3,0,4,1
1,-3,0,-7,-3
2,3,0,-1,3


In [145]:
# Subtract 1 element of a column to rest of the elements in column 
halfrow = df.iloc[0, ::2]
print(halfrow)

Q    6
S    7
Name: 0, dtype: int32


In [146]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,0.0,,-5.0,
2,1.0,,-4.0,


### Operation of the NULL values
- Pandas provide several useful function for detecting, removing and replacing null values
- 'isnull()' generate the boolean mask for indexing missing values
- 'notnull()' opposite value of 'isnull()'
- 'fillna()' fill the missing values

#### Detecting null values

In [147]:
data = pd.Series([0, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

#### Filter out of null values

In [148]:
data[data.notnull()]

0        0
2    hello
dtype: object

#### Remove of NA values for Seris

In [149]:
ser1 = data.dropna()
print(ser1)

0        0
2    hello
dtype: object


### Operation on DataFrame

In [150]:
df = pd.DataFrame(
    [[1, np.nan, 2],
     [2, 3, 5],
     [np.nan, 4, 6]]
)

print(df)

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [151]:
# By default is row-wise
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [152]:
# By default is column-wise
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [153]:
# Creating the nan value colum
df[3] = np.nan

### Drop only if all data is NaN

In [154]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [155]:
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [156]:
# Specifying the minimum number we want to keep to remove NaN
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


### Filling null values

#### Series

In [157]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(data)

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64


In [158]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [159]:
# Forward fill
data.fillna(method='ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [160]:
# Backward fill
data.fillna(method='bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

#### Dataframe

In [161]:
df.fillna(method='ffill', axis=0)

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


In [162]:
df.fillna(method='ffill', axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


### Hierarchical Indexing

#### Multiple Indexed Series

In [163]:
# Using python tunple as key
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]

In [164]:
population = [33871648, 37253956,
              18976457, 19378102,
              20851820, 25145561]

In [165]:
pop = pd.Series(population, index=index)
print(pop)

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64


In [166]:
pop[('New York', 2000):('Texas', 2010)]

(New York, 2000)    18976457
(New York, 2010)    19378102
(Texas, 2000)       20851820
(Texas, 2010)       25145561
dtype: int64

In [167]:
index = pd.MultiIndex.from_tuples(index)
print(index)

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )


In [168]:
pop = pop.reindex(index)
print(pop)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


In [169]:
pop['New York':'Texas']

New York  2000    18976457
          2010    19378102
Texas     2000    20851820
          2010    25145561
dtype: int64

In [170]:
pop[:, 2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [171]:
# Convert multiple-index series to dataframe using 'unstack()'
pop_df = pop.unstack()
print(pop_df)

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


In [172]:
# What if we want to add another column of demographic data (under 18)
pop_df = pd.DataFrame({'total': pop,
                      'under18': [9267089, 9284094,
                                  4687371, 4318033,
                                  5906301, 6879014]
})

print(pop_df)

                    total  under18
California 2000  33871648  9267089
           2010  37253956  9284094
New York   2000  18976457  4687371
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2010  25145561  6879014


In [173]:
f_u18 = pop_df['under18'] / pop_df['total']

In [174]:
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Methods of multiIndex creation
- List
- Dictionary
- MultiIndex Constructor

#### By list

In [175]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])

print(df)

        data1     data2
a 1  0.383257  0.763249
  2  0.958699  0.996176
b 1  0.911243  0.198010
  2  0.956327  0.009570


#### By dictionary

In [176]:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        (  'New York', 2000): 18976457,
        (  'New York', 2010): 19378102,
        (     'Texas', 2000): 20851820,
        (     'Texas', 2010): 25145561
}

print(data)

{('California', 2000): 33871648, ('California', 2010): 37253956, ('New York', 2000): 18976457, ('New York', 2010): 19378102, ('Texas', 2000): 20851820, ('Texas', 2010): 25145561}


In [177]:
pd.Series(data)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

#### By explicit multiIndex constructors

In [178]:
# From the list of arrays
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [179]:
# From the list of tuples
pd.MultiIndex.from_tuples(
    [('a', 1),
     ('a', 2),
     ('b', 1),
     ('b', 2)]
)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [180]:
# From a cartesian product of single series
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

### MultiIndex level names

In [181]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [182]:
pop.index.names = ['state', 'year']
print(pop)

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


### MultiIndex for columns

In [197]:
# Create medical data
# Hierarchical indices and columns

index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], 
                                   names=['year', 'visit'])

columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], 
                                      ['HR', 'Temp']], 
                                     names=['patient', 'measurement'])

In [198]:
# Create data
data = np.round(np.random.uniform(35, 40, size=(4, 6)), 1)
print(data)

[[35.6 38.5 39.6 35.5 35.3 38.6]
 [38.2 37.5 39.7 35.5 39.7 35.3]
 [37.5 38.  38.7 37.5 36.8 36.7]
 [39.7 39.2 39.7 36.3 39.9 38.4]]


In [199]:
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)

patient       Bob       Guido         Sue      
measurement    HR  Temp    HR  Temp    HR  Temp
year visit                                     
2013 1       35.6  38.5  39.6  35.5  35.3  38.6
     2       38.2  37.5  39.7  35.5  39.7  35.3
2014 1       37.5  38.0  38.7  37.5  36.8  36.7
     2       39.7  39.2  39.7  36.3  39.9  38.4


In [205]:
# Access a person
health_data.loc[:,'Guido']

Unnamed: 0_level_0,measurement,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,39.6,35.5
2013,2,39.7,35.5
2014,1,38.7,37.5
2014,2,39.7,36.3


In [200]:
# Access the data of a person from a particular year
health_data.loc[2013, 'Guido']

measurement,HR,Temp
visit,Unnamed: 1_level_1,Unnamed: 2_level_1
1,39.6,35.5
2,39.7,35.5


In [207]:
# Recover patient heart rate
health_data['Guido', 'HR']

year  visit
2013  1        39.6
      2        39.7
2014  1        38.7
      2        39.7
Name: (Guido, HR), dtype: float64

In [213]:
# Acess first 2 rows and first 4 columns
health_data.iloc[:2, :4]

Unnamed: 0_level_0,patient,Bob,Bob,Guido,Guido
Unnamed: 0_level_1,measurement,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2013,1,35.6,38.5,39.6,35.5
2013,2,38.2,37.5,39.7,35.5


In [214]:
# Access the heart rate of GUido using slicing

health_data.loc[:, ('Guido', 'HR')]

year  visit
2013  1        39.6
      2        39.7
2014  1        38.7
      2        39.7
Name: (Guido, HR), dtype: float64

In [215]:
# Access the heart rate of all patients first visit using tuple of multiple indices

health_data.loc[(:, 1), (:, 'HR')]

SyntaxError: invalid syntax (3622483709.py, line 3)

In [219]:
# IndexSlice

idx = pd.IndexSlice

health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,patient,Bob,Guido,Sue
Unnamed: 0_level_1,measurement,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,35.6,39.6,35.3
2014,1,37.5,38.7,36.8


In [232]:
# Access the heart rate of Bon's first visit in 2014 using IndexSlice

health_data.loc[idx[2014, 1], idx['Bob', 'HR']]

37.5

In [233]:
# Access the heart rate and temperature of Bon and Sue's first visit in 2014 using IndexSlice

health_data.loc[idx[2014, 1], idx[['Bob', 'Sue'], 'HR':'Temp']]

patient  measurement
Bob      HR             37.5
         Temp           38.0
Sue      HR             36.8
         Temp           36.7
Name: (2014, 1), dtype: float64

### Data Aggregation on Mutiple Index

In [234]:
health_data

Unnamed: 0_level_0,patient,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,measurement,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,35.6,38.5,39.6,35.5,35.3,38.6
2013,2,38.2,37.5,39.7,35.5,39.7,35.3
2014,1,37.5,38.0,38.7,37.5,36.8,36.7
2014,2,39.7,39.2,39.7,36.3,39.9,38.4


In [237]:
# Average out the measurement in the two visits each year

data_mean = health_data.groupby(level='year').median()
print(data_mean)

patient       Bob        Guido          Sue       
measurement    HR  Temp     HR  Temp     HR   Temp
year                                              
2013         36.9  38.0  39.65  35.5  37.50  36.95
2014         38.6  38.6  39.20  36.9  38.35  37.55


In [246]:
data_mean.groupby(axis=1, level='measurement').median()

measurement,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,37.5,36.95
2014,38.6,37.55


### Combining datasets: Concatente

In [248]:
# One dimensional array
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]

np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [251]:
# Two dimensional array
x = [[1, 2], [3, 4]]

np.concatenate([x, x], axis=0)

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [250]:
# Two dimensional array
x = [[1, 2], [3, 4]]

np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

### Concatenation in series and dataframe

In [252]:
ser1 = pd.Series(['A', 'B', 'C', 'D'], index=[1, 2, 3, 4])
ser2 = pd.Series(['D', 'E', 'F', 'G'], index=[5, 6, 7, 8])

pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    D
6    E
7    F
8    G
dtype: object

In [259]:
# Create dataframe
df1 = pd.DataFrame(np.arange(0, 9).reshape(3, 3), index=[1, 2, 3], columns=['a', 'b', 'c'])
df1

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8


In [260]:
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3), index=[4, 5, 6], columns=['a', 'b', 'c'])
df2

Unnamed: 0,a,b,c
4,9,10,11
5,12,13,14
6,15,16,17


In [262]:
df3 = pd.DataFrame(np.arange(18, 27).reshape(3, 3), index=[1, 2, 3], columns=['e', 'f', 'g'])

In [264]:
pd.concat([df1, df2], axis=0)

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8
4,9,10,11
5,12,13,14
6,15,16,17


In [266]:
pd.concat([df1, df3], axis=1)

Unnamed: 0,a,b,c,e,f,g
1,0,1,2,18,19,20
2,3,4,5,21,22,23
3,6,7,8,24,25,26
