# Python Pandas CheatSheet
<img src="../sample_files/logos/pandas.svg" width="400" />
The Pandas library is built  on NumPy and provieds easy-to-use data structures and data analysis tools for Python

In [1]:
import pandas as pd
import numpy as np

# Reshaping Data

### Pivot

In [2]:
data = { 'Date' : ['2016-03-01', '2016-03-02', '2016-03-01', '2016-03-03', '2016-03-02', '2016-03-03'],
         'Type' : ['a', 'b', 'c', 'a', 'a', 'c'],
         'Value': [11.432, 13.031, 20.784, 99.906, 1.303, 20.784]}
df2 = pd.DataFrame(data)
df2

Unnamed: 0,Date,Type,Value
0,2016-03-01,a,11.432
1,2016-03-02,b,13.031
2,2016-03-01,c,20.784
3,2016-03-03,a,99.906
4,2016-03-02,a,1.303
5,2016-03-03,c,20.784


<img src="../sample_files/images/pandas_pivot.png" width="400" />

In [3]:
df3= df2.pivot(index='Date',    # Spread rows into columns
               columns='Type',
               values='Value')
df3

Type,a,b,c
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-03-01,11.432,,20.784
2016-03-02,1.303,13.031,
2016-03-03,99.906,,20.784


### Pivot Table

In [4]:
df4 = pd.pivot_table(df2,             # Spread rows into columns
                     index='Type',
                     columns='Date',
                     values='Value')
df4

Date,2016-03-01,2016-03-02,2016-03-03
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,11.432,1.303,99.906
b,,13.031,
c,20.784,,20.784


### Stack / Unstack

<img src="../sample_files/images/pandas_stack_unstack.png" width="300" />

In [5]:
arrays = [np.array([1,2,3]),
          np.array([5,4,3])]
df5 = pd.DataFrame(np.random.rand(3, 2), index=arrays)
df5

Unnamed: 0,Unnamed: 1,0,1
1,5,0.143861,0.567687
2,4,0.676252,0.871133
3,3,0.078842,0.938091


In [6]:
stacked = df5.stack() # Pivot a level of column labels
stacked

1  5  0    0.143861
      1    0.567687
2  4  0    0.676252
      1    0.871133
3  3  0    0.078842
      1    0.938091
dtype: float64

In [7]:
unstacked = stacked.unstack() # Pivot a level of index labels
unstacked

Unnamed: 0,Unnamed: 1,0,1
1,5,0.143861,0.567687
2,4,0.676252,0.871133
3,3,0.078842,0.938091


### Melt
<img src="../sample_files/images/pandas_melt.png" width="400" />

In [8]:
df6 = pd.melt(df2,                   # Gather columns into rows
        id_vars=["Date"],
        value_vars=["Type", "Value"],
        value_name="Observations")
df6

Unnamed: 0,Date,variable,Observations
0,2016-03-01,Type,a
1,2016-03-02,Type,b
2,2016-03-01,Type,c
3,2016-03-03,Type,a
4,2016-03-02,Type,a
5,2016-03-03,Type,c
6,2016-03-01,Value,11.432
7,2016-03-02,Value,13.031
8,2016-03-01,Value,20.784
9,2016-03-03,Value,99.906


## Iteration

In [9]:
for item in df2.iteritems(): # (Column-index, Series) pairs
  print(item)


('Date', 0    2016-03-01
1    2016-03-02
2    2016-03-01
3    2016-03-03
4    2016-03-02
5    2016-03-03
Name: Date, dtype: object)
('Type', 0    a
1    b
2    c
3    a
4    a
5    c
Name: Type, dtype: object)
('Value', 0    11.432
1    13.031
2    20.784
3    99.906
4     1.303
5    20.784
Name: Value, dtype: float64)


In [10]:
for row in df2.iterrows():  # (Row-index, Series) pairs
  print(row)

(0, Date     2016-03-01
Type              a
Value        11.432
Name: 0, dtype: object)
(1, Date     2016-03-02
Type              b
Value        13.031
Name: 1, dtype: object)
(2, Date     2016-03-01
Type              c
Value        20.784
Name: 2, dtype: object)
(3, Date     2016-03-03
Type              a
Value        99.906
Name: 3, dtype: object)
(4, Date     2016-03-02
Type              a
Value         1.303
Name: 4, dtype: object)
(5, Date     2016-03-03
Type              c
Value        20.784
Name: 5, dtype: object)


## Advanced Indexing

In [11]:
s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
print("s = \n{}".format(s))
data = {'Country': ['Belgium', 'India', 'Brazil'],
        'Capital': ['Brussels', 'New Delhi', 'Brasília'],
        'Population': [11190846, 1303171035, 207847528]}
df = pd.DataFrame(data,
                  columns=['Country', 'Capital', 'Population'])
print("df = \n{}".format(df))

data = {'second': [1,2,3,4],
        'first': [4,3,2,1]}
df6 = pd.DataFrame(data)
print("df6 = \n{}".format(df6))

s = 
a    3
b   -5
c    7
d    4
dtype: int64
df = 
   Country    Capital  Population
0  Belgium   Brussels    11190846
1    India  New Delhi  1303171035
2   Brazil   Brasília   207847528
df6 = 
   second  first
0       1      4
1       2      3
2       3      2
3       4      1


In [12]:
# Selecting
ans = df3.loc[:,(df3>1).any()]        # Select cols with any vals > 1
print("df3.loc[:,(df3>1).any()] = \n{}\n".format(ans))
ans = df3.loc[:,(df3>1).all()]        # Select cols with vals > 1
print("df3.loc[:,(df3>1).all()] = \n{}\n".format(ans))
ans = df3.loc[:,df3.isnull().any()]   # Select cols with NaN
print("df3.loc[:,df3.isnull().any()] = \n{}\n".format(ans))
ans = df3.loc[:,df3.notnull().all()]  # Select cols without NaN
print("df3.loc[:,df3.notnull().all()]] = \n{}\n".format(ans))

# Indexing With isin
ans = df[(df.Country.isin(df2.Type))] # Find same elements
print("df[(df.Country.isin(df2.Type))] = \n{}\n".format(ans))
ans = df3.filter(items=['a','b'])      # Filter on values
print("df3.filter(items=['a','b']) = \n{}\n".format(ans))
ans = df.select(lambda x: not x%5)    # Select specific elements
print("df.select(lambda x: not x%5) = \n{}\n".format(ans))

# Where
ans = s.where(s > 0)                  # Subset the data
print("s.where(s > 0) = \n{}\n",format(ans))

# Query
ans = df6.query('second > first')     # Query DataFrame
print("df6.query('second > first') = \n{}\n".format(ans))

df3.loc[:,(df3>1).any()] = 
Type             a       b       c
Date                              
2016-03-01  11.432     NaN  20.784
2016-03-02   1.303  13.031     NaN
2016-03-03  99.906     NaN  20.784

df3.loc[:,(df3>1).all()] = 
Type             a
Date              
2016-03-01  11.432
2016-03-02   1.303
2016-03-03  99.906

df3.loc[:,df3.isnull().any()] = 
Type             b       c
Date                      
2016-03-01     NaN  20.784
2016-03-02  13.031     NaN
2016-03-03     NaN  20.784

df3.loc[:,df3.notnull().all()]] = 
Type             a
Date              
2016-03-01  11.432
2016-03-02   1.303
2016-03-03  99.906

df[(df.Country.isin(df2.Type))] = 
Empty DataFrame
Columns: [Country, Capital, Population]
Index: []

df3.filter(items=['a','b']) = 
Type             a       b
Date                      
2016-03-01  11.432     NaN
2016-03-02   1.303  13.031
2016-03-03  99.906     NaN

df.select(lambda x: not x%5) = 
   Country   Capital  Population
0  Belgium  Brussels    11190846

s.wh

  app.launch_new_instance()


### Setting / Resetting Index

In [13]:
df.set_index('Country')    # Set the index
print("{}\n".format(df))
df5 = df.reset_index(
)     # Reset the index
print("{}\n".format(df5))
df6 = df.rename(index=str,  # Rename DataFrame
                columns={"Country":"cntry",
                "Capital":"cptl",
                "Population":"ppltn"})
print("{}\n".format(df6))

   Country    Capital  Population
0  Belgium   Brussels    11190846
1    India  New Delhi  1303171035
2   Brazil   Brasília   207847528

   index  Country    Capital  Population
0      0  Belgium   Brussels    11190846
1      1    India  New Delhi  1303171035
2      2   Brazil   Brasília   207847528

     cntry       cptl       ppltn
0  Belgium   Brussels    11190846
1    India  New Delhi  1303171035
2   Brazil   Brasília   207847528



### Reindexing

In [14]:
s2 = s.reindex(['a','c','d','e','b'])
s2

a    3.0
c    7.0
d    4.0
e    NaN
b   -5.0
dtype: float64

In [15]:
# Forward filling
df.reindex(range(4), method='ffill')
print("{}\n".format(df))

# Backwards filling
s3 = s.reindex(range(5), method='bfill')
print("{}\n".format(s3))

   Country    Capital  Population
0  Belgium   Brussels    11190846
1    India  New Delhi  1303171035
2   Brazil   Brasília   207847528



TypeError: '>' not supported between instances of 'int' and 'str'

### Multiindexing

In [None]:
arrays = [np.array([1,2,3]),np.array([5,4,3])]
df5 = pd.DataFrame(np.random.rand(3, 2), index=arrays)
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples,names=['first', 'second'])
df6 = pd.DataFrame(np.random.rand(3, 2), index=index)
df2.set_index(["Date", "Type"])

## Duplicate Data

In [None]:
s3.unique()                              # Return unique values
df2.duplicated('Type')                   # Check duplicates
df2.drop_duplicates('Type', keep='last') # Drop duplicates
df.index.duplicated()                    # Check index duplicates

## Grouping Data

In [None]:
# Aggregation
df2.groupby(by=['Date','Type']).mean()
df4.groupby(level=0).sum()
df4.groupby(level=0).agg({'a':lambda x:sum(x)/len(x),'b': np.sum})

# Transformation
customSum = lambda x: (x+x%2)
df4.groupby(level=0).transform(customSum)

## Missing Data

In [None]:
df.dropna()            # Drop NaN values
df3.fillna(df3.mean()) # Fill NaN values with a predetermined value
df2.replace("a", "f")  # Replace values with others

## Combininig Data

### Merge

In [None]:
data = {'X1': ['a','b','c'],
        'X2': [11.432, 1.303,  99.906]}
data1 = pd.DataFrame(data, columns=['X1', 'X2'])
data1

In [None]:
data = {'X1': ['a','b','d'],
        'X3': [20.784, None, 20.784]}
data2 = pd.DataFrame(data, columns=['X1', 'X3'])
data2

In [None]:
pd.merge(data1, data2, how='left', on='X1')

In [None]:
pd.merge(data1, data2, how='right', on='X1')

In [None]:
pd.merge(data1, data2, how='inner', on='X1')

In [None]:
pd.merge(data1, data2, how='outer', on='X1')

### Join

 ### Concatenate

## Dates

## Visualization