# Getting Started with pandas

pandas contains high-level data structures and manipulation tools designed to make data analysis fast and easy in Python. It is built on top of NumPy, which makes it easy to use in NumPy-centric applications.

In [1]:
import pandas  # plain module name, used by the very first example; aliased as `pd` in a later cell
import numpy as np  # required by the np.exp / np.log cells, which run before the later numpy import cell

In [3]:
pandas.DataFrame()

In [4]:
import pandas as pd

In [5]:
pd.DataFrame()

## Introduction to pandas Data Structures

### Series

In [6]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

Index

In [10]:
# A named Series whose values are labelled 'd', 'b', 'a', 'c' instead of 0..3.
obj2 = pd.Series([4, 7, -5, 3], index=list('dbac'), name='first')
obj2

d    4
b    7
a   -5
c    3
Name: first, dtype: int64

In [8]:
obj2['a']

-5

In [9]:
obj2[['a', 'b']]

a   -5
b    7
dtype: int64

In [11]:
obj2['d'] = 6

In [12]:
obj2

d    6
b    7
a   -5
c    3
Name: first, dtype: int64

In [13]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
Name: first, dtype: int64

In [14]:
obj2 > 0

d     True
b     True
a    False
c     True
Name: first, dtype: bool

In [15]:
obj2[obj2 > 0]

d    6
b    7
c    3
Name: first, dtype: int64

In [18]:
obj2.loc['a']

-5

In [16]:
obj2.loc[lambda x : x >0]

d    6
b    7
c    3
Name: first, dtype: int64

In [22]:
# String-valued Series used for the text-matching examples that follow.
obj3 = pd.Series(["APPL1", "GGLE", "AMZN", "MCRS"], index=list('dbac'))
obj3

d    APPL1
b     GGLE
a     AMZN
c     MCRS
dtype: object

In [None]:
ticker = 'APPL'

In [24]:
obj3[obj3.str.contains('APPL.*|GG|AM', regex=True)]

d    APPL1
b     GGLE
a     AMZN
dtype: object

In [25]:
# Same tickers, but the first one now carries a prefix so the pattern no
# longer starts at the beginning of the string.
obj3 = pd.Series(["XXXXAPPL1", "GGLE", "AMZN", "MCRS"], index=list('dbac'))
obj3

d    XXXXAPPL1
b         GGLE
a         AMZN
c         MCRS
dtype: object

In [30]:
# Keep the entries whose *value* matches the pattern, ignoring letter case.
obj3[obj3.str.contains('appl.*|GG|AM', case=False, regex=True)]

d    XXXXAPPL1
b         GGLE
a         AMZN
dtype: object

In [31]:
# Keep the entries whose *label* matches the pattern, ignoring letter case.
obj3[obj3.index.str.contains('d|a', case=False, regex=True)]

d    XXXXAPPL1
a         AMZN
dtype: object

In [33]:
obj3.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [None]:
obj2

In [None]:
np.exp(obj2)

In [None]:
np.log(obj2)

In [None]:
# A string containing an apostrophe must be delimited with double quotes
# (or escape the apostrophe); 'it's' is a syntax error.
"it's"

"it's"

From dictionary to series

In [36]:
import numpy as np

In [37]:
# Building a Series from a mapping: the keys become the index labels and the
# missing NYC figure is kept as NaN.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000, NYC=np.nan)

obj3 = pd.Series(data=sdata)
obj3

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
Utah       5000.0
NYC           NaN
dtype: float64

In [41]:
obj3.isna()

Ohio      False
Texas     False
Oregon    False
Utah      False
NYC        True
dtype: bool

In [42]:
obj3.isna().sum()

1

In [43]:
obj3[
    obj3.isna()
]

NYC   NaN
dtype: float64

In [44]:
obj3.loc[lambda x: x.isna()]

NYC   NaN
dtype: float64

In [45]:
obj3[
    ~obj3.isna()
]

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
Utah       5000.0
dtype: float64

In [46]:
obj3.loc[lambda x: ~x.isna()]

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
Utah       5000.0
dtype: float64

In [47]:
obj3.dropna()

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
Utah       5000.0
dtype: float64

In [50]:
obj3.mean()

31750.0

In [51]:
obj3[obj3 > obj3.mean()]

Ohio     35000.0
Texas    71000.0
dtype: float64

In [48]:
obj3.loc[lambda x: x > x.mean()]

Ohio     35000.0
Texas    71000.0
dtype: float64

### DataFrame

A DataFrame represents a tabular, spreadsheet-like data structure containing an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The DataFrame has both a row and column index; it can be thought of as a dict of Series that all share the same index. Compared with other such DataFrame-like structures you may have used before (like R’s data.frame), row-oriented and column-oriented operations in DataFrame are treated roughly symmetrically. Under the hood, the data is stored as one or more two-dimensional blocks rather than a list, dict, or some other collection of one-dimensional arrays.

In [53]:
# Column-oriented input: every key is a column name, every list a column.
data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [57]:
frame.head(2)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7


In [55]:
frame.tail(2)

Unnamed: 0,state,year,pop
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [58]:
list(frame)

['state', 'year', 'pop']

In [60]:
frame.columns

Index(['state', 'year', 'pop'], dtype='object')

In [61]:
frame['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [72]:
frame.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [64]:
frame[['state', 'pop']]

Unnamed: 0,state,pop
0,Ohio,1.5
1,Ohio,1.7
2,Ohio,3.6
3,Nevada,2.4
4,Nevada,2.9
5,Nevada,3.2


In [73]:
frame.max()

state    Ohio
year     2003
pop       3.6
dtype: object

In [74]:
frame['pop'].max()

3.6

In [81]:
frame[frame['pop'] == frame['pop'].max()]

Unnamed: 0,state,year,pop
2,Ohio,2002,3.6


In [83]:
frame.loc[lambda x: x["pop"] == x["pop"].max()]

Unnamed: 0,state,year,pop
2,Ohio,2002,3.6


In [91]:
# Average only the numeric columns: the frame also holds the string column
# `state`, which recent pandas versions refuse to average (TypeError) instead
# of silently dropping it.
frame.loc[lambda x: x["year"] > 2000].mean(numeric_only=True)

year    2001.80
pop        2.76
dtype: float64

In [None]:
# `frame2` is not created by any earlier cell, so start it as a copy of
# `frame`; copying keeps the new column out of `frame` itself.
frame2 = frame.copy()
frame2['debt'] = np.arange(6.)
frame2

In [None]:
frame2['emp'] = np.random.normal(0, 1, 6)
frame2

### Index Objects

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/02_pandas.png?raw=true)

In [None]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj

In [None]:
obj[1:]

In [None]:
obj[-2:]

Index objects are immutable, so they cannot be modified in place: `obj.index[1] = 'd'` raises a `TypeError`.

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/01_pandas.png?raw=true)

## Essential Functionality



### Reindexing

In [None]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

In [None]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

In [None]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

- `ffill` or `pad`: fill (or carry) values forward
- `bfill` or `backfill`: fill (or carry) values backward

In [None]:
obj3.reindex(range(6), method='ffill')

In [None]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame

In [None]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

In [None]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/03_pandas.png?raw=true)

### Dropping Entries from an Axis

In [93]:
# Five floats labelled 'a'..'e', used to demonstrate drop().
obj = pd.Series(np.arange(5.), index=list('abcde'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [94]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [98]:
# 4x4 grid of the integers 0..15 with state names as row labels.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [99]:
data.drop(columns = ['one', 'two'])

Unnamed: 0,three,four
Ohio,2,3
Colorado,6,7
Utah,10,11
New York,14,15


In [100]:
data.drop('Ohio', axis=0)

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [102]:
data.sum()

one      24
two      28
three    32
four     36
dtype: int64

In [103]:
data.sum(axis = 1)

Ohio         6
Colorado    22
Utah        38
New York    54
dtype: int64

In [107]:
df = data.reindex(columns = ['two', 'three'])

In [114]:
data.reindex(columns = ['two', 'three']).std()

two      5.163978
three    5.163978
dtype: float64

In [112]:
#df.to_excel('hello.xlsb')

In [110]:
df.to_csv('hello.csv')

### Indexing, Selection, and Filtering

In [None]:
obj = pd.Series(np.arange(4.),
                index=['a', 'b', 'c', 'd']
               )
obj


In [None]:
obj['b']


In [None]:
# Select by position explicitly: on a label-indexed Series, plain obj[1]
# relies on the deprecated "integer means position" fallback.
obj.iloc[1]


In [None]:
obj[2:4]


In [None]:
obj[['b', 'a', 'd']]


In [None]:
# Positional selection of several elements; .iloc avoids the deprecated
# integer fallback of obj[[1, 3]] on a label-indexed Series.
obj.iloc[[1, 3]]


In [None]:
obj[obj < 2]

In [None]:
obj['b':'c']

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data


In [None]:
data['two']

In [None]:
data[['three', 'one']]

In [None]:
data[:2]


In [None]:
data[data['three'] > 5]

In [None]:
data < 5


In [None]:
data[data < 5] = 0
data

#### Selection with loc and iloc

In [None]:
data

In [None]:
data.loc['Colorado', ['two', 'three']]

In [None]:
data.iloc[2, [3, 0, 1]]

In [None]:
data.iloc[2]

In [None]:
data.iloc[[1, 2], [3, 0, 1]]

In [None]:
data.loc[:'Utah', 'two']


In [None]:
data.iloc[:, :3][data.three > 5]

### Integer Indexes

In [None]:
ser = pd.Series(np.arange(3.))
ser

In [None]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
# With string labels an integer can only mean a position, but spelling that
# out with .iloc avoids the deprecated integer fallback of ser2[-1].
ser2.iloc[-1]

In [None]:
ser[:1]


In [None]:
ser.loc[:1]


In [None]:
ser.iloc[:1]

### Arithmetic and Data Alignment

In [None]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s1


In [None]:
s2

In [None]:
s1 + s2

In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1

In [None]:
df2

Adding these together returns a DataFrame whose index and columns are the unions of the ones in each DataFrame:

In [None]:
df1 + df2

In [None]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df1


In [None]:
df2

In [None]:
df1 - df2

#### Arithmetic methods with fill values

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df1

In [None]:
df2

In [None]:
df1 + df2

With `fill_value=0`, a value that is missing from one of the two frames is treated as 0 before the addition:

In [None]:
df1.add(df2, fill_value=0)

In [None]:
df1.add(df2, fill_value=10)

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/04_pandas.png?raw=true)

#### Operations between DataFrame and Series

In [None]:
arr = np.arange(12.).reshape((3, 4))
arr


In [None]:
arr[0]

In [None]:
arr - arr[0]

In [None]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame


In [None]:
series

In [None]:
frame - series

In [None]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

In [None]:
series3 = frame['d']
frame


In [None]:
series3


In [None]:
frame.sub(series3, axis='index')

### Function Application and Mapping

In [120]:
frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon']
)
frame

Unnamed: 0,b,d,e
Utah,0.064911,0.778099,-1.410976
Ohio,-0.297657,-0.661085,1.098662
Texas,-0.412382,-1.33327,-0.57092
Oregon,0.392852,-0.864207,1.073834


In [121]:
frame.shape

(4, 3)

Lambda functions

$$ \text{coefficient of variation} = \frac{\text{std}}{\text{mean}} $$

In [122]:
def minmax(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` only needs to provide ``max()`` and ``min()`` methods, as a Series
    or a NumPy array does.
    """
    highest, lowest = x.max(), x.min()
    return highest - lowest

In [124]:
0.392852 - -0.412382

0.805234

In [123]:
minmax(frame['b'])

0.8052333456935847

In [125]:
frame.apply(lambda x: x.max() - x.min())

b    0.805233
d    2.111369
e    2.509638
dtype: float64

In [128]:
# axis=1 hands each row to minmax; the function can be passed directly.
frame.apply(minmax, axis=1)

Utah      2.189075
Ohio      1.759746
Texas     0.920889
Oregon    1.938040
dtype: float64

In [131]:
frame.apply(lambda x: x.std() / x.mean(), axis =0)

b    -5.799023
d    -1.749805
e    26.178104
dtype: float64

In [None]:
frame.apply(lambda x: x.max() - x.min(), axis =0)

In [None]:
frame.apply(lambda x: x.max() - x.min(), axis =1)

In [None]:
# `f` is not defined in any earlier cell; use the max-minus-min helper
# `minmax` defined above, applied once per row (axis='columns').
frame.apply(minmax, axis='columns')

Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating point value in frame. You can do this with applymap: 

In [None]:
# Render a number as a string with two decimal places.
# NOTE(review): this rebinds the builtin name `format` for the rest of the
# session; the following cell uses the same name, so it is left unchanged.
# NOTE(review): newer pandas releases deprecate applymap in favour of
# DataFrame.map — confirm against the installed version.
format = lambda x: '%.2f' % x
frame.applymap(format)

In [None]:
frame['e'].map(format)

### Sorting and Ranking

In [None]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

In [None]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame

In [None]:
frame.sort_index()

In [None]:
frame.sort_index(axis=1)

In [None]:
frame.sort_index(axis=1, ascending=False)

In [None]:
obj = pd.Series([4, 7, -3, 2])
obj

In [None]:
obj.sort_values()

In [None]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

In [None]:
frame.sort_values(by='b')

In [None]:
frame.sort_values(by=['a', 'b'])

In [None]:
frame.sort_values(by=['a', 'b'],
                  ascending = [True, False])

In [None]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

In [None]:
obj.rank()

In [None]:
obj.rank(method='first')

In [None]:
# Assign tie values the maximum rank in the group
obj.rank(ascending=False, method='max')

In [None]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

In [None]:
frame.rank(axis='columns')

### Axis Indexes with Duplicate Labels

In [None]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

In [None]:
obj.index.is_unique

In [None]:
obj['a']

In [None]:
obj['c']

In [None]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

In [None]:
df.loc['b']

## Summarizing and Computing Descriptive Statistics

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/05_pandas.png?raw=true)

In [None]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/06_pandas.png?raw=true)

In [None]:
df.sum()

In [None]:
df.sum(axis='columns')

In [None]:
df.mean(axis='columns', skipna=False)

In [None]:
df.idxmax()

In [None]:
df.cumsum()

In [None]:
df.describe()

In [None]:
frame

In [None]:
(frame - frame.shift(1))/frame.shift(1)

In [None]:
frame.shift(-1).fillna(frame.loc[0])

In [None]:
frame.pct_change()

In [None]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()


![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/07_pandas.png?raw=true)

### Correlation and Covariance

Download dataset

```
svn export https://github.com/wesm/pydata-book/trunk/datasets
```

conda install pandas-datareader

In [None]:
import os
os.getcwd()

In [None]:
# Load the price and volume tables from pickle files, looked up relative to
# the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
price = pd.read_pickle('yahoo_price.pkl')
volume = pd.read_pickle('yahoo_volume.pkl')

In [None]:
price.head()

In [None]:
volume.head()

import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})

In [None]:
returns = price.pct_change()
returns.head()

In [None]:
price.apply(lambda x: x - x.shift(1), axis = 0).head()

In [None]:
returns[['MSFT', 'IBM']].corr()

In [None]:
returns['MSFT'].corr(returns['IBM'])

In [None]:
returns['MSFT'].cov(returns['IBM'])

In [None]:
returns.MSFT.corr(returns.IBM)

In [None]:
# NOTE: despite its name, `cov` holds the pairwise *correlation* matrix of the
# returns (the result of .corr()), not a covariance matrix. Later cells refer
# to it under this name.
cov = returns.corr()

Using DataFrame’s corrwith method, you can compute pairwise correlations between a DataFrame’s columns or rows with another Series or DataFrame

In [None]:
returns.corrwith(returns.IBM)

In [None]:
returns.corrwith(volume)

### Unique Values, Value Counts, and Membership

In [None]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [None]:
uniques = obj.unique()
uniques

In [None]:
obj.value_counts()

In [None]:
 obj.nunique()

In [None]:
# Unsorted counts via the Series method; the top-level pd.value_counts helper
# is deprecated in recent pandas versions.
obj.value_counts(sort=False)

In [None]:
mask = obj.isin(['b', 'c'])
mask

In [None]:
obj[mask]

In [None]:
cov

In [None]:
cov[cov.index.isin(["MSFT", "AAPL"])]

In [None]:
cov[~cov.index.isin(["MSFT", "AAPL"])]

In [None]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

In [None]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

In [None]:
# Count the occurrences of each answer per column; a value that never occurs
# in a column comes back as NaN and is replaced by 0.
# pd.Series.value_counts is used because the top-level pd.value_counts helper
# is deprecated in recent pandas versions.
result = data.apply(pd.Series.value_counts).fillna(0)
result

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/08_pandas.png?raw=true)

## Handling Missing Data 

Missing data is common in most data analysis applications. One of the goals in designing pandas was to make working with missing data as painless as possible.

In [None]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

In [None]:
string_data.isnull() 

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/09_pandas.png?raw=true)

## Filtering Out Missing Data 

In [None]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7]) 
data

In [None]:
data.dropna() 

In [None]:
data[data.notnull()] 

In [None]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                  [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

In [None]:
cleaned = data.dropna() 
cleaned

In [None]:
data.dropna( subset = [0]) 

Passing how='all' will only drop rows that are all NA: 

In [None]:
data.dropna(how='all') 

## Filling in Missing Data 

In [None]:
df.fillna(0)

In [None]:
# A dict gives one fill value per column, so its keys must be df's column
# labels ('one' and 'two'); the keys 1 and 3 matched nothing and left every
# NaN in place.
df.fillna({'one': 0.5, 'two': -1})

fillna returns a new object, but you can modify the existing object in place: 

In [None]:
df.fillna(0, inplace=True) 

## Hierarchical Indexing 

Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form. 

In [None]:
data = pd.Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]]) 
data

What you’re seeing is a prettified view of a Series with a MultiIndex as its index. The “gaps” in the index display mean “use the label directly above”: 

In [None]:
data.index 

With a hierarchically-indexed object, so-called partial indexing is possible, enabling you to concisely select subsets of the data: 

In [None]:
data['b'] 

In [None]:
# Partial indexing on the outer level; parentheses around a single string do
# not make a tuple, so the plain label is equivalent.
data.loc['a']

In [None]:
# Outer label 'a' and inner label 1 together address a single element.
data.loc['a', 1]

In [None]:
data.loc[('a'), 1]

Hierarchical indexing plays a critical role in reshaping data and group-based operations like forming a pivot table. For example, this data could be rearranged into a DataFrame using its unstack method: 

PS: we will see more about unstack in the next lecture

In [None]:
data.unstack() 

With a DataFrame, either axis can have a hierarchical index: 

In [None]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                            ['Green', 'Red', 'Green']]) 
frame

In [None]:
frame.index.names = ['key1', 'key2'] 
frame

In [None]:
frame.columns.names = ['state', 'color'] 
frame

In [None]:
frame['Ohio'] 

## Summary Statistics by Level 

Many descriptive and summary statistics on DataFrame and Series have a level option in which you can specify the level you want to sum by on a particular axis. Consider the above DataFrame; we can sum by level on either the rows or columns like so:

In [None]:
# Sum the rows that share the same `key2` index value. Grouping on the level
# replaces the `level=` argument of sum(), which pandas 2.0 removed.
frame.groupby(level='key2').sum()

In [None]:
frame

In [None]:
# Sum the columns that share the same `color` level: transpose so the column
# levels become row levels, group and sum, then transpose back.
# (sum(level=..., axis=1) was removed in pandas 2.0.)
frame.T.groupby(level='color').sum().T

## Using a DataFrame’s Columns 

In [None]:
# Flat frame whose columns 'c' and 'd' will serve as index levels below.
frame = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
})
frame

In [None]:
frame2 = frame.set_index(['c', 'd']) 
frame2

In [None]:
frame.set_index(['c', 'd'], drop=False) 

In [None]:
frame2.reset_index() 