# Loading libraries

In [1]:
import numpy as np
import pandas as pd

# Object creation

### Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
# np.nan marks the missing entry; with it present the Series is float64.
s = pd.Series(data=[1, 3, 4, np.nan, 6, 8])
s

0    1.0
1    3.0
2    4.0
3    NaN
4    6.0
5    8.0
dtype: float64

### Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [3]:
# Six consecutive calendar days starting on 2013-01-01.
dates = pd.date_range(start="2013-01-01", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 standard-normal draws (unseeded, so the values differ on every run),
# indexed by `dates` with columns A-D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164
2013-01-02,-0.342418,0.300301,1.196545,0.318254
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359
2013-01-04,0.656443,-1.123722,0.334306,-0.756663
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861
2013-01-06,-1.196805,0.473708,0.638928,0.912025


### Creating a DataFrame by passing a dict of objects that can be converted to series-like:

In [5]:
# The scalar entries ("A", "B", "F") are broadcast to the length of the
# array-like columns, so every column ends up with four rows.
df2 = pd.DataFrame(
    data=dict(
        A=1.0,
        B=pd.Timestamp("20210324"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-24,1.0,3,test,foo
1,1.0,2021-03-24,1.0,3,train,foo
2,1.0,2021-03-24,1.0,3,test,foo
3,1.0,2021-03-24,1.0,3,train,foo


### The columns of the resulting DataFrame have different dtypes:

In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing data

In [7]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164
2013-01-02,-0.342418,0.300301,1.196545,0.318254
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359
2013-01-04,0.656443,-1.123722,0.334306,-0.756663
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861


In [8]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.656443,-1.123722,0.334306,-0.756663
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861
2013-01-06,-1.196805,0.473708,0.638928,0.912025


In [9]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# Every column of df holds floats, so the result is one homogeneous float array.
df.to_numpy()

array([[-0.89943863, -1.21455874, -0.79551241,  0.47916404],
       [-0.34241759,  0.3003008 ,  1.19654491,  0.31825416],
       [-0.25941439, -0.55522691,  0.73570836, -1.69435931],
       [ 0.65644273, -1.12372189,  0.33430617, -0.75666275],
       [-1.19201298, -0.41790538,  0.8832923 , -0.82586058],
       [-1.19680528,  0.47370788,  0.6389281 ,  0.91202487]])

In [12]:
# df2 mixes several dtypes, so the array falls back to dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2021-03-24 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-03-24 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2021-03-24 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-03-24 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [13]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.538941,-0.422901,0.498878,-0.26124
std,0.712032,0.70176,0.694658,0.987789
min,-1.196805,-1.214559,-0.795512,-1.694359
25%,-1.118869,-0.981598,0.410462,-0.808561
50%,-0.620928,-0.486566,0.687318,-0.219204
75%,-0.280165,0.120749,0.846396,0.438937
max,0.656443,0.473708,1.196545,0.912025


In [14]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.899439,-0.342418,-0.259414,0.656443,-1.192013,-1.196805
B,-1.214559,0.300301,-0.555227,-1.123722,-0.417905,0.473708
C,-0.795512,1.196545,0.735708,0.334306,0.883292,0.638928
D,0.479164,0.318254,-1.694359,-0.756663,-0.825861,0.912025


### Sorting by axis:

In [15]:
# axis=1 sorts the column labels (descending here), not the rows.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.479164,-0.795512,-1.214559,-0.899439
2013-01-02,0.318254,1.196545,0.300301,-0.342418
2013-01-03,-1.694359,0.735708,-0.555227,-0.259414
2013-01-04,-0.756663,0.334306,-1.123722,0.656443
2013-01-05,-0.825861,0.883292,-0.417905,-1.192013
2013-01-06,0.912025,0.638928,0.473708,-1.196805


### Sorting by values:

In [16]:
# Order the rows by the values in column "B", largest first.
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-1.196805,0.473708,0.638928,0.912025
2013-01-02,-0.342418,0.300301,1.196545,0.318254
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359
2013-01-04,0.656443,-1.123722,0.334306,-0.756663
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164


# Selection

## Getting

### Selecting a single column, which yields a Series, equivalent to df.A:

In [17]:
df["A"]

2013-01-01   -0.899439
2013-01-02   -0.342418
2013-01-03   -0.259414
2013-01-04    0.656443
2013-01-05   -1.192013
2013-01-06   -1.196805
Freq: D, Name: A, dtype: float64

### Selecting via [ ], which slices the rows

In [18]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164
2013-01-02,-0.342418,0.300301,1.196545,0.318254
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359


In [19]:
# Label-based slice on the DatetimeIndex: the end label is inclusive, and an
# end bound past the last date ("2013-01-07") simply stops at the final row.
df["2013-01-05":"2013-01-07"]

Unnamed: 0,A,B,C,D
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861
2013-01-06,-1.196805,0.473708,0.638928,0.912025


## Selection by label

In [20]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164
2013-01-02,-0.342418,0.300301,1.196545,0.318254
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359
2013-01-04,0.656443,-1.123722,0.334306,-0.756663
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861
2013-01-06,-1.196805,0.473708,0.638928,0.912025


In [21]:
df.loc[dates[0]]

A   -0.899439
B   -1.214559
C   -0.795512
D    0.479164
Name: 2013-01-01 00:00:00, dtype: float64

### Selecting on multi-axis label:

In [22]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,-0.899439,-1.214559
2013-01-02,-0.342418,0.300301
2013-01-03,-0.259414,-0.555227
2013-01-04,0.656443,-1.123722
2013-01-05,-1.192013,-0.417905
2013-01-06,-1.196805,0.473708


### Showing label slicing, both endpoints are included:

In [23]:
df.loc["20130102":"20130104",["A","B"]]

Unnamed: 0,A,B
2013-01-02,-0.342418,0.300301
2013-01-03,-0.259414,-0.555227
2013-01-04,0.656443,-1.123722


### Reduction in the dimensions of the returned object:

In [24]:
df.loc["20130102",["A","B"]]

A   -0.342418
B    0.300301
Name: 2013-01-02 00:00:00, dtype: float64

### For getting a scalar value:

In [25]:
df.loc[dates[0], "A"]

-0.8994386335710782

### For getting fast access to a scalar (equivalent to the prior method):

In [26]:
df.at[dates[0], "A"]

-0.8994386335710782

## Selection by position

### Select via the position of the passed integers:

In [27]:
df.iloc[3]

A    0.656443
B   -1.123722
C    0.334306
D   -0.756663
Name: 2013-01-04 00:00:00, dtype: float64

### By integer slices, acting similar to numpy/Python:

In [28]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.656443,-1.123722
2013-01-05,-1.192013,-0.417905


### By lists of integer position locations, similar to the NumPy/Python style:

In [29]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.342418,1.196545
2013-01-03,-0.259414,0.735708
2013-01-05,-1.192013,0.883292


### For slicing rows explicitly:

In [30]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.342418,0.300301,1.196545,0.318254
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359


### For slicing columns explicitly:

In [31]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-1.214559,-0.795512
2013-01-02,0.300301,1.196545
2013-01-03,-0.555227,0.735708
2013-01-04,-1.123722,0.334306
2013-01-05,-0.417905,0.883292
2013-01-06,0.473708,0.638928


### For getting a value explicitly:

In [32]:
df.iloc[1,1]

0.3003008037439202

### For getting fast access to a scalar (equivalent to the prior method):

In [33]:
df.iat[1,1]

0.3003008037439202

## Boolean indexing

### Using a single column's values to select data:

In [34]:
df["A"] > 0

2013-01-01    False
2013-01-02    False
2013-01-03    False
2013-01-04     True
2013-01-05    False
2013-01-06    False
Freq: D, Name: A, dtype: bool

In [35]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-04,0.656443,-1.123722,0.334306,-0.756663


### Selecting values from a DataFrame where a boolean condition is met:

In [36]:
df>0

Unnamed: 0,A,B,C,D
2013-01-01,False,False,False,True
2013-01-02,False,True,True,True
2013-01-03,False,False,True,False
2013-01-04,True,False,True,False
2013-01-05,False,False,True,False
2013-01-06,False,True,True,True


In [37]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,0.479164
2013-01-02,,0.300301,1.196545,0.318254
2013-01-03,,,0.735708,
2013-01-04,0.656443,,0.334306,
2013-01-05,,,0.883292,
2013-01-06,,0.473708,0.638928,0.912025


### Using the isin() method for filtering:

In [38]:
# Build a copy of df with an extra label column "E"; df itself is left untouched.
df2 = df.assign(E=["one", "one", "two", "three", "four", "three"])
df2


Unnamed: 0,A,B,C,D,E
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164,one
2013-01-02,-0.342418,0.300301,1.196545,0.318254,one
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359,two
2013-01-04,0.656443,-1.123722,0.334306,-0.756663,three
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861,four
2013-01-06,-1.196805,0.473708,0.638928,0.912025,three


In [39]:
df2["E"].isin(["two", "four"])

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

In [40]:
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359,two
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861,four


## Setting

### Setting a new column automatically aligns the data by the indexes:

In [41]:
# Integer Series 1..6 labelled with six daily dates starting 2013-01-01.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start="2013-01-01", periods=6, freq="D"),
)
s1

2013-01-01    1
2013-01-02    2
2013-01-03    3
2013-01-04    4
2013-01-05    5
2013-01-06    6
Freq: D, dtype: int64

In [42]:
# The assignment aligns s1 to df by index label (the labels match one-to-one here).
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164,1
2013-01-02,-0.342418,0.300301,1.196545,0.318254,2
2013-01-03,-0.259414,-0.555227,0.735708,-1.694359,3
2013-01-04,0.656443,-1.123722,0.334306,-0.756663,4
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861,5
2013-01-06,-1.196805,0.473708,0.638928,0.912025,6


### Setting values by label:

In [43]:
# .at addresses a single cell by row label and column label.
df.at[dates[2], "A"] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164,1
2013-01-02,-0.342418,0.300301,1.196545,0.318254,2
2013-01-03,0.0,-0.555227,0.735708,-1.694359,3
2013-01-04,0.656443,-1.123722,0.334306,-0.756663,4
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861,5
2013-01-06,-1.196805,0.473708,0.638928,0.912025,6


### Setting value by position:

In [44]:
# .iat addresses a single cell by integer position: row 3, column 0 (column "A").
df.iat[3,0] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.899439,-1.214559,-0.795512,0.479164,1
2013-01-02,-0.342418,0.300301,1.196545,0.318254,2
2013-01-03,0.0,-0.555227,0.735708,-1.694359,3
2013-01-04,0.0,-1.123722,0.334306,-0.756663,4
2013-01-05,-1.192013,-0.417905,0.883292,-0.825861,5
2013-01-06,-1.196805,0.473708,0.638928,0.912025,6


### Setting by assigning with a NumPy array:

In [45]:
# Overwrite every row of column "D" with the constant 5, using an integer
# array whose length matches the frame.
df.loc[:, "D"] = np.full(len(df), 5)
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.899439,-1.214559,-0.795512,5,1
2013-01-02,-0.342418,0.300301,1.196545,5,2
2013-01-03,0.0,-0.555227,0.735708,5,3
2013-01-04,0.0,-1.123722,0.334306,5,4
2013-01-05,-1.192013,-0.417905,0.883292,5,5
2013-01-06,-1.196805,0.473708,0.638928,5,6


In [46]:
len(df)

6

In [47]:
df2 = df.copy()
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.899439,-1.214559,-0.795512,5,1
2013-01-02,-0.342418,0.300301,1.196545,5,2
2013-01-03,0.0,-0.555227,0.735708,5,3
2013-01-04,0.0,-1.123722,0.334306,5,4
2013-01-05,-1.192013,-0.417905,0.883292,5,5
2013-01-06,-1.196805,0.473708,0.638928,5,6


In [48]:
df2[df2 > 0]

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,5,1
2013-01-02,,0.300301,1.196545,5,2
2013-01-03,,,0.735708,5,3
2013-01-04,,,0.334306,5,4
2013-01-05,,,0.883292,5,5
2013-01-06,,0.473708,0.638928,5,6


In [49]:
-df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.899439,1.214559,0.795512,-5.0,-1.0
2013-01-02,0.342418,-0.300301,-1.196545,-5.0,-2.0
2013-01-03,-0.0,0.555227,-0.735708,-5.0,-3.0
2013-01-04,-0.0,1.123722,-0.334306,-5.0,-4.0
2013-01-05,1.192013,0.417905,-0.883292,-5.0,-5.0
2013-01-06,1.196805,-0.473708,-0.638928,-5.0,-6.0


In [50]:
# Where the mask is True (positive cells) the value is taken from -df2;
# all other cells are left as they are, so no positive value remains.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.899439,-1.214559,-0.795512,-5,-1
2013-01-02,-0.342418,-0.300301,-1.196545,-5,-2
2013-01-03,0.0,-0.555227,-0.735708,-5,-3
2013-01-04,0.0,-1.123722,-0.334306,-5,-4
2013-01-05,-1.192013,-0.417905,-0.883292,-5,-5
2013-01-06,-1.196805,-0.473708,-0.638928,-5,-6


# Missing data

### pandas primarily uses the value np.nan to represent missing data. It is by default not included in computations. 

### Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data:

In [51]:
# Keep only the first four dates and add a column "E", which has no data yet
# and is therefore filled with NaN.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,-0.899439,-1.214559,-0.795512,5,1,
2013-01-02,-0.342418,0.300301,1.196545,5,2,
2013-01-03,0.0,-0.555227,0.735708,5,3,
2013-01-04,0.0,-1.123722,0.334306,5,4,


In [52]:
# Label slices in .loc include both endpoints, so "E" is set for the first two dates.
df1.loc[dates[0]:dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,-0.899439,-1.214559,-0.795512,5,1,1.0
2013-01-02,-0.342418,0.300301,1.196545,5,2,1.0
2013-01-03,0.0,-0.555227,0.735708,5,3,
2013-01-04,0.0,-1.123722,0.334306,5,4,


### To drop any rows that have missing data:

In [53]:
# Returns a new frame without the rows that contain any NaN; df1 is not modified.
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-01,-0.899439,-1.214559,-0.795512,5,1,1.0
2013-01-02,-0.342418,0.300301,1.196545,5,2,1.0


### Filling missing data:

In [54]:
# Returns a new frame with each NaN replaced by 5; df1 is not modified.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,-0.899439,-1.214559,-0.795512,5,1,1.0
2013-01-02,-0.342418,0.300301,1.196545,5,2,1.0
2013-01-03,0.0,-0.555227,0.735708,5,3,5.0
2013-01-04,0.0,-1.123722,0.334306,5,4,5.0


In [55]:
df1.loc[:,["E"]].fillna(value=5)

Unnamed: 0,E
2013-01-01,1.0
2013-01-02,1.0
2013-01-03,5.0
2013-01-04,5.0


### To get the boolean mask where values are nan:

In [56]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


In [57]:
df1.isna()

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# Operations

## Stats

Operations in general exclude missing data

In [58]:
df.mean()

A   -0.605112
B   -0.422901
C    0.498878
D    5.000000
F    3.500000
dtype: float64

### Same operation on the other axis:

In [59]:
# Row-wise mean: average across the columns of each row.
df.mean(axis=1)

2013-01-01    0.618098
2013-01-02    1.630886
2013-01-03    1.636096
2013-01-04    1.642117
2013-01-05    1.854675
2013-01-06    2.183166
Freq: D, dtype: float64

### Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension:

In [60]:
# shift(2) moves every value two rows down along the date index, which leaves
# NaN in the first two positions.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8], index=dates).shift(periods=2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

### Subtracting a Series from a DataFrame along a given axis, using the method form (`sub`) of the `-` operator:

In [61]:
# axis="index" aligns s with df's row labels and subtracts it from every column;
# rows where s is NaN come out as NaN.
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-1.0,-1.555227,-0.264292,4.0,2.0
2013-01-04,-3.0,-4.123722,-2.665694,2.0,1.0
2013-01-05,-6.192013,-5.417905,-4.116708,0.0,0.0
2013-01-06,,,,,


In [62]:
# axis=1 aligns s's date labels with df's *column* labels instead. No label is
# shared, so the result carries the union of both label sets as columns and
# every cell is NaN.
df.sub(s, axis=1)

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00,A,B,C,D,F
2013-01-01,,,,,,,,,,,
2013-01-02,,,,,,,,,,,
2013-01-03,,,,,,,,,,,
2013-01-04,,,,,,,,,,,
2013-01-05,,,,,,,,,,,
2013-01-06,,,,,,,,,,,


## Apply

In [63]:
# With apply's default axis, np.cumsum runs on each column separately,
# giving a running total down the rows.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.899439,-1.214559,-0.795512,5,1
2013-01-02,-1.241856,-0.914258,0.401033,10,3
2013-01-03,-1.241856,-1.469485,1.136741,15,6
2013-01-04,-1.241856,-2.593207,1.471047,20,10
2013-01-05,-2.433869,-3.011112,2.354339,25,15
2013-01-06,-3.630674,-2.537404,2.993267,30,21


In [64]:
# Small integer score table: rows "A".."C", columns "L" and "M".
df_score = pd.DataFrame(
    data={"L": [51, 65, 78], "M": [80, 90, 100]},
    index=list("ABC"),
)
df_score

Unnamed: 0,L,M
A,51,80
B,65,90
C,78,100


In [65]:
# axis=0 hands each column to np.average: one result per column.
df_score.apply(np.average, axis=0)

L    64.666667
M    90.000000
dtype: float64

In [66]:
# axis=1 hands each row to np.average: one result per row label.
df_score.apply(np.average, axis=1)

A    65.5
B    77.5
C    89.0
dtype: float64

In [67]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.899439,-1.214559,-0.795512,5,1
2013-01-02,-0.342418,0.300301,1.196545,5,2
2013-01-03,0.0,-0.555227,0.735708,5,3
2013-01-04,0.0,-1.123722,0.334306,5,4
2013-01-05,-1.192013,-0.417905,0.883292,5,5
2013-01-06,-1.196805,0.473708,0.638928,5,6


In [68]:
# Range (max minus min) of each column; the lambda receives one column at a time.
df.apply(lambda x: x.max() - x.min())

A    1.196805
B    1.688267
C    1.992057
D    0.000000
F    5.000000
dtype: float64

## Histogramming

In [69]:
# Ten random integers from 0 to 6 (the upper bound 7 is exclusive); unseeded,
# so the values differ on every run.
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    2
1    6
2    5
3    4
4    5
5    2
6    3
7    6
8    6
9    3
dtype: int32

In [70]:
# Number of occurrences of each distinct value, most frequent first.
s.value_counts()

6    3
5    2
3    2
2    2
4    1
dtype: int64

# Merge

## Concat

### Concatenating pandas objects together with concat():

In [71]:
# 10x4 frame of standard-normal draws with default integer row and column labels.
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-1.603882,2.150095,0.529218,0.701705
1,0.03495,-0.815203,0.371688,0.825315
2,2.117452,1.708453,2.418974,1.887378
3,-1.205453,-0.121624,-0.146072,-0.279882
4,-1.071172,-0.226309,0.893861,0.072537
5,1.063269,1.184406,-0.973738,0.421908
6,0.435845,-0.702957,-0.513485,-0.210882
7,-0.535896,0.442588,-1.440149,1.643216
8,-0.878816,0.333394,0.133002,-1.240939
9,-0.224446,0.692506,0.400893,-1.091479


In [73]:
# Split df into three consecutive row chunks: rows 0-2, rows 3-6 and rows 7 onward.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces

[          0         1         2         3
 0 -1.603882  2.150095  0.529218  0.701705
 1  0.034950 -0.815203  0.371688  0.825315
 2  2.117452  1.708453  2.418974  1.887378,
           0         1         2         3
 3 -1.205453 -0.121624 -0.146072 -0.279882
 4 -1.071172 -0.226309  0.893861  0.072537
 5  1.063269  1.184406 -0.973738  0.421908
 6  0.435845 -0.702957 -0.513485 -0.210882,
           0         1         2         3
 7 -0.535896  0.442588 -1.440149  1.643216
 8 -0.878816  0.333394  0.133002 -1.240939
 9 -0.224446  0.692506  0.400893 -1.091479]

In [74]:
# Stack the chunks back together row-wise; the result has the same rows as df.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-1.603882,2.150095,0.529218,0.701705
1,0.03495,-0.815203,0.371688,0.825315
2,2.117452,1.708453,2.418974,1.887378
3,-1.205453,-0.121624,-0.146072,-0.279882
4,-1.071172,-0.226309,0.893861,0.072537
5,1.063269,1.184406,-0.973738,0.421908
6,0.435845,-0.702957,-0.513485,-0.210882
7,-0.535896,0.442588,-1.440149,1.643216
8,-0.878816,0.333394,0.133002,-1.240939
9,-0.224446,0.692506,0.400893,-1.091479


## Join

### SQL style merges

In [75]:
# Left table: the key "foo" appears in both rows.
left = pd.DataFrame(data={"key": ["foo"] * 2, "lval": [1, 2]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [78]:
# Right table: the key "foo" appears in both rows.
right = pd.DataFrame(data={"key": ["foo"] * 2, "rval": [3, 4]})
right

Unnamed: 0,key,rval
0,foo,3
1,foo,4


In [79]:
# "foo" occurs twice on each side, so every left row pairs with every right
# row: 2 x 2 = 4 result rows.
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,3
1,foo,1,4
2,foo,2,3
3,foo,2,4


In [80]:
# Left table with one row per key.
left = pd.DataFrame(data=dict(key=["foo", "bar"], lval=[1, 2]))
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [81]:
# Right table with one row per key.
right = pd.DataFrame(data=dict(key=["foo", "bar"], rval=[3, 4]))
right

Unnamed: 0,key,rval
0,foo,3
1,bar,4


In [82]:
# Each key is unique on both sides here, so every key yields exactly one merged row.
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,3
1,bar,2,4


# Grouping

### Splitting the data into groups based on some criteria
### Applying a function to each group independently
### Combining the results into a data structure

In [86]:
# Two label columns ("A", "B") to group on and two random numeric columns
# ("C", "D"); the random values are unseeded and differ on every run.
df = pd.DataFrame(
    data=dict(
        A=["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        B=["one", "one", "two", "three", "two", "two", "one", "three"],
        C=np.random.randn(8),
        D=np.random.randn(8),
    )
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0.616701,-0.483857
1,bar,one,0.569229,0.068727
2,foo,two,1.122198,-0.350232
3,bar,three,-0.217549,0.138587
4,foo,two,0.623836,-0.810344
5,bar,two,-0.839531,-1.761739
6,foo,one,0.341115,0.250146
7,foo,three,-1.693246,0.71126


### Grouping and then applying the sum() function to the resulting groups:

In [89]:
# Sum the numeric columns per group of "A". Column "B" holds strings, so "C"
# and "D" are selected explicitly: the result then contains only the numeric
# sums regardless of how the installed pandas version treats non-numeric
# columns in groupby reductions.
df.groupby("A")[["C", "D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.487852,-1.554424
foo,1.010603,-0.683027


In [90]:
# Grouping by two columns gives the result a hierarchical (A, B) row index.
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.569229,0.068727
bar,three,-0.217549,0.138587
bar,two,-0.839531,-1.761739
foo,one,0.957816,-0.233711
foo,three,-1.693246,0.71126
foo,two,1.746033,-1.160576


# Reshaping

## Stack

In [92]:
# Pair the two label lists element-wise to get the (first, second) index tuples.
tuples = list(
    zip(
        ["bar", "bar", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one"],
    )
)
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
# One row of two random values per index tuple.
df = pd.DataFrame(data=np.random.randn(7, 2), index=index, columns=list("AB"))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.155869,-0.097778
bar,two,-1.433728,1.898315
baz,one,1.712905,-0.814175
foo,two,-2.41635,1.312701
foo,one,-1.533785,0.324933
qux,two,0.504755,0.682036
qux,one,-1.213726,-1.246023


In [93]:
# Take the first four rows of df by position.
df2 = df.iloc[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.155869,-0.097778
bar,two,-1.433728,1.898315
baz,one,1.712905,-0.814175
foo,two,-2.41635,1.312701
