## pandas view vs copy

- All operations generate a copy

- If inplace=True is provided, it will modify in-place; only some operations support this

- An indexer that sets, e.g. .loc/.iloc/.iat/.at will set inplace.

- An indexer that gets on a single-dtyped object is almost always a view (depending on the memory layout it may not be, which is why this is not reliable). This is mainly for efficiency. (By contrast, `.query` always returns a copy, because it is evaluated by numexpr.)

- An indexer that gets on a multiple-dtyped object is always a copy.

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Four seasonal records; each dict becomes one row of the frame.
data = [
    {'season': 'spring', 'a': 1, 'b': 2, 'c': 3},
    {'season': 'summer', 'a': 92, 'b': 32, 'c': 43},
    {'season': 'autumn', 'a': 1444, 'b': 32, 'c': 39},
    {'season': 'winter', 'a': 10, 'b': 20, 'c': 30},
]
# Build the frame and promote 'season' to the row index in one chained step.
df = pd.DataFrame(data).set_index('season')
df

Unnamed: 0_level_0,a,b,c
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
spring,1,2,3
summer,92,32,43
autumn,1444,32,39
winter,10,20,30


## []

In [6]:
# Select a single column with df['col'].
# Select several columns at once with a list of names: df[['col1', 'col2']].
df[['a','b']]

Unnamed: 0_level_0,a,b
season,Unnamed: 1_level_1,Unnamed: 2_level_1
spring,1,2
summer,92,32
autumn,1444,32
winter,10,20


## .loc[] for label based indexing

In [7]:
# .loc selects by label: here, the row whose index label is 'spring'.
df.loc['spring']

a    1
b    2
c    3
Name: spring, dtype: int64

## .iloc[]     for positional indexing

In [9]:
# Select the row at integer position 1, i.e. the SECOND row ('summer'); positions are 0-based.
df.iloc[1]

a    92
b    32
c    43
Name: summer, dtype: int64

In [10]:
# Select, for all rows, the column at integer position 1, i.e. the SECOND column ('b'); positions are 0-based.
df.iloc[:,1]

season
spring     2
summer    32
autumn    32
winter    20
Name: b, dtype: int64

## Selection by callable

In [15]:
# 6x4 frame of standard-normal draws: rows labelled 'a'..'f', columns 'A'..'D'.
# No seed is set in this cell, so the values change from run to run.
df1 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=list('abcdef'),
    columns=list('ABCD'),
)
df1

Unnamed: 0,A,B,C,D
a,0.538369,0.32146,0.169858,-0.903983
b,-2.525694,0.170403,-0.061161,-0.329243
c,-0.363418,1.310597,-0.084465,-1.257777
d,1.059371,2.061509,1.066778,1.012883
e,0.611163,-0.770759,-0.206068,0.238075
f,-1.137754,0.288189,-1.259288,-1.531174


In [17]:
# The callable receives the frame being indexed and returns a boolean mask.
# Equivalent to plain boolean indexing: df1[df1['A']>=0]
df1.loc[lambda frame: frame['A'] >= 0, :]

Unnamed: 0,A,B,C,D
a,0.538369,0.32146,0.169858,-0.903983
d,1.059371,2.061509,1.066778,1.012883
e,0.611163,-0.770759,-0.206068,0.238075


In [18]:
# Callable indexing also works on a Series: the lambda receives the Series itself.
df1['A'].loc[lambda col: col > 0]

a    0.538369
d    1.059371
e    0.611163
Name: A, dtype: float64

## Random Sampling

In [19]:
# Small frame whose second column supplies the per-row sampling weights.
df2 = pd.DataFrame(
    {
        'col1': [9, 8, 7, 6],
        'weight_column': [0.5, 0.4, 0.1, 0],
    }
)
df2

Unnamed: 0,col1,weight_column
0,9,0.5
1,8,0.4
2,7,0.1
3,6,0.0


In [24]:
# Draw 3 rows, using 'weight_column' as the sampling weights.
# The last row has weight 0, so it can never be drawn; random_state makes the draw repeatable.
df2.sample(
    n=3,
    weights='weight_column',
    random_state=2834,
)

Unnamed: 0,col1,weight_column
0,9,0.5
1,8,0.4
2,7,0.1


## Selection by where

In [25]:
# Demo frame for `where`: eight consecutive dates starting 2000-01-01 as the
# index and four columns of random values.
# `where` takes an optional `other` argument that replaces the values where the
# condition is False, in the returned copy.
dates = pd.date_range('1/1/2000', periods=8)
df1 = pd.DataFrame(
    data=np.random.randn(8, 4),
    index=dates,
    columns=list('ABCD'),
)
df1

Unnamed: 0,A,B,C,D
2000-01-01,-0.467772,1.118495,-0.629553,0.745838
2000-01-02,-2.049764,-0.932466,-0.970354,-0.575599
2000-01-03,-0.267137,-1.034488,1.591659,-1.455532
2000-01-04,0.214531,-0.576425,-1.0097,0.863304
2000-01-05,0.110996,0.575267,1.17366,-1.054563
2000-01-06,1.756358,0.253118,0.274261,-0.841215
2000-01-07,-1.172191,0.014331,-0.228705,0.935969
2000-01-08,-0.318348,-0.141426,-0.579258,0.516643


In [26]:
# Keep entries that are already negative and replace every other entry with its
# negation, so no value in the result is positive.
# The result is a copy; df1 itself is not modified.
df1.where(df1 < 0, other=-df1)

Unnamed: 0,A,B,C,D
2000-01-01,-0.467772,-1.118495,-0.629553,-0.745838
2000-01-02,-2.049764,-0.932466,-0.970354,-0.575599
2000-01-03,-0.267137,-1.034488,-1.591659,-1.455532
2000-01-04,-0.214531,-0.576425,-1.0097,-0.863304
2000-01-05,-0.110996,-0.575267,-1.17366,-1.054563
2000-01-06,-1.756358,-0.253118,-0.274261,-0.841215
2000-01-07,-1.172191,-0.014331,-0.228705,-0.935969
2000-01-08,-0.318348,-0.141426,-0.579258,-0.516643
