In [27]:
import pandas as pd 
import numpy as np 

In [28]:
# DataFrame.add_prefix(prefix)
# Prepend a string prefix to every label.
#
# On a Series the prefix goes on the row labels; on a DataFrame it goes on the column labels.
s = pd.Series(data=[1, 2, 3, 4])
s

0    1
1    2
2    3
3    4
dtype: int64

In [29]:
# Every row label of the Series gains the "new_" prefix; the values are unchanged.
s.add_prefix("new_")

new_0    1
new_1    2
new_2    3
new_3    4
dtype: int64

In [30]:
# Two-column frame; the displayed result has "col_" prepended to each column label.
df = pd.DataFrame(
    data={
        'A': [1, 2, 3, 4],
        'B': [3, 4, 5, 6],
    }
)
df.add_prefix("col_")

Unnamed: 0,col_A,col_B
0,1,3
1,2,4
2,3,5
3,4,6


In [31]:
# DataFrame.add_suffix(suffix)
# Append a string suffix to every label.

# For a Series the row labels are suffixed; for a DataFrame the column labels are suffixed.
s.add_suffix("_suffix")

0_suffix    1
1_suffix    2
2_suffix    3
3_suffix    4
dtype: int64

In [32]:
# Column labels A and B become A_1 and B_1; the row index is left as it was.
df.add_suffix("_1")

Unnamed: 0,A_1,B_1
0,1,3
1,2,4
2,3,5
3,4,6


#### DataFrame.align(other, join='outer', axis=None, level=None, copy=True, fill_value=None, method=None, limit=None, fill_axis=0, broadcast_axis=None)
- Align two objects on their axes with the specified join method.

- Join method is specified for each axis Index.

In [33]:

# Two frames whose row and column labels only partly overlap; both are printed
# so the align results below can be compared against the inputs.
df = pd.DataFrame(
    data=[[1, 2, 3, 4], [6, 7, 8, 9]],
    index=[1, 2],
    columns=["D", "B", "E", "A"],
)
other = pd.DataFrame(
    data=[
        [10, 20, 30, 40],
        [60, 70, 80, 90],
        [600, 700, 800, 900],
    ],
    index=[2, 3, 4],
    columns=["A", "B", "C", "D"],
)
print(df)
print(other)

   D  B  E  A
1  1  2  3  4
2  6  7  8  9
     A    B    C    D
2   10   20   30   40
3   60   70   80   90
4  600  700  800  900


In [34]:
# Default join='outer' on both axes: each returned frame carries the union of the row
# labels (1-4) and column labels (A-E); cells absent from the original frame are NaN.
df.align(other)

(     A    B   C    D    E
 1  4.0  2.0 NaN  1.0  3.0
 2  9.0  7.0 NaN  6.0  8.0
 3  NaN  NaN NaN  NaN  NaN
 4  NaN  NaN NaN  NaN  NaN,
        A      B      C      D   E
 1    NaN    NaN    NaN    NaN NaN
 2   10.0   20.0   30.0   40.0 NaN
 3   60.0   70.0   80.0   90.0 NaN
 4  600.0  700.0  800.0  900.0 NaN)

In [35]:
# join="right" uses the labels of `other`: `df` is reindexed to rows 2-4 and columns A-D,
# while `other` comes back with its original labels and values.
df.align(other, join="right")

(     A    B   C    D
 2  9.0  7.0 NaN  6.0
 3  NaN  NaN NaN  NaN
 4  NaN  NaN NaN  NaN,
      A    B    C    D
 2   10   20   30   40
 3   60   70   80   90
 4  600  700  800  900)

In [36]:
# join="left" uses the labels of `df`; positions that `other` lacks (row 1, column E)
# are filled with fill_value=2 instead of NaN.
df.align(other, join="left", fill_value=2)

(   D  B  E  A
 1  1  2  3  4
 2  6  7  8  9,
     D   B  E   A
 1   2   2  2   2
 2  40  20  2  10)

In [37]:
# DataFrame.at_time(time, asof=False, axis=None)
# Select values at a particular time of day (e.g., 9:30AM).
#
# Index: four timestamps spaced 12 hours apart, starting at midnight on 2018-04-09.
# The lowercase 'h' hour alias is used because the uppercase 'H' alias is
# deprecated as of pandas 2.2 and emits a FutureWarning there.
i = pd.date_range('2018-04-09', periods=4, freq='12h')
ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
ts

Unnamed: 0,A
2018-04-09 00:00:00,1
2018-04-09 12:00:00,2
2018-04-10 00:00:00,3
2018-04-10 12:00:00,4


In [38]:
# Keeps only the rows whose index time of day is 12:00, whatever the date.
ts.at_time('12:00')

Unnamed: 0,A
2018-04-09 12:00:00,2
2018-04-10 12:00:00,4


In [39]:
# DataFrame.between_time(start_time, end_time, include_start=_NoDefault.no_default, include_end=_NoDefault.no_default, inclusive=None, axis=None)
# Select the rows whose time of day falls between two times (e.g., 9:00-9:30 AM).
#
# Passing a start_time later than end_time selects the times that are NOT between the two.
#
# Each step advances one day and twenty minutes, so the time of day drifts
# from 00:00 to 01:00 across the four rows.
i = pd.date_range(start='2018-04-09', periods=4, freq='1D20min')
ts = pd.DataFrame(data={'A': [1, 2, 3, 4]}, index=i)
ts

Unnamed: 0,A
2018-04-09 00:00:00,1
2018-04-10 00:20:00,2
2018-04-11 00:40:00,3
2018-04-12 01:00:00,4


In [40]:
# Rows whose time of day lies between 00:15 and 00:45 are kept (00:20 and 00:40 here).
ts.between_time('0:15', '0:45')

Unnamed: 0,A
2018-04-10 00:20:00,2
2018-04-11 00:40:00,3


#### DataFrame.drop(labels=None, *, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
- Drop specified labels from rows or columns.

In [41]:

# A 3x4 frame holding 0..11 row by row; the displayed result omits columns B and C.
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    columns=list('ABCD'),
)
df.drop(columns=['B', 'C'])

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [46]:
# Drop columns and/or rows of MultiIndex DataFrame
#
# Level 0 names the animal and level 1 the measurement; the codes pair every
# animal with every measurement, in order.
midx = pd.MultiIndex(
    levels=[
        ['lama', 'cow', 'falcon'],
        ['speed', 'weight', 'length'],
    ],
    codes=[
        [0, 0, 0, 1, 1, 1, 2, 2, 2],
        [0, 1, 2, 0, 1, 2, 0, 1, 2],
    ],
)
df = pd.DataFrame(
    data=[
        [45, 30],     # lama   / speed
        [200, 100],   # lama   / weight
        [1.5, 1],     # lama   / length
        [30, 20],     # cow    / speed
        [250, 150],   # cow    / weight
        [1.5, 0.8],   # cow    / length
        [320, 250],   # falcon / speed
        [1, 0.8],     # falcon / weight
        [0.3, 0.2],   # falcon / length
    ],
    index=midx,
    columns=['big', 'small'],
)
df

Unnamed: 0,Unnamed: 1,big,small
lama,speed,45.0,30.0
lama,weight,200.0,100.0
lama,length,1.5,1.0
cow,speed,30.0,20.0
cow,weight,250.0,150.0
cow,length,1.5,0.8
falcon,speed,320.0,250.0
falcon,weight,1.0,0.8
falcon,length,0.3,0.2


In [47]:
# A (level-0, level-1) tuple addresses a single row: only ('falcon', 'weight') is removed.
df.drop(index=('falcon', 'weight'))

Unnamed: 0,Unnamed: 1,big,small
lama,speed,45.0,30.0
lama,weight,200.0,100.0
lama,length,1.5,1.0
cow,speed,30.0,20.0
cow,weight,250.0,150.0
cow,length,1.5,0.8
falcon,speed,320.0,250.0
falcon,length,0.3,0.2


In [48]:
# A bare level-0 label removes every 'cow' row; the 'small' column is dropped in the same call.
df.drop(index='cow', columns='small')

Unnamed: 0,Unnamed: 1,big
lama,speed,45.0
lama,weight,200.0
lama,length,1.5
falcon,speed,320.0
falcon,weight,1.0
falcon,length,0.3


#### pandas.DataFrame.drop_duplicates
DataFrame.drop_duplicates(subset=None, *, keep='first', inplace=False, ignore_index=False)
- Return DataFrame with duplicate rows removed.

- Considering certain columns is optional. Indexes, including time indexes, are ignored.

In [49]:

# Noodle ratings used by the drop_duplicates / duplicated examples.
# Rows 0 and 1 are identical in every column.
df = pd.DataFrame(
    dict(
        brand=['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
        style=['cup', 'cup', 'cup', 'pack', 'pack'],
        rating=[4, 4, 3.5, 15, 5],
    )
)


In [50]:
# By default a row counts as a duplicate only if it matches an earlier row in every column;
# here row 1 repeats row 0 and is removed.
df.drop_duplicates()

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [52]:
# Pass subset to compare only selected column(s) -- here the frame's first column.
df.drop_duplicates(subset=df.columns[0])

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5


In [53]:
# keep='last' retains the final row of each duplicate group; rows are compared
# on brand and style only.

df.drop_duplicates(subset=['brand', 'style'], keep='last')

Unnamed: 0,brand,style,rating
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
4,Indomie,pack,5.0


#### DataFrame.duplicated(subset=None, keep='first')
- Return boolean Series denoting duplicate rows.

In [54]:

# True marks a row that repeats an earlier row in every column (default keep='first').
df.duplicated()

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [55]:
# With keep='last', the last occurrence in each set of duplicates is marked False and the earlier ones True.
df.duplicated(keep='last')


0     True
1    False
2    False
3    False
4    False
dtype: bool

In [56]:
# With keep=False, every member of a set of duplicates is marked True.

df.duplicated(keep=False)

0     True
1     True
2    False
3    False
4    False
dtype: bool

In [57]:
# Use subset to look for duplicates in specific column(s) only; here rows are compared on 'brand' alone.

df.duplicated(subset=['brand'])

0    False
1     True
2    False
3     True
4     True
dtype: bool

#### DataFrame.equals(other)
- Test whether two objects contain the same elements.

- This function allows two Series or DataFrames to be compared against each other to see if they have the same shape and elements. NaNs in the same location are considered equal.

- The row and column indexes do not need to have the same type, as long as the values are considered equal. Corresponding columns must be of the same dtype.

In [59]:
# One-row reference frame with integer column labels 1 and 2.
df = pd.DataFrame(data={1: [10], 2: [20]})
df

Unnamed: 0,1,2
0,10,20


In [60]:
# Built from the same literal as df (same labels and same values), so equals() returns True.
exactly_equal = pd.DataFrame({1: [10], 2: [20]})
df.equals(exactly_equal)

True

In [61]:

# The column labels are floats (1.0, 2.0) instead of ints, but they compare equal to
# df's labels and the values are the same, so equals() still returns True.
different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
df.equals(different_column_type)

True

In [62]:
# df and different_data_type hold the same values but as different element types
# (ints vs. floats), so equals() returns False even though their column labels
# match in both value and type.

different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
df.equals(different_data_type)

False

#### DataFrame.filter(items=None, like=None, regex=None, axis=None)
- Subset the dataframe rows or columns according to the specified index labels.

In [63]:
# Small labelled frame for the filter examples: two animal rows, three number-named columns.
df = pd.DataFrame(
    data=np.array([[1, 2, 3], [4, 5, 6]]),
    index=['mouse', 'rabbit'],
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
mouse,1,2,3
rabbit,4,5,6


In [64]:
# Keep only the listed labels; with no axis given they are matched against the columns here.
df.filter(items=['one', 'three'])

Unnamed: 0,one,three
mouse,1,3
rabbit,4,6


In [68]:
# Select columns whose label matches the regular expression 'o$' (ends in "o"); only 'two' qualifies.
df.filter(regex='o$', axis=1)

Unnamed: 0,two
mouse,2
rabbit,5


In [70]:
# select columns whose label contains 'wo'
df.filter(like='wo', axis=1)

Unnamed: 0,two
mouse,2
rabbit,5


#### DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None, ignore_index=False)
- Return a random sample of items from an axis of object.

You can use random_state for reproducibility.

In [72]:
# Animal facts used by the sampling examples; the row index is the animal name.
df = pd.DataFrame(
    data={
        'num_legs': [2, 4, 8, 0],
        'num_wings': [2, 0, 0, 0],
        'num_specimen_seen': [10, 2, 1, 8],
    },
    index=['falcon', 'dog', 'spider', 'fish'],
)
df

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [80]:
# Extract 2 random elements from the Series df['num_legs']. Note that random_state is fixed to make the example reproducible.

df['num_legs'].sample(n=2, random_state=2)

spider    8
fish      0
Name: num_legs, dtype: int64

In [85]:
# A random 150% sample of the DataFrame (frac=1.5 -> 6 rows from 4), drawn with replacement so rows can repeat:

df.sample(frac=1.5, replace=True, random_state=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen
dog,4,0,2
fish,0,0,8
falcon,2,2,10
falcon,2,2,10
fish,0,0,8
dog,4,0,2


In [86]:
# Use a DataFrame column as sampling weights: rows with a larger num_specimen_seen value are more likely to be drawn.

df.sample(n=2, weights='num_specimen_seen', random_state=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
fish,0,0,8


#### DataFrame.truncate(before=None, after=None, axis=None, copy=True)
- Truncate a Series or DataFrame before and after some index value.

This is a useful shorthand for boolean indexing based on index values above or below certain thresholds.

In [87]:
# Five rows labelled 1..5; each column holds a consecutive run of letters.
df = pd.DataFrame(
    data={
        'A': list('abcde'),
        'B': list('fghij'),
        'C': list('klmno'),
    },
    index=[1, 2, 3, 4, 5],
)
df

Unnamed: 0,A,B,C
1,a,f,k
2,b,g,l
3,c,h,m
4,d,i,n
5,e,j,o


In [88]:
# Keep the rows whose index label lies between 2 and 4; both bounds are included.
df.truncate(before=2, after=4)

Unnamed: 0,A,B,C
2,b,g,l
3,c,h,m
4,d,i,n


In [90]:
# The columns of a DataFrame can be truncated too: with axis="columns" the bounds are column labels (A through B kept).

df.truncate(before="A", after="B", axis="columns")

Unnamed: 0,A,B
1,a,f
2,b,g
3,c,h
4,d,i
5,e,j


In [91]:
# For a Series, only rows can be truncated; the bounds are index labels.

df['A'].truncate(before=2, after=4)

2    b
3    c
4    d
Name: A, dtype: object

In [96]:
# The index values in truncate can be datetimes or string dates.

# One row per second from 2016-01-01 to 2016-02-01, both endpoints included.
dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
df = pd.DataFrame(index=dates, data={'A': 1})
# Display the row count (31 days * 86400 s + 1) so the cell has a result to show;
# the cell previously ended in an assignment and displayed nothing.
len(df)


2678401

In [99]:
# Truncate with Timestamp bounds; head() shows the first five surviving rows,
# starting exactly at 2016-01-05 00:00:00.
df.truncate(before=pd.Timestamp('2016-01-05'),
            after=pd.Timestamp('2016-01-10')).head()

Unnamed: 0,A
2016-01-05 00:00:00,1
2016-01-05 00:00:01,1
2016-01-05 00:00:02,1
2016-01-05 00:00:03,1
2016-01-05 00:00:04,1


In [100]:
# Because the index is a DatetimeIndex, before and after can also be given as date strings; they will be coerced to Timestamps before truncation.
# tail() shows the last five rows, ending at 2016-01-10 00:00:00 (the `after` bound is included).

df.truncate('2016-01-05', '2016-01-10').tail()

Unnamed: 0,A
2016-01-09 23:59:56,1
2016-01-09 23:59:57,1
2016-01-09 23:59:58,1
2016-01-09 23:59:59,1
2016-01-10 00:00:00,1
