In [2]:
# Vectorized String Methods

import numpy as np
import pandas as pd

In [3]:
# Nullable-string Series mixing upper-, lower- and mixed-case entries plus one
# missing value.
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"],
    dtype="string",
)

# Vectorized lower-casing through the .str accessor; the missing entry stays <NA>.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [4]:
# Three columns whose indexes only partially overlap, so aligning them in one
# frame leaves NaN wherever a column has no value for a row label.
#
# The values come from a seeded local Generator so that Restart & Run All
# yields the same frame every time (the original drew from the unseeded global
# NumPy state). Outputs recorded before this change were unseeded draws and
# will show different numbers.
rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "one": pd.Series(rng.standard_normal(3), index=["a", "b", "c"]),
        "two": pd.Series(rng.standard_normal(4), index=["a", "b", "c", "d"]),
        "three": pd.Series(rng.standard_normal(3), index=["b", "c", "d"]),
    }
)
df

Unnamed: 0,one,two,three
a,1.398362,0.447185,
b,-0.052356,-0.726732,0.700017
c,-0.081184,-0.085896,-0.798885
d,,0.165439,0.324408


In [5]:
# Pick rows and columns in an explicit, deliberately shuffled label order so
# the sorting examples that follow have something to reorder.
unsorted_df = df.reindex(
    index=list("adcb"),
    columns=["three", "two", "one"],
)

unsorted_df

Unnamed: 0,three,two,one
a,,0.447185,1.398362
d,0.324408,0.165439,
c,-0.798885,-0.085896,-0.081184
b,0.700017,-0.726732,-0.052356


In [7]:
# Sort the column labels (axis 1); the row order a, d, c, b is left untouched.
unsorted_df.sort_index(axis="columns")

Unnamed: 0,one,three,two
a,1.398362,,0.447185
d,,0.324408,0.165439
c,-0.081184,-0.798885,-0.085896
b,-0.052356,0.700017,-0.726732


In [8]:
# Take the single column "three" as a Series and sort it by its row labels.
unsorted_df.loc[:, "three"].sort_index()

a         NaN
b    0.700017
c   -0.798885
d    0.324408
Name: three, dtype: float64

In [10]:
# Small frame whose columns "a" and "b" are moved into a two-level row index,
# leaving "c" as the only data column. Level "a" mixes upper- and lower-case
# labels, which is what the sorting examples that follow rely on.
s1 = pd.DataFrame(
    {
        "a": ["B", "a", "C"],
        "b": [1, 2, 3],
        "c": [2, 3, 4],
    }
).set_index(["a", "b"])
s1

Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
B,1,2
a,2,3
C,3,4


In [11]:
# Sort rows by index level "a". The default ordering is case-sensitive, so the
# upper-case labels "B" and "C" come before the lower-case "a".
s1.sort_index(level="a")

Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
B,1,2
C,3,4
a,2,3


In [12]:
# Passing `key`: the argument is a function that is applied to the index
# values before they are compared (the author found this style a bit unusual).
# Reportedly, passing a function as `key` is more common than one might expect.
# Lower-casing the labels here makes the sort case-insensitive: a, B, C.
s1.sort_index(level="a", key=lambda labels: labels.str.lower())


Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
a,2,3
B,1,2
C,3,4


In [14]:
# NOTE: rebinds s1 (previously a DataFrame) to a plain Series of mixed-case
# labels for the value-sorting example.
s1 = pd.Series(list("BaC"))
s1

0    B
1    a
2    C
dtype: object

In [15]:
# Sort by Values
# The key function lower-cases the values before comparison, giving a
# case-insensitive order (a, B, C) while the original values are what is shown.
s1.sort_values(key=lambda values: values.str.lower())

1    a
0    B
2    C
dtype: object

In [16]:
# Default sort_values(): ascending and case-sensitive (upper-case entries
# before lower-case ones), with the missing value placed last.
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
2       C
6    CABA
8     cat
7     dog
5    <NA>
dtype: string

In [17]:
# Same ordering as the default sort, but with the missing value moved to the front.
s.sort_values(na_position="first")

5    <NA>
0       A
3    Aaba
1       B
4    Baca
2       C
6    CABA
8     cat
7     dog
dtype: string

In [None]:
# Sort by Indexes and Values

In [18]:
# Two-level index built from (letter, number) pairs; it contains duplicate
# pairs and is intentionally not sorted on the second level.
idx = pd.MultiIndex.from_tuples(
    [
        ("a", 1),
        ("a", 2),
        ("a", 2),
        ("b", 2),
        ("b", 1),
        ("b", 1),
    ]
)
idx

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 2),
            ('b', 2),
            ('b', 1),
            ('b', 1)],
           )

In [19]:
# Name the two index levels in place; a later cell refers to the "second"
# level by this name when sorting.
idx.names = ["first", "second"]
idx.names

FrozenList(['first', 'second'])

In [42]:
# Single column "A" counting down from 6 to 1, indexed by the named
# (first, second) MultiIndex.
df_multi = pd.DataFrame({"A": np.arange(6, 0, -1)}, index=idx)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [43]:
# This works because the index levels have names.
# Somewhat surprisingly, the index level "second" can be passed in `by` as if
# it were a column label, so rows are ordered by that level first and by
# column "A" second.
df_multi.sort_values(by=["second", "A"])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5


In [46]:
# searchsorted() (compare numpy.ndarray.searchsorted()):
# returns the position at which a given value would have to be inserted.
ser = pd.Series(range(1, 4))
ser

0    1
1    2
2    3
dtype: int64

In [45]:
# 0 is smaller than every element, so its insertion point is position 0;
# 3 equals the last element and, with the default side="left", is placed at
# that element's position 2.
ser.searchsorted([0, 3])

array([0, 2])

In [47]:
# 4 is larger than every element, so its insertion point is the end of the
# Series (position 3).
ser.searchsorted([0, 4])

array([0, 3])

In [50]:
# smallest / largest values
# Draw the permutation of 0..9 from a seeded Generator so the same ordering
# (and therefore the same index labels in the sort/nsmallest/nlargest cells)
# comes back on every run; the original used the unseeded global NumPy state.
# Outputs recorded before this change came from an unseeded draw and may show
# different index labels.
# NOTE: this rebinds `s`, which earlier held the string Series.
s = pd.Series(np.random.default_rng(0).permutation(10))

In [51]:
# Full ascending sort of the permutation, shown for comparison with the
# nsmallest()/nlargest() results in the next cells.
s.sort_values()

6    0
1    1
0    2
9    3
4    4
8    5
7    6
2    7
3    8
5    9
dtype: int64

In [53]:
# The three smallest values, in ascending order, keeping their original index labels.
s.nsmallest(3)

6    0
1    1
0    2
dtype: int64

In [54]:
# The three largest values, in descending order, keeping their original index labels.
s.nlargest(3)

5    9
3    8
2    7
dtype: int64

In [None]:
# Selecting columns based on dtype: select_dtypes()

In [3]:
# One column per dtype family so that select_dtypes() has something to choose
# from. NOTE: this rebinds `df`, replacing the frame used in the sorting cells.
df = pd.DataFrame(
    {
        "string": ["a", "b", "c"],
        "int64": [1, 2, 3],
        "uint8": np.arange(3, 6).astype("u1"),
        "float64": np.arange(4.0, 7.0),
        "bool1": [True, False, True],
        "bool2": [False, True, False],
        # "now" is resolved at run time, so these timestamps differ on every run.
        "dates": pd.date_range("now", periods=3),
        # Author's note on this column: the list of labels is wrapped in a
        # Series and then cast with astype("category").
        "category": pd.Series(["A", "B", "C"]).astype("category"),
    }
)

df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2023-08-03 16:02:41.428930,A
1,b,2,4,5.0,False,True,2023-08-04 16:02:41.428930,B
2,c,3,5,6.0,True,False,2023-08-05 16:02:41.428930,C


In [6]:
# Append four more dtype examples to df (this mutates the frame in place).
# Row-to-row difference of the dates; the first row has no predecessor, hence NaT.
df["timedelta"] = df["dates"].diff()
# NOTE(review): the label "unit64" looks like a typo for "uint64"; it is kept
# unchanged because the recorded outputs use this spelling.
df["unit64"] = np.arange(3, 6).astype("u8")
df["other_dates"] = pd.date_range("20230101", periods=3)
df["tz_aware_dates"] = pd.date_range("20230101", periods=3, tz="US/Eastern")
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,timedelta,unit64,other_dates,tz_aware_dates
0,a,1,3,4.0,True,False,2023-08-03 16:02:41.428930,A,NaT,3,2023-01-01,2023-01-01 00:00:00-05:00
1,b,2,4,5.0,False,True,2023-08-04 16:02:41.428930,B,1 days,4,2023-01-02,2023-01-02 00:00:00-05:00
2,c,3,5,6.0,True,False,2023-08-05 16:02:41.428930,C,1 days,5,2023-01-03,2023-01-03 00:00:00-05:00


In [8]:
# List the dtype of every column; these are what select_dtypes() filters on
# in the following cells.
df.dtypes

string                                object
int64                                  int64
uint8                                  uint8
float64                              float64
bool1                                   bool
bool2                                   bool
dates                         datetime64[ns]
category                            category
timedelta                    timedelta64[ns]
unit64                                uint64
other_dates                   datetime64[ns]
tz_aware_dates    datetime64[ns, US/Eastern]
dtype: object

In [10]:
# Keep only the boolean columns (bool1 and bool2).
df.select_dtypes(include=["bool"])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


In [11]:
# Keep numeric and boolean columns but drop the unsigned integers.
# "unsignedinteger" apparently refers to dtypes such as uint8 and uint64: both
# of those columns are absent from the result, while the timedelta column is
# still selected as a number.
df.select_dtypes(include=["number", "bool"], exclude=["unsignedinteger"])

Unnamed: 0,int64,float64,bool1,bool2,timedelta
0,1,4.0,True,False,NaT
1,2,5.0,False,True,1 days
2,3,6.0,True,False,1 days


In [None]:
def subdtypes(dtype):
    """Return the subclass tree rooted at ``dtype`` as nested lists.

    Parameters
    ----------
    dtype : type
        Class whose subclasses are walked recursively, e.g. ``np.generic``.

    Returns
    -------
    type or list
        ``dtype`` itself when it has no subclasses; otherwise a two-item list
        ``[dtype, [subtree, ...]]`` with one subtree per direct subclass.
    """
    subs = dtype.__subclasses__()
    if not subs:
        # Leaf of the hierarchy: represented by the bare class.
        return dtype
    return [dtype, [subdtypes(dt) for dt in subs]]


# Show the whole NumPy scalar-type hierarchy. The names accepted by
# select_dtypes() (e.g. "number", "unsignedinteger") correspond to nodes here.
subdtypes(np.generic)

# Reference transcript that was originally pasted into this cell (it was not
# valid Python as written). The exact entries may differ by platform and
# NumPy version.
#
# In [443]: subdtypes(np.generic)
# Out[443]:
# [numpy.generic,
#  [[numpy.number,
#    [[numpy.integer,
#      [[numpy.signedinteger,
#        [numpy.int8,
#         numpy.int16,
#         numpy.int32,
#         numpy.int64,
#         numpy.longlong,
#         numpy.timedelta64]],
#       [numpy.unsignedinteger,
#        [numpy.uint8,
#         numpy.uint16,
#         numpy.uint32,
#         numpy.uint64,
#         numpy.ulonglong]]]],
#     [numpy.inexact,
#      [[numpy.floating,
#        [numpy.float16, numpy.float32, numpy.float64, numpy.float128]],
#       [numpy.complexfloating,
#        [numpy.complex64, numpy.complex128, numpy.complex256]]]]]],
#   [numpy.flexible,
#    [[numpy.character, [numpy.bytes_, numpy.str_]],
#     [numpy.void, [numpy.record]]]],
#   numpy.bool_,
#   numpy.datetime64,
#   numpy.object_]]