[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vochicong/ai-memo/blob/master/Python_memo.ipynb)

# Convert categories to numbers

An easy way to convert a categorical (ordinal or nominal) pandas column into integers.

In [1]:
# Upgrade pandas to the newest available release (the version is not pinned).
!pip3 install -U pandas
import pandas as pd
# Show the installed pandas version as the cell output.
pd.__version__

Requirement already up-to-date: pandas in /usr/local/lib/python3.6/dist-packages (0.23.4)


'0.23.4'

In [2]:
# Encode the same raw values against three different ordered category lists.
# Values that are missing or not listed in `categories` receive code -1.
raw_values = ['b', 'a', 'b', 'c', None, 'b', 'c']
for category_order in (['a', 'b', 'c'], ['b', 'c'], ['b', 'c', 'd']):
    s = pd.Categorical(raw_values, categories=category_order, ordered=True)
    print("Series {0}, codes {1}".format(s, s.codes))
# The same codes are reachable through the Series `.cat` accessor.
pd.Series(s).cat.codes

Series [b, a, b, c, NaN, b, c]
Categories (3, object): [a < b < c], codes [ 1  0  1  2 -1  1  2]
Series [b, NaN, b, c, NaN, b, c]
Categories (2, object): [b < c], codes [ 0 -1  0  1 -1  0  1]
Series [b, NaN, b, c, NaN, b, c]
Categories (3, object): [b < c < d], codes [ 0 -1  0  1 -1  0  1]


0    0
1   -1
2    0
3    1
4   -1
5    0
6    1
dtype: int8

In [3]:
# Toy product table: nominal `color` (one value missing), ordinal `size`,
# and numeric `price`.
product_rows = [
    ['green', 'M', 10.1],
    [None, 'XL', 14.3],
    ['red', 'L', 13.5],
    ['blue', 'XL', 15.3],
]
df = pd.DataFrame(product_rows, columns=['color', 'size', 'price'])
df

Unnamed: 0,color,size,price
0,green,M,10.1
1,,XL,14.3
2,red,L,13.5
3,blue,XL,15.3


Let's convert `size` into a numeric column, given `M < L < XL`

In [4]:
# Encode `size` as ordered integer codes following M < L < XL (M=0, L=1, XL=2).
size = pd.Categorical(df['size'], ordered=True, categories=['M', 'L', 'XL'])
# Assign the raw code array so the values are placed positionally. Wrapping the
# codes in pd.Series would attach a fresh 0..n-1 index and align on labels,
# which misplaces codes (or produces NaN) whenever `df` does not carry the
# default RangeIndex.
df['size_cat'] = size.codes
df

Unnamed: 0,color,size,price,size_cat
0,green,M,10.1,0
1,,XL,14.3,2
2,red,L,13.5,1
3,blue,XL,15.3,2


In [5]:
# One-hot encode the nominal `color` column. Missing values get their own
# indicator column (dummy_na) and the first level is dropped (drop_first) so the
# remaining indicators are not redundant.
color_column = df['color']
dummies = pd.get_dummies(color_column, drop_first=True, dummy_na=True)
dummies

Unnamed: 0,green,red,nan
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,0


In [6]:
# Attach the indicator columns to the right of the existing frame, matching rows by index.
df = pd.concat([df, dummies], sort=False, axis='columns')

In [7]:
# Show the frame without the original text columns; `df` itself is left unchanged.
df.drop(columns=['color', 'size'])

Unnamed: 0,price,size_cat,green,red,nan
0,10.1,0,1,0,0
1,14.3,2,0,0,1
2,13.5,1,0,1,0
3,15.3,2,0,0,0


Now all columns are numerical, ready for some regression!

# Append row to dataframe

In [8]:
# Start from an empty log table that only declares its column names.
log_columns = ['option1', 'option2']
df_log = pd.DataFrame(columns=log_columns)
df_log

Unnamed: 0,option1,option2


In [9]:
# Add one record, keeping the new row's own index label (0).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and gives the same result here.
new_row = pd.DataFrame({'option1': [3], 'option2': [7]})
df_log = pd.concat([df_log, new_row])
df_log

Unnamed: 0,option1,option2
0,3,7


In [10]:
# Add another record and renumber the rows 0..n-1 (ignore_index=True), so the
# new row does not reuse index label 0.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and gives the same result here.
new_row = pd.DataFrame([[4, 8]], columns=['option1', 'option2'])
df_log = pd.concat([df_log, new_row], ignore_index=True)
df_log

Unnamed: 0,option1,option2
0,3,7
1,4,8


# Pick items from a dict

In [11]:
# NOTE: the variable name `dict` shadows the built-in type from here on.
dict = {'a': 1, 'b': 2, 'c': 3}
wanted_keys = ['b', 'c', 'd']
# .get() yields None for a key that is absent instead of raising KeyError.
{key: dict.get(key) for key in wanted_keys}

{'b': 2, 'c': 3, 'd': None}

In [12]:
# Once 'd' is present, the same selection finds a value for every requested key.
dict.update(d=4)
{key: dict.get(key) for key in ('b', 'c', 'd')}

{'b': 2, 'c': 3, 'd': 4}

# Pandas pivot

See the pandas user guide on reshaping: https://pandas.pydata.org/docs/user_guide/reshaping.html

In [13]:
# Long-format sample: every (foo, bar) pair occurs exactly once.
foo_labels = ['one'] * 3 + ['two'] * 3
bar_labels = ['A', 'B', 'C'] * 2
df = pd.DataFrame({
    'foo': foo_labels,
    'bar': bar_labels,
    'baz': list(range(1, 7)),
    'zoo': list('xyzqwt'),
})
df

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


In [14]:
# Reshape long -> wide: one row per `foo`, one column per `bar`, cells from `baz`.
df.pivot(
    index='foo',
    columns='bar',
    values='baz',
)

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [15]:
# Pivot all remaining columns first, then pick the `baz` block out of the
# resulting two-level column labels.
wide = df.pivot(index='foo', columns='bar')
wide['baz']

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [16]:
# With `values` omitted, every remaining column (`baz` and `zoo`) is pivoted,
# so the result has two-level column labels.
df.pivot(columns='bar', index='foo')

Unnamed: 0_level_0,baz,baz,baz,zoo,zoo,zoo
bar,A,B,C,A,B,C
foo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
one,1,2,3,x,y,z
two,4,5,6,q,w,t


In [17]:
# Same data as before, except ('one', 'A') now occurs twice, so a (foo, bar)
# pair no longer identifies a single cell.
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                  'bar': ['A', 'A', 'C', 'A', 'B', 'C'],
                  'baz': [1, 2, 3, 4, 5, 6],
                  'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
# pivot() raises ValueError for duplicate index/column pairs. The exception is
# the point of this cell, so catch it and display the message; that way the
# notebook still runs from top to bottom.
pivot_error = None
try:
    df.pivot(index='foo', columns='bar', values='baz')
except ValueError as err:
    pivot_error = err  # keep a reference; `err` is cleared when the except block ends
    print("ValueError: {0}".format(err))

ValueError: ignored