# Recoding in Pandas

In [18]:
import pandas as pd
import seaborn as sns

## Load Diamonds Dataset

In [6]:
df = sns.load_dataset('diamonds')

In [7]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


### Use value_counts() or unique() on a df column to see its unique values

In [8]:
df['cut'].value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

In [19]:
df['cut'].unique()

['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

## Recode with a function and call apply on the df column

In [20]:
def recode_cut(series):
    """Translate one diamond cut label into an integer code.

    Despite the parameter name, this function is handed to
    ``Series.apply`` and so receives a single value per call, not a
    whole Series.

    Parameters
    ----------
    series : object
        A single cut label; it is compared with ``==`` against each
        known label in turn.

    Returns
    -------
    int or None
        0 for 'Fair', 1 for 'Good', 2 for 'Very Good', 3 for 'Premium',
        4 for 'Ideal'; ``None`` when the value equals none of them.
    """
    known_labels = ('Fair', 'Good', 'Very Good', 'Premium', 'Ideal')
    # A label's position in the tuple is the code it is recoded to.
    for code, label in enumerate(known_labels):
        if series == label:
            return code
    return None

In [21]:
df['cutR'] = df['cut'].apply(recode_cut)

In [22]:
df[['cut', 'cutR']]

Unnamed: 0,cut,cutR
0,Ideal,4
1,Premium,3
2,Good,1
3,Premium,3
4,Good,1
...,...,...
53935,Ideal,4
53936,Good,1
53937,Very Good,2
53938,Premium,3


## Recode with a dictionary and call map on the df column

In [23]:
# Lookup table for Series.map: each cut label maps to its position in this
# list, i.e. 'Fair' -> 0 up to 'Ideal' -> 4.
recode_cut_dict = {
    label: code
    for code, label in enumerate(
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
    )
}

In [24]:
df['cutRmap'] = df['cut'].map(recode_cut_dict)

In [25]:
df[['cut', 'cutR', 'cutRmap']]

Unnamed: 0,cut,cutR,cutRmap
0,Ideal,4,4
1,Premium,3,3
2,Good,1,1
3,Premium,3,3
4,Good,1,1
...,...,...,...
53935,Ideal,4,4
53936,Good,1,1
53937,Very Good,2,2
53938,Premium,3,3


## Create bins of continuous variables

### Using Pandas Cut
Use `pandas.cut`, as described in the [pandas documentation](https://pandas.pydata.org/docs/reference/api/pandas.cut.html).

This is useful for grouping continuous data into bins, such as age groups or price ranges.

In [27]:
df['carat'].describe()

count    53940.000000
mean         0.797940
std          0.474011
min          0.200000
25%          0.400000
50%          0.700000
75%          1.040000
max          5.010000
Name: carat, dtype: float64

In [30]:
# An integer `bins` asks pd.cut for that many equal-width bins over the column's
# range; labels=False stores each row's integer bin index instead of an interval.
df['caratBinned'] = pd.cut(df['carat'], 10, labels=False)

In [31]:
df[['carat', 'caratBinned']]

Unnamed: 0,carat,caratBinned
0,0.23,0
1,0.21,0
2,0.23,0
3,0.29,0
4,0.31,0
...,...,...
53935,0.72,1
53936,0.72,1
53937,0.70,1
53938,0.86,1


In [32]:
df['caratBinned'].value_counts()

0    25155
1    18626
2     7129
3     2349
4      614
5       53
6        6
7        5
8        2
9        1
Name: caratBinned, dtype: int64

In [43]:
def function_to_bin(df, column, bins=10, **kwargs):
    """Bin one column of a DataFrame with ``pd.cut``.

    The binned values are written onto ``df`` itself, in a new column
    named ``column`` followed by ``'Binned'``, so the caller's frame is
    modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to bin; it receives the new column.
    column : str
        Name of the column to bin.
    bins : optional
        Passed to ``pd.cut`` as its ``bins`` argument. Defaults to 10.
    **kwargs
        Forwarded unchanged to ``pd.cut`` (for example ``labels=False``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    binned_name = column + 'Binned'
    binned_values = pd.cut(df[column], bins, **kwargs)
    df[binned_name] = binned_values
    return df

In [44]:
df = function_to_bin(df, 'price', bins=10, labels=False)

In [45]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z', 'cutR', 'cutRmap', 'caratBinned', 'priceBinned'],
      dtype='object')

In [47]:
df['priceBinned'].value_counts()

0    25335
1     9328
2     7393
3     3878
4     2364
5     1745
6     1306
7     1002
8      863
9      726
Name: priceBinned, dtype: int64

### Using a function and Pandas Apply

In [49]:
def recode_prices(series, threshold=3932.80):
    """Label a single price as 'Really expensive' or 'Expensive'.

    Despite the parameter name, this function is handed to
    ``Series.apply`` and so receives one price per call, not a whole
    Series.

    Parameters
    ----------
    series : number
        A single price value.
    threshold : number, optional
        Cut-off at or above which a price counts as 'Really expensive'.
        Defaults to 3932.80, the value previously hard-coded here.
        NOTE(review): this looks like a summary statistic of the price
        column (possibly its mean) -- confirm and consider deriving it
        from the data instead of typing it in.

    Returns
    -------
    str
        'Really expensive' when ``series >= threshold``, otherwise
        'Expensive'. A value for which the comparison is false (NaN
        included) falls into 'Expensive'.
    """
    return 'Really expensive' if series >= threshold else 'Expensive'

In [50]:
df['priceRecode'] = df['price'].apply(recode_prices)

In [51]:
df['priceRecode'].value_counts()

Expensive           34283
Really expensive    19657
Name: priceRecode, dtype: int64

In [57]:
# One indicator column per price bin. The next cell labels these columns
# 'BinnedPrice_<bin>', so they must be built from priceBinned, not caratBinned.
dummies = pd.get_dummies(df['priceBinned'])

In [60]:
dummies.columns = ['BinnedPrice_' + str(x) for x in dummies.columns]

In [61]:
df2 = pd.concat([df, dummies],axis=1)

In [62]:
df2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,...,BinnedPrice_0,BinnedPrice_1,BinnedPrice_2,BinnedPrice_3,BinnedPrice_4,BinnedPrice_5,BinnedPrice_6,BinnedPrice_7,BinnedPrice_8,BinnedPrice_9
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,...,1,0,0,0,0,0,0,0,0,0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,...,1,0,0,0,0,0,0,0,0,0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,...,1,0,0,0,0,0,0,0,0,0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,...,1,0,0,0,0,0,0,0,0,0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,...,1,0,0,0,0,0,0,0,0,0


In [64]:
dummies2 = pd.get_dummies(df['priceRecode'])

In [65]:
dummies2.head()

Unnamed: 0,Expensive,Really expensive
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [66]:
df3 = pd.concat([df2, dummies2], axis=1)

In [67]:
df3

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,...,BinnedPrice_2,BinnedPrice_3,BinnedPrice_4,BinnedPrice_5,BinnedPrice_6,BinnedPrice_7,BinnedPrice_8,BinnedPrice_9,Expensive,Really expensive
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,...,0,0,0,0,0,0,0,0,1,0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,...,0,0,0,0,0,0,0,0,1,0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,...,0,0,0,0,0,0,0,0,1,0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,...,0,0,0,0,0,0,0,0,1,0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,...,0,0,0,0,0,0,0,0,1,0
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,...,0,0,0,0,0,0,0,0,1,0
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,...,0,0,0,0,0,0,0,0,1,0
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,...,0,0,0,0,0,0,0,0,1,0
