### Probability Ratio Encoding

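Probability ratio encoding replaces each category with the ratio P(target = 1) / P(target = 0) observed for that category. It is only suitable for classification problems where the target is binary.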

In [38]:
import pandas as pd
# Load only the two columns this demo needs: the cabin feature and the survived target.
df = pd.read_csv(
    'titanic.csv',
    usecols=['cabin', 'survived'],
)
df.head()

Unnamed: 0,survived,cabin
0,1,B5
1,1,C22 C26
2,0,C22 C26
3,0,C22 C26
4,0,C22 C26


In [39]:
# Relabel the '?' placeholder in the cabin column as the word 'Missing'
df['cabin'] = df['cabin'].replace({'?': 'Missing'})
df.head(n=20)

Unnamed: 0,survived,cabin
0,1,B5
1,1,C22 C26
2,0,C22 C26
3,0,C22 C26
4,0,C22 C26
5,1,E12
6,1,D7
7,0,A36
8,1,C101
9,0,Missing


In [40]:
# Collapse every cabin label to its leading character
# (e.g. 'C22 C26' -> 'C', 'Missing' -> 'M')
df['cabin'] = (
    df['cabin']
    .astype(str)
    .str.get(0)
)
df.head()

Unnamed: 0,survived,cabin
0,1,B
1,1,C
2,0,C
3,0,C
4,0,C


In [41]:
# List the distinct cabin categories, in order of first appearance
pd.unique(df['cabin'])

array(['B', 'C', 'E', 'D', 'A', 'M', 'T', 'F', 'G'], dtype=object)

In [42]:
# Work on an independent deep copy so the frame above is left untouched
prob_data = df.copy(deep=True)

In [43]:
# Mean of the 0/1 'survived' column per cabin category,
# kept as a {cabin: mean survived} lookup dictionary
prob_df = (
    prob_data['survived']
    .groupby(prob_data['cabin'])
    .mean()
    .to_dict()
)
prob_df

{'A': 0.5,
 'B': 0.7230769230769231,
 'C': 0.6063829787234043,
 'D': 0.6956521739130435,
 'E': 0.7317073170731707,
 'F': 0.6190476190476191,
 'G': 0.6,
 'M': 0.3027613412228797,
 'T': 0.0}

In [44]:
# Attach to every row the survival probability of its cabin category
cabin_column = prob_data['cabin']
prob_data['prob_survived'] = cabin_column.map(prob_df)
prob_data.head(n=10)

Unnamed: 0,survived,cabin,prob_survived
0,1,B,0.723077
1,1,C,0.606383
2,0,C,0.606383
3,0,C,0.606383
4,0,C,0.606383
5,1,E,0.731707
6,1,D,0.695652
7,0,A,0.5
8,1,C,0.606383
9,0,M,0.302761


In [45]:
# Probability of dying is the complement: P(died) = 1 - P(survived)
prob_data['prob_died'] = prob_data['prob_survived'].rsub(1)
prob_data.head(n=10)

Unnamed: 0,survived,cabin,prob_survived,prob_died
0,1,B,0.723077,0.276923
1,1,C,0.606383,0.393617
2,0,C,0.606383,0.393617
3,0,C,0.606383,0.393617
4,0,C,0.606383,0.393617
5,1,E,0.731707,0.268293
6,1,D,0.695652,0.304348
7,0,A,0.5,0.5
8,1,C,0.606383,0.393617
9,0,M,0.302761,0.697239


In [46]:
# Probability ratio per row: P(survived) / P(died).
# The ratio is undefined when P(died) == 0 (i.e. every passenger in that cabin
# category survived); a plain division would silently produce inf and poison
# the encoded feature, so fail loudly and name the offending categories.
undefined_cabins = prob_data.loc[prob_data['prob_died'] == 0, 'cabin'].unique()
if len(undefined_cabins) > 0:
    raise ValueError(
        "Probability ratio is undefined (P(died) == 0) for cabin categories: "
        f"{list(undefined_cabins)}"
    )
prob_data['prob_ratio'] = prob_data['prob_survived'] / prob_data['prob_died']
prob_data.head(10)

Unnamed: 0,survived,cabin,prob_survived,prob_died,prob_ratio
0,1,B,0.723077,0.276923,2.611111
1,1,C,0.606383,0.393617,1.540541
2,0,C,0.606383,0.393617,1.540541
3,0,C,0.606383,0.393617,1.540541
4,0,C,0.606383,0.393617,1.540541
5,1,E,0.731707,0.268293,2.727273
6,1,D,0.695652,0.304348,2.285714
7,0,A,0.5,0.5,1.0
8,1,C,0.606383,0.393617,1.540541
9,0,M,0.302761,0.697239,0.434229


In [47]:
# Build the {cabin category: probability ratio} lookup used for encoding.
# Rows of the same category carry the same ratio, so duplicate keys collapse.
probability_encoded = prob_data.set_index('cabin')['prob_ratio'].to_dict()
probability_encoded

{'B': 2.6111111111111107,
 'C': 1.5405405405405408,
 'E': 2.727272727272727,
 'D': 2.2857142857142856,
 'A': 1.0,
 'M': 0.43422913719943423,
 'T': 0.0,
 'F': 1.6250000000000002,
 'G': 1.4999999999999998}

In [48]:
# Encode the cabin feature of the original dataframe with its probability ratio
cabin_letters = df['cabin']
df['cabin_encoded'] = cabin_letters.map(probability_encoded)
df.head(n=10)

Unnamed: 0,survived,cabin,cabin_encoded
0,1,B,2.611111
1,1,C,1.540541
2,0,C,1.540541
3,0,C,1.540541
4,0,C,1.540541
5,1,E,2.727273
6,1,D,2.285714
7,0,A,1.0
8,1,C,1.540541
9,0,M,0.434229


**Advantages:**
1. It captures information within each category, which creates more predictive features.
2. It creates a monotonic relationship between the variables and the target. So it’s suitable for linear models.
3. It doesn't expand the feature space.

**Disadvantages:**
1. It is likely to cause overfitting.
2. It is not defined when the denominator is 0, i.e. when P(target = 0) is 0 for a category.