## 5). Mean Encoding

In [2]:
import pandas as pd
# Load the full Titanic training set.
# The file is read by its relative name (the same way the two-column load of this
# file is done in this notebook) instead of an absolute, machine-specific path,
# so the cell also runs on machines other than the author's.
df = pd.read_csv('titanic_train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Re-read the file, keeping only the two columns used for the encoding
df = pd.read_csv(
    'titanic_train.csv',
    usecols=['Cabin', 'Survived'],
)
df

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
...,...,...
886,0,
887,1,B42
888,0,
889,1,C148


In [4]:
# Replace missing cabins with the placeholder 'Null'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the df['Cabin'] selection: the in-place form is
# chained assignment, which warns on pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
df['Cabin'] = df['Cabin'].fillna('Null')
df.head(10) # top 10 rows

# both C85 and C123 belong to the same block "C"

Unnamed: 0,Survived,Cabin
0,0,Null
1,1,C85
2,1,Null
3,1,C123
4,0,Null
5,0,Null
6,0,E46
7,0,Null
8,1,Null
9,1,Null


In [5]:
# Reduce every cabin code to its block letter (C, E, ...):
#   astype(str) -> make sure every value is a string
#   .str.get(0) -> keep only the character at position 0
df['Cabin'] = df['Cabin'].astype(str).str.get(0)
df

Unnamed: 0,Survived,Cabin
0,0,N
1,1,C
2,1,N
3,1,C
4,0,N
...,...,...
886,0,N
887,1,B
888,0,N
889,1,C


In [6]:
df['Cabin'].unique() # distinct categories present in the "Cabin" column

array(['N', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [7]:
# Share of passengers in each "Cabin" block who survived
# (the mean of the "Survived" column within each block)
(
    df.groupby('Cabin')['Survived']
      .mean()
)
# T = 0.0 means that nobody from block T survived

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
N    0.299854
T    0.000000
Name: Survived, dtype: float64

In [8]:
# sort the per-block survival rates; sort_values() sorts in ascending order by default
(
    df.groupby('Cabin')['Survived']
      .mean()
      .sort_values()
)

Cabin
T    0.000000
N    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [9]:
# same ascending sort, this time requested explicitly with ascending=True
(
    df.groupby('Cabin')['Survived']
      .mean()
      .sort_values(ascending=True)
)

Cabin
T    0.000000
N    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [10]:
# descending sort (highest survival rate first) via ascending=False
(
    df.groupby('Cabin')['Survived']
      .mean()
      .sort_values(ascending=False)
)

Cabin
D    0.757576
E    0.750000
B    0.744681
F    0.615385
C    0.593220
G    0.500000
A    0.466667
N    0.299854
T    0.000000
Name: Survived, dtype: float64

In [11]:
# block letters ordered by ascending survival rate (the index of the sorted result)
(
    df.groupby('Cabin')['Survived']
      .mean()
      .sort_values()
      .index
)

Index(['T', 'N', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [12]:
# keep the ordered block letters (lowest to highest survival rate) as labels
ordinal_labels = (
    df.groupby('Cabin')['Survived']
      .mean()
      .sort_values()
      .index
)
ordinal_labels

Index(['T', 'N', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [13]:
# Share of passengers in each "Cabin" block who survived (unsorted, indexed by block)
(
    df.groupby('Cabin')['Survived']
      .mean()
)


Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
N    0.299854
T    0.000000
Name: Survived, dtype: float64

In [14]:
## Mean Encoding

In [15]:
# block letter -> mean of "Survived" for that block, as a plain dict
mean_ordinal = (
    df.groupby('Cabin')['Survived']
      .mean()
      .to_dict()
)
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'N': 0.29985443959243085,
 'T': 0.0}

In [16]:
# new feature: every row's block letter replaced by that block's mean survival rate
df['mean_ordinal_encode'] = df.Cabin.map(mean_ordinal)
df

Unnamed: 0,Survived,Cabin,mean_ordinal_encode
0,0,N,0.299854
1,1,C,0.593220
2,1,N,0.299854
3,1,C,0.593220
4,0,N,0.299854
...,...,...,...
886,0,N,0.299854
887,1,B,0.744681
888,0,N,0.299854
889,1,C,0.593220


## 6). Probability Ratio Encoding

In [2]:
import pandas as pd
# Load the full Titanic training set.
# NOTE(review): absolute, machine-specific path — confirm where the CSV lives
# before running this on another machine.
df = pd.read_csv(
    r'C:\Users\LENOVO\Desktop\Feature Engineering\titanic_train.csv'
)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Re-read the same file, keeping only the "Cabin" and "Survived" columns.
# NOTE(review): absolute, machine-specific path — confirm where the CSV lives
# before running this on another machine.
df = pd.read_csv(
    r'C:\Users\LENOVO\Desktop\Feature Engineering\titanic_train.csv',
    usecols=['Cabin', 'Survived'],
)
df

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
...,...,...
886,0,
887,1,B42
888,0,
889,1,C148


In [6]:
# Replace NaN values in the "Cabin" column with the placeholder "Missing".
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the df['Cabin'] selection: the in-place form is
# chained assignment, which warns on pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
df['Cabin'] = df['Cabin'].fillna('Missing')
df

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
...,...,...
886,0,Missing
887,1,B42
888,0,Missing
889,1,C148


In [7]:
df.Cabin.unique() # distinct raw values in the "Cabin" column (full cabin codes plus 'Missing')


array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [8]:
# keep only the first character of each cabin value,
# which turns the codes into blocks, e.g. C, B, E, etc.

df['Cabin'] = df['Cabin'].astype(str).str.get(0)
df

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M
...,...,...
886,0,M
887,1,B
888,0,M
889,1,C


In [9]:
#Method 1: select the column with bracket notation
df['Cabin'].unique() # list of unique categories in the column / feature "Cabin"

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [10]:
#Method 2: select the column with attribute (dot) notation
df.Cabin.unique() # list of unique categories in the column / feature "Cabin"

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [11]:
# Survival probability per cabin block: the mean of "Survived" within each block.
# groupby(...)[...].mean() returns a Series indexed by "Cabin".
prob_df = (
    df.groupby('Cabin')['Survived']
      .mean()
)
prob_df

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [12]:
prob_df = pd.DataFrame(prob_df) # wrap the per-block probabilities in a DataFrame
prob_df # the "Survived" values are the same as before; they are now a DataFrame column

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [14]:
# probability that a person in the block died = 1 - probability of surviving
prob_df['Died'] = 1 - prob_df.Survived
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


### Probability Ratio = prob(Survived) / prob(Died)

In [15]:
# calculating the probability ratio per block: P(Survived) / P(Died)
# NOTE(review): a block whose "Died" value is 0 would be divided by zero here —
# confirm how that case should be handled if it can occur.
prob_df['Probability_Ratio'] = prob_df['Survived'].div(prob_df['Died'])
prob_df

Unnamed: 0_level_0,Survived,Died,Probability_Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [17]:
# Converting the probability ratio into a dictionary: block letter -> ratio
probability_encoded = prob_df.Probability_Ratio.to_dict()
probability_encoded

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [18]:
# New column: each row's "Cabin" block mapped to its value in "probability_encoded"
df['Cabin_encoded'] = df.Cabin.map(probability_encoded)
df

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
...,...,...,...
886,0,M,0.428274
887,1,B,2.916667
888,0,M,0.428274
889,1,C,1.458333
