### **Understanding the concept of Feature Construction (2nd part of Feature Engineering)**

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import seaborn as sns

In [2]:
# Load only the columns this notebook uses instead of parsing the whole file.
# `usecols` ignores the order of the list it is given, so the frame is indexed
# with the same list afterwards to keep the columns in exactly this order
# (with `Survived` last).
COLUMNS = ['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']
data = pd.read_csv('/content/train.csv', usecols=COLUMNS)[COLUMNS]

In [3]:
# Preview five randomly chosen rows. No random_state is passed, so a re-run
# shows a different set of rows.
data.sample(5)

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
581,39.0,1,1,1,1
626,57.0,2,0,0,0
490,,3,1,0,0
837,,3,0,0,0
502,,3,0,0,0


In [6]:
# Column dtypes and non-null counts.
# NOTE(review): this cell's execution count (In [6]) is higher than that of the
# dropna cell that follows it (In [4]), so the saved output appears to describe
# the frame after rows with missing values were removed; a fresh top-to-bottom
# run would report the raw frame here instead.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Pclass    714 non-null    int64  
 2   SibSp     714 non-null    int64  
 3   Parch     714 non-null    int64  
 4   Survived  714 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 33.5 KB


In [4]:
# Drop every row that contains a missing value. The result is assigned back to
# `data` rather than using `inplace=True`, so `data` becomes a fresh frame
# instead of an object mutated behind the cells that already displayed it.
data = data.dropna()

In [7]:
# Re-check the frame after dropping missing values: every column should now
# report the same non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Pclass    714 non-null    int64  
 2   SibSp     714 non-null    int64  
 3   Parch     714 non-null    int64  
 4   Survived  714 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 33.5 KB


In [5]:
# First five rows of the frame.
data.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [9]:
# Split into features and target by column name rather than by position, so the
# split does not depend on the column order of `data`.
# `.copy()` gives X its own data: later cells add columns to X and drop columns
# from it, and doing that on a slice of `data` can raise pandas'
# SettingWithCopyWarning.
X = data[['Age', 'Pclass', 'SibSp', 'Parch']].copy()
y = data['Survived']

In [10]:
# Show the feature frame and the target series together; as a bare tuple they
# are rendered as plain-text reprs.
X, y

(      Age  Pclass  SibSp  Parch
 0    22.0       3      1      0
 1    38.0       1      1      0
 2    26.0       3      0      0
 3    35.0       1      1      0
 4    35.0       3      0      0
 ..    ...     ...    ...    ...
 885  39.0       3      0      5
 886  27.0       2      0      0
 887  19.0       1      0      0
 889  26.0       1      0      0
 890  32.0       3      0      0
 
 [714 rows x 4 columns],
 0      0
 1      1
 2      1
 3      1
 4      0
       ..
 885    0
 886    0
 887    1
 889    1
 890    0
 Name: Survived, Length: 714, dtype: int64)

In [12]:
# Baseline: mean accuracy of a default LogisticRegression over 20 cross-validation folds
cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20).mean()

0.6933333333333332

### **Applying Feature Construction**

In [13]:
# Combine SibSp and Parch into a single Family_size column. The extra 1 means a
# row with SibSp = Parch = 0 gets size 1, the value later categorised as "alone".
X['Family_size'] = X['SibSp'].add(X['Parch']).add(1)

In [14]:
# Spot-check six random rows to confirm Family_size equals SibSp + Parch + 1.
X.sample(6)

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
23,28.0,1,0,0,1
417,18.0,2,0,2,3
850,4.0,3,4,2,7
162,26.0,3,0,0,1
752,33.0,3,0,0,1
438,64.0,1,1,4,6


In [15]:
# Map a family size to an ordinal family-type code
def myfunc(num):
    """Return the family-type code for a family size.

    Parameters
    ----------
    num : number
        Family size to categorise.

    Returns
    -------
    int
        0 when ``num`` equals 1 (alone), 1 when ``1 < num <= 4`` (small
        family), and 2 for every other value (large family).
    """
    alone, small_family, large_family = 0, 1, 2

    if num == 1:
        return alone
    if 1 < num <= 4:
        return small_family
    # Everything that is neither 1 nor in (1, 4] ends up here, which includes
    # values below 1 as well as sizes above 4.
    return large_family

In [16]:
# Sanity check at the upper edge of the small-family range (expected result: 1).
myfunc(4)

1

In [17]:
# Encode each row's Family_size as its family-type code, one value at a time
X['Family_type'] = X['Family_size'].map(myfunc)

In [18]:
# Check the new Family_type column next to the Family_size it was derived from.
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [20]:
# Remove SibSp, Parch and Family_size now that Family_type has been derived
# from them, leaving Age, Pclass and Family_type as the model features.
# The result is assigned back to X instead of dropping in place, which avoids
# mutating a frame that may still be tied to a slice of `data`.
X = X.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [21]:
# Confirm the final feature set after the drop.
X.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [22]:
# Mean 20-fold cross-validated accuracy with the constructed Family_type feature,
# for comparison with the baseline score computed before feature construction
cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=20).mean()

0.7003174603174602