# Feature Extraction

* Binary Features

* Text Features

* Regex Features

* Date Features

* Interaction Features

# Import Necessary Libraries

In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
# Notebook-wide pandas display settings: no cap on displayed columns or rows,
# a 500-character display width, and floats rendered with three decimals.
DISPLAY_OPTIONS = {
  "display.max_columns": None,
  "display.max_rows": None,
  "display.width": 500,
  "display.float_format": lambda x: "%.3f" % x,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
  pd.set_option(option_name, option_value)
from statsmodels.stats.proportion import proportions_ztest
import warnings
warnings.filterwarnings("ignore")

# Import Dataset

In [2]:
# Load the Titanic CSV and work on a copy, so the frame as read from disk stays
# available under `titanic`.
TITANIC_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/titanic.csv"
titanic = pd.read_csv(TITANIC_CSV_PATH)
df = titanic.copy()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Binary Feature Extraction

In [6]:
# Number of rows with no Cabin value.
df["Cabin"].isna().sum()

687

In [9]:
# Number of rows that do have a Cabin value (total rows minus the missing ones).
df["Cabin"].notnull().sum()

204

In [10]:
# Missing-value count for every column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
# 1 when a Cabin value is recorded, 0 when it is missing.
cabin_is_recorded = df["Cabin"].notna()
df["new_cabin_bool"] = cabin_is_recorded.astype(int)

In [16]:
# Mean of Survived (i.e. survival rate) for each value of the cabin flag.
df["Survived"].groupby(df["new_cabin_bool"]).mean()

new_cabin_bool
0   0.300
1   0.667
Name: Survived, dtype: float64

In [19]:
# Survivor count among rows whose cabin flag is 1.
has_cabin = df["new_cabin_bool"] == 1
df.loc[has_cabin, "Survived"].sum()

136

In [20]:
# Survivor count among rows whose cabin flag is 0.
no_cabin = df["new_cabin_bool"] == 0
df.loc[no_cabin, "Survived"].sum()

206

In [27]:
# Number of observations whose cabin flag is 1.
len(df.loc[df["new_cabin_bool"] == 1, "Survived"])

204

In [28]:
# Number of observations whose cabin flag is 0.
len(df.loc[df["new_cabin_bool"] == 0, "Survived"])

687

In [32]:
def proportions_ztest_binary(dataframe, new_col, target):
  """Compare the rate of `target` between the new_col == 1 and new_col == 0 groups.

  Runs `proportions_ztest` with the sum of `target` in each group as the success
  count and the number of rows in each group as the number of observations, then
  prints the test statistic, the p-value and the decision at the 0.05 level.

  Args:
    dataframe: DataFrame holding both columns.
    new_col: Name of the flag column; rows are split on the values 1 and 0.
    target: Name of the column summed within each group (assumed to be a 0/1
      outcome, so that the sum is a success count -- confirm for new targets).

  Returns:
    None. The results are printed.
  """
  # Select each group's target values once and reuse them for count and size.
  target_when_1 = dataframe.loc[dataframe[new_col]==1, target]
  target_when_0 = dataframe.loc[dataframe[new_col]==0, target]
  # The printed label says "T-istatistigi", but the value is the statistic
  # returned by the proportions z-test.
  T_istatistigi, p_value = proportions_ztest(count=[target_when_1.sum(), target_when_0.sum()],
                                             nobs=[target_when_1.shape[0], target_when_0.shape[0]])
  print("T-istatistigi: %.4f, P-Value: %.4f" %(T_istatistigi, p_value))
  if p_value < 0.05:
    print("The H0 hypothesis is rejected. There is a siginificant diffirence")
  else:
    # A non-significant result means H0 could not be rejected; the test never
    # "rejects" or "fails to reject" H1.
    print("The H0 hypothesis cannot be rejected. There is no siginificant diffirence")

In [33]:
def binary_features(dataframe, col_name, new_col_name, target):
  """Add a presence flag for `col_name` and report how `target` differs across it.

  Writes `new_col_name` into `dataframe` in place (1 where `col_name` is not
  null, 0 where it is), prints the mean of `target` per flag value, and then
  runs `proportions_ztest_binary` on the new flag.

  Args:
    dataframe: DataFrame to modify.
    col_name: Column whose null / non-null status is encoded.
    new_col_name: Name of the flag column to create or overwrite.
    target: Column whose per-group mean is printed and tested.
  """
  is_present = dataframe[col_name].notnull()
  dataframe[new_col_name] = is_present.astype(int)
  target_mean_by_flag = dataframe.groupby(new_col_name)[target].mean()
  print(target_mean_by_flag)
  proportions_ztest_binary(dataframe, new_col_name, target)

In [34]:
# Build the cabin flag and test whether survival differs between its two groups.
binary_features(dataframe=df, col_name="Cabin", new_col_name="new_cabin_bool", target="Survived")

new_cabin_bool
0   0.300
1   0.667
Name: Survived, dtype: float64
T-istatistigi: 9.4597, P-Value: 0.0000
The H0 hypothesis is rejected. There is a siginificant diffirence


In [133]:
# Label rows "YES" when SibSp + Parch is zero and "NO" when it is positive.
sibsp_plus_parch = df["SibSp"] + df["Parch"]
df.loc[sibsp_plus_parch > 0, "NEW_IS_ALONE"] = "NO"
df.loc[sibsp_plus_parch == 0, "NEW_IS_ALONE"] = "YES"

In [138]:
# Two-sample proportions z-test of Survived between the "YES" and "NO" groups:
# successes are the Survived sums, observations are the group sizes.
survived_alone = df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"]
survived_not_alone = df.loc[df["NEW_IS_ALONE"] == "NO", "Survived"]
T_istatistigi, p_value = proportions_ztest(
  count=[survived_alone.sum(), survived_not_alone.sum()],
  nobs=[survived_alone.shape[0], survived_not_alone.shape[0]],
)

In [140]:
p_value

1.2756752813177572e-09

In [139]:
# Decision at the 0.05 significance level. A p-value at or above the threshold
# means H0 could not be rejected; H1 is never the hypothesis being "rejected".
if p_value < 0.05:
  print("The H0 hypothesis is rejected. There is a siginificant diffirence")
else:
  print("The H0 hypothesis cannot be rejected. There is no siginificant diffirence")

The H0 hypothesis is rejected. There is a siginificant diffirence


# Text Feature Extraction

In [41]:
# Length of each Name string in characters (spaces and punctuation are counted).
df["NEW_NAME_LETTER_COUNT"] = df["Name"].str.len()

In [44]:
# Number of space-separated pieces in each name: one more than the number of
# single-space characters in its string form.
df["NEW_NAME_WORD_COUNT"] = df["Name"].apply(lambda name: str(name).count(" ") + 1)

In [47]:
# Count the whitespace-separated words in each name that start with "Dr".
# NOTE(review): this is a prefix match, so any word beginning with "Dr" is counted,
# not only the title "Dr." -- confirm whether that is intended.
df["NEW_NAME_DR"] = df["Name"].apply(
  lambda name: sum(1 for word in name.split() if word.startswith("Dr"))
)

In [50]:
# Number of rows with at least one "Dr"-prefixed word in the name.
df.loc[df["NEW_NAME_DR"] > 0].shape[0]

10

In [51]:
# Survival rate and group size for each NEW_NAME_DR value.
df.groupby("NEW_NAME_DR")[["Survived"]].agg(["mean", "count"])

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
NEW_NAME_DR,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.383,881
1,0.5,10


# Regex Feature Extraction

In [55]:
# Extract the title from each name: the run of letters that follows a space and is
# terminated by a period (e.g. "Mr", "Mrs", "Miss"). The pattern is a raw string so
# that `\.` reaches the regex engine as an escaped dot instead of being read by
# Python as an invalid string escape, which triggers a warning.
df["NEW_TITLE"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [60]:
# Per title: how many Survived / Age values are present and their means.
df.groupby("NEW_TITLE")[["Survived", "Age"]].agg(["count", "mean"])

Unnamed: 0_level_0,Survived,Survived,Age,Age
Unnamed: 0_level_1,count,mean,count,mean
NEW_TITLE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Capt,1,0.0,1,70.0
Col,2,0.5,2,58.0
Countess,1,1.0,1,33.0
Don,1,0.0,1,40.0
Dr,7,0.429,6,42.0
Jonkheer,1,0.0,1,38.0
Lady,1,1.0,1,48.0
Major,2,0.5,2,48.5
Master,40,0.575,36,4.574
Miss,182,0.698,146,21.774


In [61]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,new_cabin_bool,NEW_NAME_LETTER_COUNT,NEW_NAME_WORD_COUNT,NEW_NAME_DR,NEW_TITLE
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,23,4,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,1,51,7,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,22,3,0,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,44,7,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,24,4,0,Mr


# Date Feature Extraction

### Import Necessary Libraries

In [72]:
from datetime import date

### Import Dataset

In [92]:
# Load the course reviews CSV used for the date features and preview it.
COURSE_REVIEWS_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/course_reviews.csv"
new_df = pd.read_csv(COURSE_REVIEWS_CSV_PATH)
new_df.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0


In [93]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rating              4323 non-null   float64
 1   Timestamp           4323 non-null   object 
 2   Enrolled            4323 non-null   object 
 3   Progress            4323 non-null   float64
 4   Questions Asked     4323 non-null   float64
 5   Questions Answered  4323 non-null   float64
dtypes: float64(4), object(2)
memory usage: 202.8+ KB


In [94]:
# Convert the Timestamp strings (e.g. "2021-02-05 07:45:55") to datetime64. The
# format must describe the time-of-day part as well: a date-only "%Y-%m-%d" does not
# match these values, and strict format parsing rejects the leftover " HH:MM:SS".
new_df["Timestamp"] = pd.to_datetime(new_df["Timestamp"], format="%Y-%m-%d %H:%M:%S")

In [95]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Rating              4323 non-null   float64       
 1   Timestamp           4323 non-null   datetime64[ns]
 2   Enrolled            4323 non-null   object        
 3   Progress            4323 non-null   float64       
 4   Questions Asked     4323 non-null   float64       
 5   Questions Answered  4323 non-null   float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 202.8+ KB


In [96]:
new_df.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0


In [97]:
# Calendar year of each Timestamp value.
new_df["Year"] = new_df["Timestamp"].dt.year

In [98]:
# Month number of each Timestamp value.
new_df["Month"] = new_df["Timestamp"].dt.month

In [99]:
# Whole calendar years between the Timestamp and the day the cell is run.
# The result depends on the run date, so it changes from one execution to the next.
current_year = date.today().year
new_df["Year_Diff"] = current_year - new_df["Timestamp"].dt.year

In [106]:
# Calendar months between the Timestamp and the day the cell is run:
# year difference * 12 plus the month difference. Depends on the run date.
today = date.today()
timestamp_parts = new_df["Timestamp"].dt
new_df["Month_Diff"] = (today.year - timestamp_parts.year) * 12 + (today.month - timestamp_parts.month)

In [108]:
# Weekday name (e.g. "Friday") of each Timestamp value.
new_df["Day_Name"] = new_df["Timestamp"].dt.day_name()

In [109]:
new_df.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,Year,Month,Year_Diff,Month_Diff,Day_Name
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,2021,2,3,37,Friday
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,2021,2,3,37,Thursday
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,2021,2,3,37,Thursday
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,2021,2,3,37,Thursday
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,2021,2,3,37,Thursday


# Interaction Feature Extraction

### Import Dataset

In [117]:
# Re-read the Titanic CSV and rebind `df` to a fresh copy, so this section starts
# from a frame without the columns added in the earlier sections.
TITANIC_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/titanic.csv"
titanic = pd.read_csv(TITANIC_CSV_PATH)
df = titanic.copy()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [122]:
# Combine Sex with an age band into a single categorical label.
# Age bands: <= 21 is "young", strictly between 21 and 50 is "mature", >= 50 is
# "senior". Rows with a missing Age match no band and are left without a label.
is_male = df["Sex"] == "male"
is_female = df["Sex"] == "female"
age_young = df["Age"] <= 21
age_mature = (df["Age"] > 21) & (df["Age"] < 50)
age_senior = df["Age"] >= 50

df.loc[is_male & age_young, "NEW_SEX_CAT"] = "youngmale"
df.loc[is_male & age_mature, "NEW_SEX_CAT"] = "maturemale"
df.loc[is_male & age_senior, "NEW_SEX_CAT"] = "seniormale"
df.loc[is_female & age_young, "NEW_SEX_CAT"] = "youngfemale"
df.loc[is_female & age_mature, "NEW_SEX_CAT"] = "maturefemale"
df.loc[is_female & age_senior, "NEW_SEX_CAT"] = "seniorfemale"

In [123]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_SEX_CAT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,maturemale
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,maturefemale
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,maturefemale
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,maturefemale
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,maturemale


In [124]:
# Survival rate and group size for each sex/age-band label.
df.groupby("NEW_SEX_CAT")[["Survived"]].agg(["mean", "count"])

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
NEW_SEX_CAT,Unnamed: 1_level_2,Unnamed: 2_level_2
maturefemale,0.774,155
maturemale,0.199,281
seniorfemale,0.909,22
seniormale,0.135,52
youngfemale,0.679,84
youngmale,0.25,120


In [126]:
# Interaction term: Age multiplied by Pclass (missing wherever Age is missing).
df["NEW_AGE_PCLASS"] = df["Age"].mul(df["Pclass"])

In [128]:
# SibSp + Parch, plus one for the passenger themselves.
df["NEW_FAMILY_SIZE"] = df["SibSp"].add(df["Parch"]).add(1)

In [130]:
# Label rows "YES" when SibSp + Parch is zero and "NO" when it is positive.
sibsp_plus_parch = df["SibSp"] + df["Parch"]
df.loc[sibsp_plus_parch > 0, "NEW_IS_ALONE"] = "NO"
df.loc[sibsp_plus_parch == 0, "NEW_IS_ALONE"] = "YES"

In [131]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_SEX_CAT,NEW_AGE_PCLASS,NEW_FAMILY_SIZE,NEW_IS_ALONE
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,maturemale,66.0,2,NO
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,maturefemale,38.0,2,NO
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,maturefemale,78.0,1,YES
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,maturefemale,35.0,2,NO
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,maturemale,105.0,1,YES


In [132]:
# Survival rate and group size for the "YES" / "NO" alone flag.
df.groupby("NEW_IS_ALONE")[["Survived"]].agg(["mean", "count"])

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
NEW_IS_ALONE,Unnamed: 1_level_2,Unnamed: 2_level_2
NO,0.506,354
YES,0.304,537
