## Feature Engineering - Part 2

#### Tree-Based Methods - Feature Importance:
- The Wine dataset is a multi-class classification problem. Tree-based models like Random Forests can provide valuable insights into feature importance across multiple classes.

In [31]:
from sklearn.datasets import load_wine
# Fetch the Wine dataset that ships with scikit-learn.
data = load_wine()

# Split the bundle into the feature matrix and the class labels.
X = data.data
y = data.target

# Sanity check: array shapes, then the column names and the raw label vector.
print(X.shape, y.shape, sep="\n")
print(data.feature_names, y, sep="\n")

(178, 13)
(178,)
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [30]:
from sklearn.ensemble import RandomForestClassifier


# Feature matrix and class labels of the dataset currently bound to `data`.
# NOTE: `data` is rebound to a different dataset later in this notebook, so
# re-run the Wine loading cell first if the cells were executed out of order;
# otherwise the importances below describe the wrong dataset.
X, y = data.data, data.target

# Train a RandomForestClassifier.
# A fixed random_state makes the bootstrap samples and split choices (and thus
# the importance scores and the ranking derived from them) reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Get feature importance scores: one value per column of X.
feature_importances = model.feature_importances_
feature_importances

array([0.09865709, 0.02132171, 0.37879938, 0.50122182])

In [26]:
# How many of the highest-scoring features to keep.
top_k = 5

# argsort ranks ascending, so reverse it to list the most important feature first.
sorted_indices = feature_importances.argsort()[::-1]
print(sorted_indices)

# The leading top_k entries of the ranking are the selected features.
top_k_indices = sorted_indices[:top_k]

# Report the selection both as column indices and as readable names.
feature_names = data.feature_names
top_k_names = [feature_names[idx] for idx in top_k_indices]
print("Top", top_k, "feature indices:", top_k_indices)
print("Top", top_k, "feature names:", top_k_names)

[12  6  9  0 11 10  5  4  1  3  8  2  7]
Top 5 feature indices: [12  6  9  0 11]
Top 5 feature names: ['proline', 'flavanoids', 'color_intensity', 'alcohol', 'od280/od315_of_diluted_wines']


#### Sequential Feature Selection
- The Iris dataset is a small, multi-class classification task. Sequential feature selection helps narrow down the feature set for better interpretability, which is often important in smaller datasets.

In [61]:
from sklearn.datasets import load_iris
# Fetch the Iris dataset that ships with scikit-learn.
# This rebinds `data`, `X` and `y`, replacing whatever dataset they held before.
data = load_iris()

# Split the bundle into the feature matrix and the class labels.
X = data.data
y = data.target

# Sanity check: array shapes, then the column names and the raw label vector.
print(X.shape, y.shape, sep="\n")
print(data.feature_names, y, sep="\n")

(150, 4)
(150,)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [39]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix and class labels of the dataset currently bound to `data`.
X = data.data
y = data.target

# Base estimator the selector uses to evaluate candidate feature subsets.
model = KNeighborsClassifier()

# Forward selection (forward=True), stopping at the requested subset size.
num_features_to_select = 2
sfs = SequentialFeatureSelector(
    model,
    k_features=num_features_to_select,
    forward=True,
)
sfs.fit(X, y)

# Column indices picked by the selector, translated to readable names.
selected_feature_indices = sfs.k_feature_idx_
feature_names = data.feature_names
selected_feature_names = [feature_names[idx] for idx in selected_feature_indices]

# Show which features were kept.
print("Selected feature names:", selected_feature_names)


Selected feature names: ['petal length (cm)', 'petal width (cm)']


#### Additional Info: Building a DataFrame and Pearson Correlation

In [69]:
# Build the frame in this cell so it runs on a fresh kernel: previously `df`
# was only created further down the notebook, so displaying it here raised
# NameError under Restart & Run All.
import numpy as np
import pandas as pd

# np.c_ stacks X and y column-wise; it accepts y as a 1-D vector or as a
# single-column 2-D array, so this works before or after y is reshaped.
df = pd.DataFrame(data=np.c_[X, y], columns=data.feature_names + ['target'])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [68]:
#Pearson correlation
import numpy as np
import pandas as pd
# Put the features and the target side by side, one named column each.
column_labels = data.feature_names + ['target']
df = pd.DataFrame(np.column_stack((X, y)), columns=column_labels)

# Pairwise correlation matrix (DataFrame.corr defaults to Pearson).
df.corr()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
sepal length (cm),1.0,-0.11757,0.871754,0.817941,0.782561
sepal width (cm),-0.11757,1.0,-0.42844,-0.366126,-0.426658
petal length (cm),0.871754,-0.42844,1.0,0.962865,0.949035
petal width (cm),0.817941,-0.366126,0.962865,1.0,0.956547
target,0.782561,-0.426658,0.949035,0.956547,1.0


In [62]:
# Turn the label vector into a single-column 2-D array so it can be joined
# to X along axis 1.
y = y.reshape((len(y), 1))

In [63]:
# Confirm that y is now a column: (n_samples, 1).
y.shape

(150, 1)

In [64]:
# X must have the same number of rows as y for the column-wise join below.
X.shape

(150, 4)

In [66]:
# Append the label column to the right of the feature matrix
# (both operands are 2-D, so the stack runs along axis 1).
D = np.hstack((X, y))

In [67]:
# D should have one more column than X: the appended target.
D.shape

(150, 5)

In [71]:
import numpy as np
import pandas as pd
# Wrap the combined array in a DataFrame: feature names label the leading
# columns and the appended last column is named 'target'.
df = pd.DataFrame(D, columns=[*data.feature_names, 'target'])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0
