<a href="https://colab.research.google.com/github/wkambale/Feature-Extraction-in-ML/blob/main/Feature_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Techniques of Feature Extraction in Machine Learning

From Numerical Scaling to Image Descriptors: Mastering Feature Extraction for Enhanced Machine Learning Model Performance

Link to Article: https://kambale.dev/feature-extraction-in-ml

NB: To use this notebook, make a copy first.

MIT License: Copyright (c) 2023 **Wesley Kambale**

# Numerical Feature Extraction

In [None]:
# Scaling: min-max normalisation of a small two-column sample
from sklearn.preprocessing import MinMaxScaler

data = [
    [10, 0.5],
    [20, 0.8],
    [15, 0.7],
]

# Fit the scaler on the sample first ...
scaler = MinMaxScaler().fit(data)

# ... then rescale that same sample with the fitted scaler
scaled_features = scaler.transform(data)

In [None]:
# Binning: map continuous values onto equal-width intervals
import numpy as np

data = [2.5, 3.7, 1.9, 4.2, 5.1, 2.8]

# Four equal-width bins over [1, 6].
# n bins are delimited by n + 1 edges, so linspace must be asked for one more
# point than the number of bins (asking for 4 points would yield only 3 bins).
n_bins = 4
bins = np.linspace(1, 6, n_bins + 1)

# Assign each value the 1-based index of the bin it falls into
binned_features = np.digitize(data, bins)

In [None]:
# Aggregation: collapse the rows of a small table into per-column statistics
import numpy as np

data = [
    [10, 20, 15],
    [5, 10, 8],
    [12, 18, 20],
]

# View the nested list as a 2-D array so both statistics share one input
samples = np.asarray(data)

# Column-wise mean (axis=0 reduces over the rows)
mean_features = samples.mean(axis=0)

# Column-wise median
median_features = np.median(samples, axis=0)

In [None]:
# Polynomial Features: expand the two input columns with degree-2 terms
from sklearn.preprocessing import PolynomialFeatures

data = [
    [2, 3],
    [1, 4],
    [5, 2],
]

# Transformer configured for terms up to degree 2
poly = PolynomialFeatures(degree=2)

# Learn the feature layout from the sample, then expand that same sample
poly.fit(data)
polynomial_features = poly.transform(data)

# Categorical Feature Extraction

In [None]:
# One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder

data = [['Red'], ['Blue'], ['Green'], ['Red']]

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder()  

# Apply one-hot encoding
onehot_features = encoder.fit_transform(data).toarray()  

In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

data = ['Low', 'High', 'Medium', 'Low']

 # Create an instance of the LabelEncoder
encoder = LabelEncoder() 

 # Apply label encoding
encoded_features = encoder.fit_transform(data) 

In [None]:
# Frequency Encoding: replace each category with its share of the sample
import pandas as pd

data = pd.Series(['Apple', 'Banana', 'Apple', 'Orange', 'Banana'])

# Relative frequency of every category (normalize=True -> fractions, not counts)
frequency = data.value_counts(normalize=True)

# Plain category -> share lookup table built from those frequencies
category_share = frequency.to_dict()

# Swap every category for its share
encoded_features = data.map(category_share)

In [None]:
# Target Encoding: replace each category with the mean target seen for it
import pandas as pd

data = pd.DataFrame(
    {
        'Category': ['A', 'B', 'A', 'B'],
        'Target': [1, 0, 1, 1],
    }
)

# Average of the Target column within each Category value
target_mean = data['Target'].groupby(data['Category']).mean()

# Look up every row's category in the per-category means
encoded_features = data['Category'].map(target_mean)

# Text Feature Extraction

In [None]:
# Bag-of-Words (BoW): represent each document by its token counts
from sklearn.feature_extraction.text import CountVectorizer

data = [
    'I love dogs',
    'I hate cats',
    'Dogs are cute',
]

# Count-based vectorizer with default settings
vectorizer = CountVectorizer()

# Build the vocabulary from the documents and count tokens in one pass
bow_features = vectorizer.fit_transform(data)

In [None]:
# Term Frequency-Inverse Document Frequency (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

data = [
    'I love dogs',
    'I hate cats',
    'Dogs are cute',
]

# TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and weights, then vectorize the same documents
tfidf_features = vectorizer.fit_transform(data)

In [None]:
# Word Embeddings: learn dense vectors for the tokens of a tiny corpus
import gensim
from gensim.models import Word2Vec

# Pre-tokenised sentences. Tokens are kept exactly as written, so 'dogs' and
# 'Dogs' below are two different vocabulary entries.
data = [['I', 'love', 'dogs'], ['I', 'hate', 'cats'], ['Dogs', 'are', 'cute']]

# Create a Word2Vec model.
# min_count=1 keeps every token even though each appears only once or twice.
# Training is stochastic: a fixed seed together with a single worker thread
# makes repeated runs comparable (full determinism may additionally require a
# fixed PYTHONHASHSEED - see the gensim documentation).
model = Word2Vec(data, min_count=1, seed=42, workers=1)

# Obtain the word embedding for the lowercase token 'dogs'
word_embedding = model.wv['dogs']

In [None]:
# N-grams: count single tokens and adjacent token pairs
from sklearn.feature_extraction.text import CountVectorizer

data = [
    'I love dogs',
    'I hate cats',
    'Dogs are cute',
]

# ngram_range=(1, 2) asks for both unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Build the n-gram vocabulary and count occurrences per document
ngram_features = vectorizer.fit_transform(data)

# Image Feature Extraction

In [None]:
# Histogram of Oriented Gradients (HOG)
import cv2

# Placeholder path - point this at an image that exists in your environment
image_path = 'image.jpg'

# Load as a single-channel grayscale image (IMREAD_GRAYSCALE is the flag 0)
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing or unreadable file; it returns None,
# which would otherwise only fail later with an obscure error inside compute()
if image is None:
    raise FileNotFoundError(f"Could not read image file: {image_path!r}")

# Descriptor with OpenCV's default HOG parameters
hog = cv2.HOGDescriptor()

# Compute the HOG feature vector for the image
hog_features = hog.compute(image)

In [None]:
# Scale-Invariant Feature Transform (SIFT)
import cv2

# Placeholder path - point this at an image that exists in your environment
image_path = 'image.jpg'

# Load as a single-channel grayscale image (IMREAD_GRAYSCALE is the flag 0)
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise on a missing or unreadable file; it returns None,
# so fail here with a clear message instead of inside detectAndCompute()
if image is None:
    raise FileNotFoundError(f"Could not read image file: {image_path!r}")

# SIFT detector/extractor with default parameters
sift = cv2.SIFT_create()

# Detect keypoints and compute their descriptors (None = no mask)
keypoints, descriptors = sift.detectAndCompute(image, None)

In [None]:
# Convolutional Neural Networks (CNNs)
# numpy is imported here because this cell calls np.expand_dims; without it
# the cell only works when an earlier cell has already imported numpy.
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16

# Load the image resized to 224x224 and convert it to an array
image = tf.keras.preprocessing.image.load_img('image.jpg', target_size=(224, 224))
image = tf.keras.preprocessing.image.img_to_array(image)

# Apply the VGG16-specific input preprocessing
image = tf.keras.applications.vgg16.preprocess_input(image)

# VGG16 with ImageNet weights and without its top layers
vgg_model = VGG16(weights='imagenet', include_top=False)

# Add a leading batch axis (a batch containing one image) and extract features
features = vgg_model.predict(np.expand_dims(image, axis=0))

In [None]:
# Pre-trained Models: use an intermediate VGG16 layer as the feature extractor
# numpy is imported here because this cell calls np.expand_dims; without it
# the cell only works when an earlier cell has already imported numpy.
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16

# Load the image resized to 224x224 and convert it to an array
image = tf.keras.preprocessing.image.load_img('image.jpg', target_size=(224, 224))
image = tf.keras.preprocessing.image.img_to_array(image)

# Apply the VGG16-specific input preprocessing
image = tf.keras.applications.vgg16.preprocess_input(image)

# VGG16 with ImageNet weights and without its top layers
vgg_model = VGG16(weights='imagenet', include_top=False)

# Model that shares VGG16's input but stops at the 'block4_pool' layer output
intermediate_layer_model = tf.keras.Model(inputs=vgg_model.input, outputs=vgg_model.get_layer('block4_pool').output)

# Add a leading batch axis (a batch containing one image) and extract features
features = intermediate_layer_model.predict(np.expand_dims(image, axis=0))

# Further Feature Extraction Techniques

In [None]:
# Principal Component Analysis (PCA)
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

# Example input so this cell runs on a fresh kernel (X was previously assumed
# to exist already). Replace X with your own (n_samples, n_features) data.
X = load_iris().data

# Specify the number of components you want to extract
pca = PCA(n_components=2)

# X_pca contains the extracted features with reduced dimensionality
X_pca = pca.fit_transform(X)

In [None]:
# Independent Component Analysis (ICA)
from sklearn.datasets import load_iris
from sklearn.decomposition import FastICA

# Example input so this cell runs on a fresh kernel (X was previously assumed
# to exist already). Replace X with your own (n_samples, n_features) data.
X = load_iris().data

# Specify the number of components you want to extract.
# random_state is fixed so repeated runs give the same result.
ica = FastICA(n_components=2, random_state=0)

# X_ica contains the extracted features with reduced dimensionality
X_ica = ica.fit_transform(X)


In [None]:
# Feature Selection
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

# Example input so this cell runs on a fresh kernel (X and y were previously
# assumed to exist already). The digits data is used because chi2 requires
# non-negative features and k=10 needs at least 10 features to choose from.
# Replace X and y with your own features and target labels.
X, y = load_digits(return_X_y=True)

# Select the top 10 features according to the chi-squared score
selector = SelectKBest(score_func=chi2, k=10)

# X_new contains the selected features
X_new = selector.fit_transform(X, y)
