## Feature extraction
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction

In [None]:
# 1. One-hot encoding
from sklearn.feature_extraction import DictVectorizer

# Toy records: each one pairs a string-valued field ('city') with a
# numeric field ('temperature').
features = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]

# Vectorizer that will turn the list of dicts into a feature matrix.
vec = DictVectorizer()


In [None]:
# Learn the feature names from `features` and encode the records in one step.
X =  vec.fit_transform(features)
# Trailing expression: the notebook displays the encoded matrix as a dense array.
X.toarray()

In [None]:
# Show the column names of the encoded matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (it returns an ndarray
# of strings instead of a list).
print(vec.get_feature_names_out())

In [None]:
# Exercise: one-hot encode the HW2 dataset.


In [None]:
# Bag-of-words
from sklearn.feature_extraction.text import CountVectorizer

# Four short documents used as the running text example.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Build the vocabulary from the corpus and count token occurrences per document.
vec = CountVectorizer()
X = vec.fit_transform(corpus)

# Trailing expression: the notebook displays the counts as a dense array.
X.toarray()

In [None]:
# ngram_range=(1,2) asks the vectorizer for both single words and two-word
# sequences as features.
twogram_vec = CountVectorizer(ngram_range=(1,2))
# Note: this rebinds X, replacing the matrix from the earlier cell.
X = twogram_vec.fit_transform(corpus)
# Trailing expression: the notebook displays the counts as a dense array.
X.toarray()

In [None]:
# List the unigram and bigram features learned by `twogram_vec`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (returns an ndarray).
twogram_vec.get_feature_names_out()

In [None]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with default settings, fitted on the same `corpus` as the
# bag-of-words example.
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(corpus)
# Trailing expression: the notebook displays the weights as a dense array.
X_tfidf.toarray()

In [None]:
# List the vocabulary terms, in column order, of the CountVectorizer `vec`.
# NOTE(review): this inspects `vec` (the bag-of-words vectorizer), not
# `tfidf_vec` -- confirm that is the intended object.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (returns an ndarray).
vec.get_feature_names_out()

In [None]:
# Look up the entry stored for the token 'second' in the fitted vocabulary
# mapping (presumably its column index in the count matrix -- see the
# CountVectorizer docs). `.get` avoids a KeyError for unknown tokens.
vec.vocabulary_.get('second')

In [None]:
# Encode new sentences with the already-fitted vocabulary. Tokens that were
# not seen during fit ('something', 'completely', 'new') have no column, so
# only known tokens ('document', 'and') contribute counts.
# Both results are printed: a bare expression that is not the last statement
# of a notebook cell is evaluated but never displayed.
print(vec.transform(['Something completely new document.']).toarray())
print(vec.transform(['Something completely and.']).toarray())

## Feature selection
https://scikit-learn.org/stable/modules/feature_selection.html

#### L1 regularization gives sparse solutions

In [None]:
print(__doc__)

# Author: Fabian Pedregosa <fabian.pedregosa@inria.fr>
#         Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, linear_model

# Diabetes regression data: feature matrix and target vector.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

print("Computing regularization path using the LARS ...")
# lars_path returns three values here; only the last one (the coefficient
# path) is used below.
_, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True)

# One row per entry along the second axis of `coefs` (presumably one row per
# step of the path -- see the lars_path docs).
coef_path = coefs.T

# x-axis: sum of absolute coefficients in each row, scaled so that the last
# row maps to 1.
xx = np.abs(coef_path).sum(axis=1)
xx = xx / xx[-1]

# One curve per coefficient, with a dashed vertical line at every x position.
plt.plot(xx, coef_path)
y_lo, y_hi = plt.ylim()
plt.vlines(xx, y_lo, y_hi, linestyle='dashed')

plt.title('LASSO Path')
plt.xlabel('|coef| / max|coef|')
plt.ylabel('Coefficients')
plt.axis('tight')
plt.show()

#### Tree-based feature selection
The relative rank (i.e. depth) of a feature used as a decision node in a tree can be used to assess the relative importance of that feature with respect to the predictability of the target variable. Features used at the top of the tree contribute to the final prediction decision of a larger fraction of the input samples. The expected fraction of the samples they contribute to can thus be used as an estimate of the relative importance of the features. In scikit-learn, the fraction of samples a feature contributes to is combined with the decrease in impurity from splitting them to create a normalized estimate of the predictive power of that feature.

By averaging the estimates of predictive ability over several randomized trees one can reduce the variance of such an estimate and use it for feature selection. This is known as the mean decrease in impurity, or MDI. Refer to [L2014] for more information on MDI and feature importance evaluation with Random Forests. (https://scikit-learn.org/stable/modules/ensemble.html#feature-importance-evaluation)

In [None]:
print(__doc__)
%matplotlib inline
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces
from sklearn.ensemble import ExtraTreesClassifier

# Number of cores to use to perform parallel fitting of the forest model
n_jobs = 1

# Load the faces dataset
data = fetch_olivetti_faces()
X = data.images.reshape((len(data.images), -1))
y = data.target

mask = y < 5  # Limit to 5 classes
X = X[mask]
y = y[mask]

# Build a forest and compute the pixel importances
print("Fitting ExtraTreesClassifier on faces data with %d cores..." % n_jobs)
t0 = time()
forest = ExtraTreesClassifier(n_estimators=1000,
                              max_features=128,
                              n_jobs=n_jobs,
                              random_state=0)

forest.fit(X, y)
print("done in %0.3fs" % (time() - t0))
importances = forest.feature_importances_
importances = importances.reshape(data.images[0].shape)

# Plot pixel importances
plt.matshow(importances, cmap=plt.cm.hot)
plt.title("Pixel importances with forests of trees")
plt.show()

Suggested reading: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html <br>
References: the official scikit-learn documentation.