In [1]:
# Toy housing records: two numeric fields and one categorical field per listing.
data = [
    dict(price=850000, rooms=4, neighborhood='Queen Anne'),
    dict(price=700000, rooms=3, neighborhood='Fremont'),
    dict(price=650000, rooms=3, neighborhood='Wallingford'),
    dict(price=600000, rooms=2, neighborhood='Fremont'),
]

In [2]:
# Echo the list of record dicts defined in the previous cell.
data

[{'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
 {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
 {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
 {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}]

In [3]:
from sklearn.feature_extraction import DictVectorizer

In [4]:
# One-hot encode the string-valued 'neighborhood' field and pass the numeric
# fields through unchanged; sparse=False returns a dense integer ndarray.
vec = DictVectorizer(sparse=False, dtype=int)
vec.fit_transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])

In [5]:
# Column labels for the encoded matrix, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
# list(...) keeps the displayed value a plain list in both cases.
list(
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)

['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'rooms']

In [7]:
# Same encoding as the dense example, but sparse=True returns a SciPy sparse
# matrix instead of a dense array. Rebinds `vec` to the new vectorizer.
vec = DictVectorizer(sparse=True, dtype=int)
vec.fit_transform(data)

<4x5 sparse matrix of type '<class 'numpy.int32'>'
	with 12 stored elements in Compressed Sparse Row format>

In [14]:
# Three tiny documents used as the corpus for the text-vectorizer examples.
sample = [
    'problem problem problem of evil',
    'evil queen',
    'horizon problem',
]

In [15]:
# Echo the three-document corpus defined in the previous cell.
sample

['problem problem problem of evil', 'evil queen', 'horizon problem']

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Build the vocabulary from `sample` and produce the per-document word-count
# matrix. Rebinds `vec` (previously a DictVectorizer) and binds `x`.
vec = CountVectorizer()
x = vec.fit_transform(sample)

In [18]:
# Show the repr of the count matrix returned by fit_transform.
x

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [19]:
import pandas as pd

# Densify the count matrix and label each column with its vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
pd.DataFrame(
    x.toarray(),
    columns=(
        vec.get_feature_names_out()
        if hasattr(vec, "get_feature_names_out")
        else vec.get_feature_names()
    ),
)

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,3,0
1,1,0,0,0,1
2,0,1,0,1,0


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Vectorize the same corpus with TF-IDF weighting instead of raw counts.
# Rebinds both `vec` and `x`, replacing the CountVectorizer results.
vec = TfidfVectorizer()
x = vec.fit_transform(sample)

In [22]:
# Show the repr of the TF-IDF matrix returned by fit_transform.
x

<3x5 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [23]:
# Densify the TF-IDF matrix and label each column with its vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
pd.DataFrame(
    x.toarray(),
    columns=(
        vec.get_feature_names_out()
        if hasattr(vec, "get_feature_names_out")
        else vec.get_feature_names()
    ),
)

Unnamed: 0,evil,horizon,of,problem,queen
0,0.291992,0.0,0.383935,0.875976,0.0
1,0.605349,0.0,0.0,0.0,0.795961
2,0.0,0.795961,0.0,0.605349,0.0


In [25]:
import numpy as np
from numpy import nan

# Feature matrix (5 rows x 3 columns) with two missing entries:
# row 0 / column 0 and row 3 / column 1.
X = np.array([
    [nan, 0, 3],
    [3, 7, 9],
    [3, 5, 2],
    [4, nan, 6],
    [8, 8, 1],
])

# One target value per row of X.
y = np.array([14, 16, -1, 8, -5])

In [27]:
from sklearn.impute import SimpleImputer

In [28]:
# Fill each NaN in X with the mean of the observed values in its column;
# the completed matrix is bound to `x2`, and X itself is not reassigned.
imp = SimpleImputer(strategy='mean')
x2 = imp.fit_transform(X)

In [29]:
# Show the imputed feature matrix.
x2

array([[4.5, 0. , 3. ],
       [3. , 7. , 9. ],
       [3. , 5. , 2. ],
       [4. , 5. , 6. ],
       [8. , 8. , 1. ]])

In [30]:
from sklearn.linear_model import LinearRegression

In [32]:
# Fit an ordinary linear regression on the imputed features, then predict on
# those same rows (these are in-sample predictions, not a held-out evaluation).
model = LinearRegression()
model.fit(x2, y)
model.predict(x2)

array([13.14869292, 14.3784627 , -1.15539732, 10.96606197, -5.33782027])

In [34]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [35]:
# Chain imputation, degree-2 polynomial feature expansion and linear regression
# into one estimator so raw X (with NaNs) can be passed straight to fit/predict.
# Rebinds `model`, replacing the plain LinearRegression fitted earlier.
model = make_pipeline(
    SimpleImputer(strategy='mean'),
    PolynomialFeatures(degree=2),
    LinearRegression(),
)

In [36]:
# Fit every pipeline step in order on the raw X (NaNs included) and targets y.
model.fit(X,y)

Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])

In [37]:
# Show the target values for comparison with the pipeline's predictions.
y

array([14, 16, -1,  8, -5])

In [38]:
# Predict on the same X the pipeline was fitted on (in-sample predictions).
# NOTE(review): with only 5 rows and a degree-2 feature expansion, an exact
# match with y is expected and says nothing about generalisation — confirm on
# held-out data before drawing conclusions.
model.predict(X)

array([14., 16., -1.,  8., -5.])