In [None]:
%pylab inline

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn
from sklearn.metrics import mean_squared_error
# Global plot styling: white background and the larger 'talk' context.
seaborn.set_style('white')
seaborn.set_context('talk')
# Seed NumPy's global RNG so the randomly generated example data below is
# the same on every Restart & Run All.
np.random.seed(12345)

# Feature Engineering

### Generate Example Data

In [None]:
# Synthetic regression problem: the single feature is the exponential of a
# scaled Gaussian draw (so it is strictly positive), and the target is its
# logarithm plus unit Gaussian noise.
x = np.exp(2 * np.random.randn(100, 1))
y = np.log(x[:, 0]) + np.random.randn(len(x))

# First 75 samples are used for training, the remaining 25 for testing.
train_x, test_x = x[:75], x[75:]
train_y, test_y = y[:75], y[75:]

# Feature (column 0) and target ('y') side by side for exploratory plotting.
df = pd.DataFrame(x)
df['y'] = y

### Exploratory Data Analysis

#### Look at the distributions and correlations between input variables and output

In [None]:
# Pairwise plots of the columns of df (the input feature and the target y).
g = seaborn.pairplot(df)
# Resize the whole grid's figure to 10 x 10 inches.
g.fig.set_size_inches(10,10)

#### The input variable is strictly positive and heavily right-skewed, which makes it a good candidate for a log transform

### Fit naive model without feature engineering

In [None]:
# Linear regression estimator. The same instance is fit on the raw feature
# first and later re-fit on the log-transformed feature.
model = LinearRegression()

In [None]:
# Baseline: regress y directly on the raw (untransformed) input.
model.fit(train_x, train_y)
naive_pred = model.predict(test_x)
# mean_squared_error returns the MSE; take the square root so the printed
# number really is the RMSE that the label claims.
print("RMSE: {:.2f}".format(np.sqrt(mean_squared_error(test_y, naive_pred))))

### Fit model with log-transformed features

In [None]:
# Engineered feature: the natural log of the input.
logged_train_x = np.log(train_x)
logged_test_x = np.log(test_x)
# Re-fit the same estimator on the transformed feature; this overwrites the
# coefficients learned from the raw input.
model.fit(logged_train_x, train_y)
logged_pred = model.predict(logged_test_x)
# mean_squared_error returns the MSE; take the square root so the printed
# number really is the RMSE that the label claims.
print("RMSE: {:.2f}".format(np.sqrt(mean_squared_error(test_y, logged_pred))))

In [None]:
# Overlay the held-out targets and both sets of predictions, one line each,
# indexed by position within the test set.
plot(test_y)
plot(naive_pred)
plot(logged_pred)
# Label the axes and title the figure so it can be read on its own.
xlabel('Test sample index')
ylabel('y')
title('Test-set predictions: naive vs. engineered feature')
# Assigning to _ keeps the Legend repr out of the cell output.
_ = legend(['Truth', 'Naive', 'Engineered'])

# Text Feature Extraction

### Example Data

In [None]:
# Two short example documents used to demonstrate the text vectorizers.
X = np.asarray([
    'The movie was really bad',
    'the movie was good',
])

In [None]:
# Bag-of-words term counts: one row per document, one column per term.
vec = CountVectorizer()
X_trans = vec.fit_transform(X).toarray()
# vocabulary_ maps each term to its column index, so ordering the terms by
# that index yields labels that line up with the columns of X_trans.
col_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
df = pd.DataFrame(X_trans, columns=col_names)
df

In [None]:
# TF-IDF weights for the same documents: one row per document, one column
# per term.
vec = TfidfVectorizer()
X_trans = vec.fit_transform(X).toarray()
# vocabulary_ maps each term to its column index, so ordering the terms by
# that index yields labels that line up with the columns of X_trans.
col_names = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
df = pd.DataFrame(X_trans, columns=col_names)
df