# BONUS PROJECT: Data Preprocessing & Feature Engineering for Machine Learning

## Data Import and First Inspection

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the housing dataset from a CSV file in the working directory.
df = pd.read_csv("housing.csv")

In [None]:
df

__Features__:

* **longitude:** geographic coordinate (district's east-west position)
* **latitude:** geographic coordinate (district's north-south position)
* **housing_median_age:** median age of houses in district
* **total_rooms:** sum of all rooms in district
* **total_bedrooms:** sum of all bedrooms in district
* **population:** total population in district
* **households:** total households in district
* **median_income:** median household income in district
* **median_house_value:** median house value in district
* **ocean_proximity:** district's proximity to the ocean

In [None]:
df.info()

In [None]:
# Show the rows in which total_bedrooms is missing.
df[df.total_bedrooms.isna()]

In [None]:
# Show rows that are exact duplicates of an earlier row (empty if there are none).
df[df.duplicated()]

In [None]:
df.describe()

In [None]:
df.describe(include = "O")

In [None]:
df.ocean_proximity.value_counts()

In [None]:
df.total_rooms.value_counts()

In [None]:
# One histogram per numeric column, 50 bins each, drawn on a large canvas.
df.hist(figsize=(20, 15), bins=50)
plt.show()

## Data Cleaning and Creating additional Features

In [None]:
df.info()

In [None]:
# Remove every row that contains at least one missing value; df is modified in place.
df.dropna(inplace = True)

In [None]:
# New feature: rooms per household (row-wise ratio of two existing columns).
df["rooms_per_household"] = df["total_rooms"] / df["households"]

In [None]:
df.rooms_per_household.nlargest(10)

In [None]:
df.rooms_per_household.nsmallest(10)

In [None]:
# Inspect three specific rows by index label.
# NOTE(review): the labels are hard-coded — presumably taken from the
# nlargest/nsmallest output; .loc raises KeyError if any label is no longer
# present (e.g. removed by dropna). Confirm against the current data.
df.loc[[1979, 5916, 8219]]

In [None]:
# New feature: population per household.
df["pop_per_household"] = df["population"] / df["households"]

In [None]:
# New feature: share of bedrooms among all rooms.
df["bedrooms_per_room"] = df["total_bedrooms"] / df["total_rooms"]

In [None]:
df.describe()

## Which Factors influence House Prices?

In [None]:
df

In [None]:
# Distribution of median_house_value with a fine (100-bin) resolution.
df["median_house_value"].hist(figsize=(12, 8), bins=100)
plt.show()

In [None]:
# Correlation of every numeric column with median_house_value, strongest
# positive relationship first.
# Non-numeric columns (ocean_proximity is a string column) are excluded
# explicitly: DataFrame.corr() raises on them in pandas >= 2.0 instead of
# dropping them silently, so selecting the numeric columns up front keeps this
# cell working on old and new pandas versions alike.
(
    df.select_dtypes(include="number")
    .corr()["median_house_value"]
    .sort_values(ascending=False)
)

In [None]:
# Distribution of median_income with a fine (100-bin) resolution.
df["median_income"].hist(figsize=(12, 8), bins=100)
plt.show()

In [None]:
# Income vs. house value: scatter with a fitted regression line plus the
# marginal distributions.
sns.set(font_scale=1.5)
sns.jointplot(
    x="median_income",
    y="median_house_value",
    data=df,
    kind="reg",
    height=10,
)
plt.show()

In [None]:
# Income vs. house value as a kernel density estimate plus the marginal
# distributions.
sns.set(font_scale=1.5)
sns.jointplot(
    x="median_income",
    y="median_house_value",
    data=df,
    kind="kde",
    height=10,
)
plt.show()

In [None]:
# Geographic scatter plot: marker size follows population (scaled down by
# 100), marker colour follows median_house_value.
df.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=df["population"] / 100,
    c="median_house_value",
    cmap="coolwarm",
    colorbar=True,
    alpha=0.4,
    label="Population",
    figsize=(15, 10),
    fontsize=15,
    sharex=False,
)
plt.xlabel("Longitude", fontsize=14)
plt.ylabel("Latitude", fontsize=14)
plt.legend(fontsize=16)
plt.show()

In [None]:
import matplotlib.image as mpimg
# Read the background map image from the working directory into an array.
california_img = mpimg.imread("california.png")

In [None]:
california_img

In [None]:
plt.figure(figsize = (15, 10))
plt.imshow(california_img)
plt.show()

In [None]:
# Draw the map again, this time stretched onto explicit axis limits.
# extent is given as [left, right, bottom, top] in data coordinates —
# presumably the longitude/latitude bounds covered by the image; verify
# against the image source.
plt.figure(figsize = (15, 10))
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05])
plt.show()

In [None]:
# Geographic scatter plot (size = population / 100, colour = house value)
# drawn first, then the semi-transparent map image is laid over the same axes.
df.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=df["population"] / 100,
    c="median_house_value",
    cmap="coolwarm",
    colorbar=True,
    alpha=0.4,
    label="Population",
    figsize=(15, 10),
    fontsize=20,
    sharex=False,
)
plt.imshow(
    california_img,
    extent=[-124.55, -113.80, 32.45, 42.05],
    alpha=0.5,
    cmap=plt.get_cmap("jet"),
)
plt.xlabel("Longitude", fontsize=14)
plt.ylabel("Latitude", fontsize=14)
plt.title("House Prices in California", fontsize=20)
plt.legend(fontsize=16)
plt.show()

In [None]:
# Distinct ocean_proximity values, in order of first appearance.
prox = df["ocean_proximity"].unique()
prox

In [None]:
# Independent copy of the rows belonging to the third ocean_proximity value.
# NOTE(review): prox[2] is positional and depends on the order in which the
# categories first appear in the data — confirm which category is intended.
df_loc = df.loc[df["ocean_proximity"] == prox[2]].copy()

In [None]:
# Same map overlay as for the full data, restricted to the rows in df_loc.
df_loc.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=df_loc["population"] / 100,
    c="median_house_value",
    cmap="coolwarm",
    colorbar=True,
    alpha=0.4,
    label="Population",
    figsize=(15, 10),
    fontsize=20,
    sharex=False,
)
plt.imshow(
    california_img,
    extent=[-124.55, -113.80, 32.45, 42.05],
    alpha=0.5,
    cmap=plt.get_cmap("jet"),
)
plt.xlabel("Longitude", fontsize=14)
plt.ylabel("Latitude", fontsize=14)
plt.legend(fontsize=16)
plt.show()

## Advanced Exploratory Data Analysis with Seaborn

In [None]:
df

In [None]:
# Histogram of median_income (50 bins) with a title.
df["median_income"].hist(figsize=(15, 10), bins=50)
plt.title("Median Income")
plt.show()

In [None]:
pd.qcut(df.median_income, q = [0, 0.25, 0.5, 0.75, 0.95, 1])

In [None]:
# Bucket median_income into five labelled quantile bands:
# 0-25 %, 25-50 %, 50-75 %, 75-95 % and the top 5 %.
df["income_cat"] = pd.qcut(
    df["median_income"],
    q=[0, 0.25, 0.5, 0.75, 0.95, 1],
    labels=["Low", "Below_Average", "Above_Average", "High", "Very High"],
)

In [None]:
df.income_cat

In [None]:
df.income_cat.value_counts(normalize = True)

In [None]:
# Number of rows per income category, split by ocean_proximity.
plt.figure(figsize=(12, 8))
sns.set(palette="viridis", font_scale=1.5)
sns.countplot(x="income_cat", hue="ocean_proximity", data=df)
plt.legend(loc=1)  # location code 1 = upper right
plt.show()

In [None]:
# Bar chart of median_house_value for each income category.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1.5)
sns.barplot(x="income_cat", y="median_house_value", data=df, dodge=True)
plt.show()

In [None]:
# Bar chart of median_house_value for each ocean_proximity value.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1.5)
sns.barplot(x="ocean_proximity", y="median_house_value", data=df, dodge=True)
plt.show()

In [None]:
# Mean house value for every (income category, ocean proximity) pair, reshaped
# so that income categories are rows and proximity values are columns. The
# ISLAND column is then removed.
# income_cat is categorical, so `observed` is passed explicitly: relying on the
# implicit default emits a FutureWarning in pandas >= 2.1 and the default
# changes in pandas 3.0. observed=False keeps the behaviour this cell has
# always had (every income category appears as a row).
matrix = (
    df.groupby(["income_cat", "ocean_proximity"], observed=False)["median_house_value"]
    .mean()
    .unstack()
    .drop(columns=["ISLAND"])
)

In [None]:
matrix.astype("int")

In [None]:
# Annotated heatmap of the mean-value matrix, shown as whole numbers on a
# fixed colour scale.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1.4)
sns.heatmap(
    matrix.astype("int"),
    vmin=90000,
    vmax=470000,
    cmap="Reds",
    annot=True,
    fmt="d",
)
plt.show()

## Feature Engineering - Part 1

In [None]:
# Target variable: an independent copy of the median_house_value column.
label = df["median_house_value"].copy()
label

In [None]:
# Feature table: everything in df except the target column.
features = df.drop(columns="median_house_value")
features

In [None]:
features.info()

In [None]:
features.select_dtypes("float")

In [None]:
import scipy.stats as stats

In [None]:
# Standardise every float column; apply() hands stats.zscore one column at a
# time, so no wrapper function is needed.
# NOTE(review): the scaling statistics are computed on the full dataset, i.e.
# before the train/test split further down — confirm that this is acceptable.
feat1 = features.select_dtypes("float").apply(stats.zscore)
feat1

In [None]:
# Global pandas display option: render floats with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Sanity check of the standardisation: per-column mean and standard deviation.
# pandas' std defaults to ddof=1 while scipy's zscore defaults to ddof=0, so
# the reported std is only approximately 1.
feat1.agg(["mean", "std"])

## Feature Engineering - Part 2

In [None]:
features.ocean_proximity

In [None]:
features.ocean_proximity.value_counts()

In [None]:
# One-hot encode ocean_proximity: one indicator column per distinct value.
dummies = pd.get_dummies(features["ocean_proximity"])
dummies

In [None]:
# Final feature table: standardised float columns, the one-hot columns and
# income_cat, placed side by side and aligned on the index.
features = pd.concat([feat1, dummies, df["income_cat"]], axis="columns")
features

## Splitting the Data into Train and Test Set

In [None]:
features

In [None]:
test_size = 0.2

In [None]:
# Test set: a reproducible random sample holding test_size of all rows.
X_test = features.sample(random_state=123, frac=test_size)

In [None]:
X_test

In [None]:
X_test.income_cat.value_counts(normalize = True)

In [None]:
features.income_cat.value_counts(normalize = True)

In [None]:
X_test.index

In [None]:
# Training set: every row whose index label was not drawn into X_test.
held_out = features.index.isin(X_test.index)
X_train = features.loc[~held_out].copy()

In [None]:
X_train

In [None]:
X_train.income_cat.value_counts(normalize = True)

In [None]:
# Shuffle the training rows reproducibly (frac=1 returns every row, reordered).
X_train = X_train.sample(random_state=123, frac=1)
X_train

In [None]:
# income_cat is removed from the training features.
X_train = X_train.drop(columns="income_cat")

In [None]:
# income_cat is removed from the test features.
X_test = X_test.drop(columns="income_cat")

In [None]:
# Pick the target values for each split by index label, so they line up
# row-for-row with X_train and X_test.
y_train, y_test = label.loc[X_train.index], label.loc[X_test.index]

In [None]:
y_train

## Training the ML Model (Random Forest Regressor)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 500 trees; the fixed random_state makes training
# reproducible.
forest_reg = RandomForestRegressor(
    n_estimators=500,
    max_depth=75,
    max_features="sqrt",
    min_samples_split=2,
    random_state=42,
)

In [None]:
forest_reg.fit(X_train, y_train)

In [None]:
# Model score on the data it was trained on (for a regressor, score() is R^2).
forest_reg.score(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
pred = forest_reg.predict(X_train)
pred

In [None]:
# Root mean squared error of the training-set predictions.
forest_mse = mean_squared_error(y_true=y_train, y_pred=pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

## Evaluating the Model on the Test Set

In [None]:
forest_reg

In [None]:
# Model score on the held-out test data (for a regressor, score() is R^2).
forest_reg.score(X_test, y_test)

In [None]:
pred = forest_reg.predict(X_test)
pred

In [None]:
# Root mean squared error of the test-set predictions.
forest_mse = mean_squared_error(y_true=y_test, y_pred=pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# Side-by-side table of true and predicted values; the index comes from y_test.
comp = pd.DataFrame({"True_V": y_test, "Pred": pred})
comp

In [None]:
# Absolute error of each prediction.
ae = (comp["True_V"] - comp["Pred"]).abs()
ae

In [None]:
mae = ae.mean()
mae

## Feature Importance

In [None]:
forest_reg.feature_importances_

In [None]:
# Label each importance with its feature name, then order from most to least
# important.
feature_imp = pd.Series(forest_reg.feature_importances_, index=X_train.columns)
feature_imp = feature_imp.sort_values(ascending=False)

In [None]:
feature_imp

In [None]:
# Horizontal bar chart of the importances; the ascending sort puts the most
# important feature at the top of the chart.
feature_imp.sort_values().plot(kind="barh", figsize=(12, 8))
plt.show()