# Feature Engineering

[statinfer.com](https://statinfer.com/)

# Data Importing

In [None]:
import pandas as pd

# Location of the kc_house_data CSV (fetched over HTTP on every run)
DATA_URL = (
    "https://raw.githubusercontent.com/venkatareddykonasani/Datasets/"
    "master/kc_house_data/kc_house_data.csv"
)

# Raw frame that every later cell reads from
house_price_data = pd.read_csv(DATA_URL)

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, non-null counts)
house_price_data.info()


1. Print the column names
2. Prepare the train and test data

In [None]:
# Show every column name as a plain NumPy array
house_price_data.columns.to_numpy()

In [None]:
# Everything from the fourth column onward is treated as a predictor
all_column_names = house_price_data.columns.to_numpy()
pred_cols = all_column_names[3:]
print(pred_cols)

In [None]:
from sklearn import model_selection

# Predictors and target for the baseline model
X = house_price_data.loc[:, pred_cols]
y = house_price_data.loc[:, 'price']

# 80% of the rows go to training; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=55,
)

# Shapes in the order: X_train, y_train, X_test, y_test
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)


# Basic Model

## Model1 with all numerical columns

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the predictors selected above
model1 = LinearRegression()
model1.fit(X=X_train, y=y_train)

RSquared and MAPE

In [None]:
# Goodness of fit for model1: R-squared first, then MAPE, each on train and test
import numpy as np
from sklearn import metrics

y_pred_train = model1.predict(X_train)
train_r2 = metrics.r2_score(y_train, y_pred_train)
print("Train RSquared", train_r2)

y_pred_test = model1.predict(X_test)
test_r2 = metrics.r2_score(y_test, y_pred_test)
print("Test RSquared", test_r2)

# MAPE = mean(|actual - predicted| / actual), reported to two decimals
train_abs_pct_err = np.abs(y_train - y_pred_train) / y_train
test_abs_pct_err = np.abs(y_test - y_pred_test) / y_test
print("MAPE on Train data : ", round(np.mean(train_abs_pct_err), 2))
print("MAPE on Test data : ", round(np.mean(test_abs_pct_err), 2))


# Handling Date variables

In [None]:
# Columns that carry date information in the raw data
date_vars = ['date', 'yr_built', 'yr_renovated']

# Work on an explicit copy: new columns are assigned onto this frame in the
# following cell, and assigning onto a bare slice of house_price_data raises
# pandas' SettingWithCopyWarning.
house_price_dates = house_price_data[date_vars].copy()
house_price_dates.head()

In [None]:
# The sale date is a string whose first eight characters are sliced as
# year (0-3), month (4-5) and day (6-7); each piece is converted to int64.
sale_date_text = house_price_dates["date"].str
house_price_dates['sale_year'] = sale_date_text[0:4].astype(np.int64)
house_price_dates['sale_month'] = sale_date_text[4:6].astype(np.int64)
house_price_dates['day_sold'] = sale_date_text[6:8].astype(np.int64)

# Age of the house at the time of sale
house_price_dates['age_of_house'] = house_price_dates['sale_year'].sub(house_price_dates['yr_built'])

# True when a renovation year is recorded (yr_renovated greater than zero)
house_price_dates['Ind_renovated'] = house_price_dates['yr_renovated'].gt(0)
house_price_dates.head()

In [None]:
# Keep only the newly derived variables, then attach them to the full data by index
house_price_dates1 = house_price_dates.drop(columns=date_vars)
house_price_data_dates = house_price_data.join(other=house_price_dates1)
house_price_data_dates.shape

## Model2 with Date columns

In [None]:
from sklearn import model_selection

# Predictors: every column after the first three of the date-enriched frame
col_names = house_price_data_dates.columns.values
x_col_names = col_names[3:]
print(x_col_names)

X = house_price_data_dates.loc[:, x_col_names]
y = house_price_data_dates.loc[:, 'price']

# Hold out 20% of the rows; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=55,
)

# Shapes in the order: X_train, y_train, X_test, y_test
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)


In [None]:
# Import the estimator explicitly: a bare `import sklearn` does not load the
# linear_model submodule, so `sklearn.linear_model` only resolves when some
# earlier cell happened to import it.
from sklearn.linear_model import LinearRegression

# Linear regression on the predictors that include the derived date features
model2 = LinearRegression()
model2.fit(X_train, y_train)

In [None]:
# R-squared of model2 on the training rows and on the held-out rows
from sklearn import metrics

y_pred_train = model2.predict(X_train)
train_r2 = metrics.r2_score(y_train, y_pred_train)
print("Train data R-Squared : ", train_r2)

y_pred_test = model2.predict(X_test)
test_r2 = metrics.r2_score(y_test, y_pred_test)
print("Test data R-Squared : ", test_r2)

In [None]:
# MAPE = mean(|actual - predicted| / actual), using the predictions from the previous cell
train_abs_pct_err = np.abs(y_train - y_pred_train) / y_train
test_abs_pct_err = np.abs(y_test - y_pred_test) / y_test
print("MAPE on Train data : ", round(np.mean(train_abs_pct_err), 2))
print("MAPE on Test data : ", round(np.mean(test_abs_pct_err), 2))

# Handling Geolocation (Lat-Long) Variables

In [None]:
# House price versus longitude and latitude
import matplotlib.pyplot as plt

# Boolean flag per house: True when its price is above the 70th percentile
price_cutoff = house_price_data["price"].quantile(0.7)
bubble_col = house_price_data["price"] > price_cutoff

# One point per house, coloured by the high-price flag
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(
    house_price_data["long"],
    house_price_data["lat"],
    c=bubble_col,
    cmap="RdYlGn",
    s=10,
)
ax.set_title('House Price vs Longitude and Latitude', fontsize=20)
ax.set_xlabel('Longitude', fontsize=15)
ax.set_ylabel('Latitude', fontsize=15)
plt.show()


## High price houses center

In [None]:
# Mean coordinates of the houses flagged as high-priced
# NOTE(review): the flag and these means are computed from every row's price,
# before any train/test split - confirm that this is acceptable for the models
# that later use the derived distance as a predictor.
high_long_mean = house_price_data.loc[bubble_col, "long"].mean()
high_lat_mean = house_price_data.loc[bubble_col, "lat"].mean()

fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(
    house_price_data["long"],
    house_price_data["lat"],
    c=bubble_col,
    cmap="RdYlGn",
    s=10,
)
# Large black marker at the high-price centre
ax.scatter(high_long_mean, high_lat_mean, c="black", s=1000)

ax.set_title('House Price vs Longitude and Latitude', fontsize=20)
ax.set_xlabel('Longitude', fontsize=15)
ax.set_ylabel('Latitude', fontsize=15)
plt.show()

Distance from high priced houses center to every house

In [None]:
# Straight-line distance, in raw longitude/latitude units, from the
# high-price centre to every house. Stored as a new column on house_price_data.
long_offset = house_price_data["long"] - high_long_mean
lat_offset = house_price_data["lat"] - high_lat_mean
house_price_data["High_cen_distance"] = np.sqrt(long_offset ** 2 + lat_offset ** 2)

fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(house_price_data["High_cen_distance"], np.log(house_price_data["price"]))
ax.set_title('House Price vs Distance from center', fontsize=20)
ax.set_xlabel('Distance from center', fontsize=15)
ax.set_ylabel('log(house price)', fontsize=15)


## Model3 With Geo variable treatment

In [None]:
# Model3: the original predictors plus High_cen_distance.
from sklearn import model_selection
# Explicit import: a bare `import sklearn` does not load the linear_model submodule.
from sklearn.linear_model import LinearRegression

col_names = house_price_data.columns.values
print(col_names)

# Predictors are taken by position (everything after the first three columns).
# Later cells append more columns to house_price_data: log_price is a function
# of the target and Seattle_distance belongs to the next experiment. They are
# dropped by name so re-running this cell out of order cannot feed them into
# this model. On a top-to-bottom run they do not exist yet and nothing changes.
x_col_names = (
    house_price_data.columns[3:]
    .drop(["log_price", "Seattle_distance"], errors="ignore")
    .values
)
print(x_col_names)

X = house_price_data[x_col_names]
y = house_price_data['price']

# Hold out 20% of the rows; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=55)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

model3 = LinearRegression()
model3.fit(X_train, y_train)




In [None]:
# Fit quality of model3: R-squared, then MAPE, on train and test
from sklearn import metrics

y_pred_train = model3.predict(X_train)
train_r2 = metrics.r2_score(y_train, y_pred_train)
print("Train data R-Squared : ", train_r2)

y_pred_test = model3.predict(X_test)
test_r2 = metrics.r2_score(y_test, y_pred_test)
print("Test data R-Squared : ", test_r2)

# MAPE = mean(|actual - predicted| / actual), reported to two decimals
train_abs_pct_err = np.abs(y_train - y_pred_train) / y_train
test_abs_pct_err = np.abs(y_test - y_pred_test) / y_test
print("MAPE on Train data : ", round(np.mean(train_abs_pct_err), 2))
print("MAPE on Test data : ", round(np.mean(test_abs_pct_err), 2))


## Seattle City indicator

In [None]:
# Reference point: Seattle, Washington State's largest city
Seattle_Longitude = -122.335167
Seattle_Latitude = 47.608013

# Straight-line distance, in raw longitude/latitude units, from Seattle to
# every house. Stored as a new column on house_price_data.
long_offset = house_price_data["long"] - Seattle_Longitude
lat_offset = house_price_data["lat"] - Seattle_Latitude
house_price_data["Seattle_distance"] = np.sqrt(long_offset ** 2 + lat_offset ** 2)

fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(house_price_data["Seattle_distance"], np.log(house_price_data["price"]))
ax.set_title('House Price vs Distance from Seattle', fontsize=20)
ax.set_xlabel('Distance from Seattle', fontsize=15)
ax.set_ylabel('log(house price)', fontsize=15)


In [None]:
# Refit model3 with Seattle_distance among the predictors, then report
# R-squared and MAPE on train and test.
from sklearn import metrics, model_selection
# Explicit import: a bare `import sklearn` does not load the linear_model submodule.
from sklearn.linear_model import LinearRegression

col_names = house_price_data.columns.values
print(col_names)

# Predictors are taken by position (everything after the first three columns).
# A later cell appends log_price, which is a function of the target; it is
# dropped by name so re-running this cell out of order cannot leak the target
# into X. On a top-to-bottom run the column does not exist yet.
x_col_names = house_price_data.columns[3:].drop(["log_price"], errors="ignore").values
print(x_col_names)

X = house_price_data[x_col_names]
y = house_price_data['price']

# Hold out 20% of the rows; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=55)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

model3 = LinearRegression()
model3.fit(X_train, y_train)

# R-squared on the training rows
y_pred_train = model3.predict(X_train)
print("Train data R-Squared : ", metrics.r2_score(y_train, y_pred_train))

# R-squared on the held-out rows
y_pred_test = model3.predict(X_test)
print("Test data R-Squared : ", metrics.r2_score(y_test, y_pred_test))

# MAPE = mean(|actual - predicted| / actual), reported to two decimals
print("MAPE on Train data : ", round(np.mean(np.abs(y_train - y_pred_train) / y_train), 2))
print("MAPE on Test data : ", round(np.mean(np.abs(y_test - y_pred_test) / y_test), 2))



In [None]:
#Seattle_Longitude=	-122.335167
#Seattle_Latitude=	47.608013
#!pip install geopy
#from geopy.geocoders import Nominatim
#geolocator = Nominatim(user_agent="geoapiExercises")
#def city_state_country(coord):
#    location = geolocator.reverse(coord, exactly_one=True)
#    address = location.raw['address']
#    city = address.get('city', '')
#    state = address.get('state', '')
#    country = address.get('country', '')
#    return city, state, country

# Transformations

In [None]:
# Histogram (with a density curve) of the target variable
import seaborn as sns

plt.figure(figsize=(10, 10))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement seaborn documents for the same picture.
sns.histplot(house_price_data["price"], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('House Price distribution', fontsize=20)
plt.show()

In [None]:
# Log transformation of the target, stored as a new column on house_price_data
house_price_data["log_price"] = np.log(house_price_data["price"])

plt.figure(figsize=(10, 10))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement seaborn documents for the same picture.
sns.histplot(house_price_data["log_price"], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('log(House Price) distribution', fontsize=20)
plt.show()


## Model4 with transformations

In [None]:
# Model4: same kind of linear model, but fitted to the log-transformed target
from sklearn import model_selection
# Explicit import: a bare `import sklearn` does not load the linear_model submodule.
from sklearn.linear_model import LinearRegression

# Predictors are listed by name here, so columns added to house_price_data by
# earlier cells (distances, log_price) are not picked up.
X = house_price_data[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']]

# Target on the log scale
y = house_price_data['log_price']

# Hold out 20% of the rows; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=55)

model4 = LinearRegression()
model4.fit(X_train, y_train)



In [None]:
# Fit quality of model4. R-squared is reported on the log scale the model was
# trained on; MAPE is reported on the original price scale.
from sklearn import metrics

# R-squared on the training rows (log scale)
y_pred_train = model4.predict(X_train)
print("Train data R-Squared : ", metrics.r2_score(y_train, y_pred_train))

# R-squared on the held-out rows (log scale)
y_pred_test = model4.predict(X_test)
print("Test data R-Squared : ", metrics.r2_score(y_test, y_pred_test))

# The target of this model is log(price). A percentage error taken directly on
# log values is not comparable with the MAPE of the price-scale models, so
# actuals and predictions are mapped back to prices with exp() first.
price_train, price_pred_train = np.exp(y_train), np.exp(y_pred_train)
price_test, price_pred_test = np.exp(y_test), np.exp(y_pred_test)

print("MAPE on Train data : ", round(np.mean(np.abs(price_train - price_pred_train) / price_train), 4))
print("MAPE on Test data : ", round(np.mean(np.abs(price_test - price_pred_test) / price_test), 4))


# One hot encoding

In [None]:
# Display the current column index, including columns added by earlier cells
house_price_data.columns

In [None]:
# Columns treated as categorical in this section; print the frequency of each level
categorical_cols = ['waterfront', 'view', 'condition', 'grade', 'zipcode']
for cat_col in categorical_cols:
    level_counts = house_price_data[cat_col].value_counts()
    print(level_counts)

In [None]:
# One indicator column per distinct zipcode value
# (the same call can be tried on 'view', 'condition' and 'grade')
zipcode_values = house_price_data['zipcode']
one_hot_data = pd.get_dummies(zipcode_values)

# Peek at ten randomly chosen rows of the encoded frame
print("one_hot_data \n", one_hot_data.sample(n=10))

In [None]:
# Place the indicator columns side by side with the main data (aligned on the index)
frames_to_merge = [house_price_data, one_hot_data]
house_price_with_dummy = pd.concat(frames_to_merge, axis="columns")
house_price_with_dummy.head()

## Model5 with one hot encoded values

In [None]:
# Model5: named predictors plus the one-hot encoded zipcode columns.
# The target here is the raw price, not log_price.
from sklearn import model_selection
# Explicit import: a bare `import sklearn` does not load the linear_model submodule.
from sklearn.linear_model import LinearRegression

prev_cols = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
encoded_cols = list(one_hot_data.columns.values)

# NOTE(review): 'zipcode' is kept as a numeric column in prev_cols and is also
# present as indicator columns in encoded_cols - confirm that both are wanted.
all_pred_cols = prev_cols + encoded_cols

X = house_price_with_dummy[all_pred_cols]
X.columns = X.columns.astype(str)  # converting numeric column names into strings

y = house_price_with_dummy['price']

# Hold out 20% of the rows; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=55)

model5 = LinearRegression()
model5.fit(X_train, y_train)

In [None]:
# Fit quality of model5: R-squared, then MAPE, on train and test
from sklearn import metrics

y_pred_train = model5.predict(X_train)
train_r2 = metrics.r2_score(y_train, y_pred_train)
print("Train data R-Squared : ", train_r2)

y_pred_test = model5.predict(X_test)
test_r2 = metrics.r2_score(y_test, y_pred_test)
print("Test data R-Squared : ", test_r2)

# MAPE = mean(|actual - predicted| / actual), reported to four decimals
train_abs_pct_err = np.abs(y_train - y_pred_train) / y_train
test_abs_pct_err = np.abs(y_test - y_pred_test) / y_test
print("MAPE on Train data : ", round(np.mean(train_abs_pct_err), 4))
print("MAPE on Test data : ", round(np.mean(test_abs_pct_err), 4))


# Binning

## sqft_living vs price

In [None]:
# Living area against log(price)
plt.figure(figsize=(10, 10))
plt.scatter(house_price_data["sqft_living"], np.log(house_price_data["price"]))
plt.title('House Price vs sqft_living', fontsize=20)
plt.xlabel('sqft_living', fontsize=15)
# The y values are log prices; label the axis as the other distance plots do
plt.ylabel('log(house price)', fontsize=15)
plt.show()

## Creating bins

In [None]:
# Cut sqft_living into ten quantile-based intervals and store the interval of each row
living_area = house_price_with_dummy["sqft_living"]
house_price_with_dummy['bins'] = pd.qcut(x=living_area, q=10)

# Row count per interval, listed in interval order rather than by frequency
house_price_with_dummy['bins'].value_counts(sort=False)

In [None]:
# Indicator columns for the sqft_living intervals
bins_one_hot = pd.get_dummies(house_price_with_dummy['bins'])
bins_cols = bins_one_hot.columns.tolist()

# Attach the indicators to the frame that already holds the zipcode dummies
data_with_bins_dummy = pd.concat([house_price_with_dummy, bins_one_hot], axis="columns")

# Full predictor list: named columns, zipcode dummies, then the bin dummies
all_pred_cols = [*prev_cols, *encoded_cols, *bins_cols]

## Model6 with Bins

In [None]:
# Model6: named predictors, zipcode dummies and sqft_living bin dummies
from sklearn import model_selection
# Explicit import: a bare `import sklearn` does not load the linear_model submodule.
from sklearn.linear_model import LinearRegression

X = data_with_bins_dummy[all_pred_cols].copy()
# The selected labels mix strings, numeric zipcode labels and the interval
# labels of the bins. Recent scikit-learn releases raise a TypeError when the
# feature names are not all strings, so convert them as the Model5 cell does.
X.columns = X.columns.astype(str)

y = data_with_bins_dummy['price']

# Hold out 20% of the rows; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=55)

model6 = LinearRegression()
model6.fit(X_train, y_train)



In [None]:
# Fit quality of model6: R-squared, then MAPE, on train and test
from sklearn import metrics

y_pred_train = model6.predict(X_train)
train_r2 = metrics.r2_score(y_train, y_pred_train)
print("Train data R-Squared : ", train_r2)

y_pred_test = model6.predict(X_test)
test_r2 = metrics.r2_score(y_test, y_pred_test)
print("Test data R-Squared : ", test_r2)

# MAPE = mean(|actual - predicted| / actual), reported to four decimals
train_abs_pct_err = np.abs(y_train - y_pred_train) / y_train
test_abs_pct_err = np.abs(y_test - y_pred_test) / y_test
print("MAPE on Train data : ", round(np.mean(train_abs_pct_err), 4))
print("MAPE on Test data : ", round(np.mean(test_abs_pct_err), 4))
