In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load train.csv into the working frame `train`.
# NOTE(review): the path is absolute and specific to one machine, so the notebook
# cannot be re-run elsewhere without editing it; consider a configurable data directory.
train = pd.read_csv('E:/Nehali/MS DS/Machine Learning - Daming Li/Nehali/Midterm/house-prices-advanced-regression-techniques/train.csv')

### First, let's look at the summary statistics of the numerical data to get a sense of the data:

In [None]:
# Summary statistics from describe(), transposed so each feature is a row.
train.describe().T

### We plot a heatmap to measure multicollinearity in our features

In [None]:
# Heatmap of absolute pairwise correlations between features (Id excluded).
# `train` still holds object (string) columns at this point; DataFrame.corr()
# raises on those in pandas >= 2.0 instead of silently skipping them, so the
# numeric/bool columns are selected explicitly to work on every pandas version.
numeric_train = train.drop(columns='Id').select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 12))
sns.heatmap(numeric_train.corr().abs(), cmap='YlGnBu')

### Some feature pairs are so highly correlated that we can drop one feature from each pair to improve our model:

In [None]:
# Numeric columns to discard from `train`.
remove_num = [
    'GarageArea',
    'TotalBsmtSF',
    'GarageYrBlt',
    'TotRmsAbvGrd',
]
train = train.drop(columns=remove_num)

### Now, let's examine categorical features by plotting their distributions:

In [None]:
# Plot the category frequencies of every object-typed column, one bar chart each.
categorical_vars = train.columns[train.dtypes == 'object']

# Size the grid from the number of columns rather than hard-coding 10x5, so the
# cell keeps working when columns are added or removed upstream.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(categorical_vars) // n_grid_cols))  # ceiling division
f, ax = plt.subplots(n_grid_rows, n_grid_cols,
                     figsize=(10 * n_grid_cols, 5 * n_grid_rows),
                     squeeze=False)  # always 2-D, even with a single row

for i, c in enumerate(categorical_vars):
    counts = train[c].value_counts()
    # Pass labels and heights directly: the column names produced by
    # value_counts().reset_index() changed in pandas 2.0, which broke x='index', y=c.
    g = sns.barplot(x=counts.index.tolist(), y=counts.values,
                    ax=ax[i // n_grid_cols, i % n_grid_cols])
    g.set(xticks=[])
    g.set(title=c)

# Hide the axes left over at the end of the grid.
for unused_ax in ax.ravel()[len(categorical_vars):]:
    unused_ax.set_visible(False)

#### We can probably drop features with one overwhelmingly common category:

In [None]:
# Categorical columns to discard from `train`.
remove_cat = [
    'Street', 'LandContour', 'Utilities', 'LandSlope',
    'Condition1', 'Condition2', 'RoofMatl', 'BsmtCond',
    'BsmtFinType2', 'Heating', 'GarageCond', 'GarageQual',
    'Functional',
]
train = train.drop(columns=remove_cat)

### Now, let's do one-hot encodings for our categorical features:

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Object-typed (categorical) columns to encode; computed once and reused below.
object_cols = train.columns[train.dtypes == 'object']

# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`; an unknown
# keyword raises TypeError, so fall back to the old spelling on older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='first')
data_obj = ohe.fit_transform(train[object_cols])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(ohe, 'get_feature_names_out'):
    encoded_names = ohe.get_feature_names_out(object_cols)
else:
    encoded_names = ohe.get_feature_names(object_cols)

# Reuse train's index so the column-wise concat below aligns row-for-row even if
# `train` does not carry a default 0..n-1 index.
obj_df = pd.DataFrame(data_obj, columns=encoded_names, index=train.index)
cat_columns = obj_df.columns

# Replace the object columns with their one-hot encoded counterparts.
train = pd.concat([train[train.columns[train.dtypes != 'object']],
                   obj_df],
                  axis=1)

### Let's check for any low variance features and drop them:
#### First we must normalize:

In [None]:
# Min-max scale every column of `train`: each column's minimum maps to 0 and its
# maximum maps to 1.
col_min = train.min()
col_span = train.max() - col_min
train = (train - col_min) / col_span

In [None]:
# Drop columns whose variance (after min-max scaling) falls below the threshold.
VAR_THRESHOLD = .001
low_var = train.columns[train.var() < VAR_THRESHOLD]
# Never drop the identifier or the target: later cells select both by name.
low_var = low_var[~low_var.isin(['Id', 'SalePrice'])]
print(low_var)
# `columns=` already names the axis; the original's extra `axis=0` was contradictory.
train = train.drop(columns=low_var)

### In order to run models, we must remove any NaN values:
#### First let's check which features have NaNs:

In [None]:
# Names of the columns that contain at least one NaN.
train.columns[train.isna().any()].tolist()

#### In these 2 cases, we can interpret any NaN values as 0:

In [None]:
# Replace every remaining NaN with 0.
# NOTE(review): `train` was min-max scaled in an earlier cell, so 0 here stands for
# each column's minimum raw value rather than a raw 0 — confirm that is intended.
train = train.fillna(0)

### Finding the best features can be done in numerous ways; we went with 3:
### - Extracting feature importance through Random Forest model
### - Checking correlation with target feature
### - Discussing real-life importance of features

#### Random Forest analysis:

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on every column except the identifier and the target,
# then plot the ten largest feature importances.
y = train['SalePrice']
X = train.drop(columns=['Id', 'SalePrice'])

model = RandomForestRegressor(max_depth=10, random_state=1)
model.fit(X, y)

features = X.columns
importances = model.feature_importances_
indices = np.argsort(importances)[-10:]  # positions of the 10 largest importances, ascending

bar_positions = range(len(indices))
top_feature_names = [features[i] for i in indices]

plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, top_feature_names)
plt.xlabel('Relative Importance')
plt.show()

### Now, let's examine the best correlations with the target:

In [None]:
# Pearson correlation of every column with SalePrice, strongest positive first;
# the cell displays the top 15 rows.
corr_vals = train.corr(method='pearson')
sorted_corr_vals = corr_vals.loc[:, 'SalePrice'].sort_values(ascending=False)
temp = sorted_corr_vals.to_frame(name='corr')
temp.head(15)

### Using these methods, along with group discussion of the real-world impact these features likely have on sale price, we arrived at the following features to try:
- Overall Quality 
- GrLivArea
- LotArea 
- GarageCars
- FullBath 
- 1stFlrSF
- MasVnrArea 
- ExterQual
- YearBuilt  
- Neighborhood

In [None]:
# Collect the one-hot columns generated for the Neighborhood and ExterQual
# categoricals (matched by substring on the column name).
Neighborhood = [col for col in train.columns if 'Neighborhood' in col]
ExterQual = [col for col in train.columns if 'ExterQual' in col]

# Final candidate feature list: eight numeric columns plus the one-hot groups.
# NOTE(review): the markdown list preceding this cell names 1stFlrSF, but the code
# selects '2ndFlrSF' — confirm which one was intended.
features = ['OverallQual', 'GarageCars', 'FullBath', 
            'YearBuilt', '2ndFlrSF', 'MasVnrArea', 
            'LotArea', 'GrLivArea'] + Neighborhood + ExterQual

# Display the selected feature columns.
train[features]

### Now let's create some new features! One way to do this is to combine several existing columns into a single derived column.
### For this, we went with: 
- PorchDeckRank (sum of the wood-deck and porch areas: WoodDeckSF + OpenPorchSF + EnclosedPorch + 3SsnPorch + ScreenPorch; min-max scaled later)
- TotRmsAbvGrdBath (TotRmsAbvGrd + FullBath + HalfBath)
- RelAge (YrSold - YearRemodAdd + MoSold/12)

In [None]:
# Re-read the raw CSV: the engineered columns are built from raw (unscaled) values,
# and TotRmsAbvGrd was dropped from `train` earlier.
data = pd.read_csv('E:/Nehali/MS DS/Machine Learning - Daming Li/Nehali/Midterm/house-prices-advanced-regression-techniques/train.csv')

room_cols = ['TotRmsAbvGrd', 'FullBath', 'HalfBath']
porch_cols = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

# Years since the last remodel at sale time, plus the sale month as a fraction of a year.
train['RelAge'] = (data['YrSold'] - data['YearRemodAdd']) + data['MoSold'] / 12
# Rooms above grade plus full and half bathrooms.
train['TotRmsAbvGrdBath'] = data[room_cols].sum(axis=1)
# Total deck and porch area.
train['PorchDeckRank'] = data[porch_cols].sum(axis=1)

new_features = ['RelAge', 'TotRmsAbvGrdBath', 'PorchDeckRank']

### Another way to add new columns is through PCA (principal component analysis), where we compress variance into fewer columns, reducing dimensionality:

In [None]:
from sklearn.decomposition import PCA

# Project the selected features onto their principal components.
pca = PCA()
# Carry train's index on the component frame so that assigning its columns back
# to `train` later aligns row-for-row.
x_pca = pd.DataFrame(pca.fit_transform(train[features]), index=train.index)

# Share of total variance explained by each component. The original cell also had
# a bare `explained_variance` expression mid-cell, which displayed nothing.
explained_variance = pca.explained_variance_ratio_
# Cell output: cumulative explained variance across components.
np.cumsum(explained_variance)

### The first two columns contain almost half of the overall variance information of the features, and can be used as extra features in our models:

In [None]:
# Attach the first two principal components to `train` as new columns.
train['PCA1'] = x_pca[0]
train['PCA2'] = x_pca[1]

#### Let's normalize these new features and add them to our feature list:

In [None]:
# Register the two PCA columns. `+=` mutates the list in place, so guard against
# duplicates: otherwise every re-run of this cell would append the names again.
new_features += [name for name in ('PCA1', 'PCA2') if name not in new_features]

# Min-max scale the engineered columns so each one's minimum maps to 0 and its
# maximum to 1.
new_min = train[new_features].min()
new_max = train[new_features].max()
train[new_features] = (train[new_features] - new_min) / (new_max - new_min)

# Extend the model feature list, again without introducing duplicates on re-run.
features += [name for name in new_features if name not in features]

## Now, in order to evaluate our models, let's break our dataset into training and testing data:

In [None]:
from sklearn.model_selection import train_test_split

# Design matrix and target, then a seeded 80/20 train/test split.
y = train['SalePrice']
X = train[features]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1,
)

#### Let's use the following metrics so that all models are compared on a common basis:

In [None]:
from sklearn import metrics
# The model cells report these four numbers, computed from (y_test, predictions):
#   Mean Absolute Error      -> metrics.mean_absolute_error
#   Mean Squared Error       -> metrics.mean_squared_error
#   Root Mean Squared Error  -> np.sqrt of the mean squared error
#   R-squared                -> metrics.r2_score

### Support Vector Machine for Regression (SVR)

In [None]:
from sklearn.svm import SVR

# Baseline support-vector regressor with a linear kernel, fitted on the training split.
# NOTE(review): scikit-learn documents `gamma` as a coefficient for the rbf/poly/sigmoid
# kernels, so it should have no effect while kernel='linear' — confirm. It does apply
# once the kernel is switched to 'poly' in a later cell.
svr = SVR(kernel='linear', gamma='scale')
svr.fit(X_train, y_train)

In [None]:
# Predictions of the fitted SVR on the test split.
svr_pred = svr.predict(X_test)

In [None]:
# Report the four comparison metrics for the SVR predictions.
svr_mse = metrics.mean_squared_error(y_test, svr_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, svr_pred))
print('Mean Squared Error:', svr_mse)
print('Root Mean Squared Error:', np.sqrt(svr_mse))
print('R-squared Error:', metrics.r2_score(y_test, svr_pred))

In [None]:
# Show the hyperparameters the SVR is currently configured with.
svr.get_params()

#### Model Improvements - try a larger regularization parameter C, then a different kernel

In [None]:
# Raise C to 10 on the existing SVR (other settings unchanged), refit and re-score.
svr.set_params(C=10.0)
svr.fit(X_train, y_train)
svr_pred = svr.predict(X_test)

svr_mse = metrics.mean_squared_error(y_test, svr_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, svr_pred))
print('Mean Squared Error:', svr_mse)
print('Root Mean Squared Error:', np.sqrt(svr_mse))
print('R-squared Error:', metrics.r2_score(y_test, svr_pred))

In [None]:
# Switch the same SVR object to a polynomial kernel; parameters set earlier on it
# (such as C) are retained. Refit and re-score.
svr.set_params(kernel='poly')
svr.fit(X_train, y_train)
svr_pred = svr.predict(X_test)

svr_mse = metrics.mean_squared_error(y_test, svr_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, svr_pred))
print('Mean Squared Error:', svr_mse)
print('Root Mean Squared Error:', np.sqrt(svr_mse))
print('R-squared Error:', metrics.r2_score(y_test, svr_pred))

In [None]:
# Show the SVR hyperparameters after the set_params() changes.
svr.get_params()

# KNN Model

In [None]:
# All imports for this cell are grouped at the top. The original also imported the
# `sklearn.neighbors` module as `neighbors`, which was never used and is overwritten
# by an array of the same name in the parameter sweep that follows; it is dropped here.
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Rebuild the design matrix / target and the seeded 80/20 split.
X = train[features]
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Baseline k-nearest-neighbours regressor with k=5.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Report the four comparison metrics on the test split.
knn_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', knn_mse)
print('Root Mean Squared Error:', np.sqrt(knn_mse))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

### Evaluating different parameters

In [None]:
# Sweep n_neighbors over 1..10 and record knn.score() on both splits.
# NOTE(review): for scikit-learn regressors score() is documented as R^2, so the
# "accuracy" axis is presumably R^2 — confirm.
neighbors = np.arange(1, 11)
train_accuracy = np.empty(neighbors.shape[0])
test_accuracy = np.empty(neighbors.shape[0])

for i, k in enumerate(neighbors):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
    train_accuracy[i] = knn.score(X_train, y_train)

plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.title('KNN Neighbor Number')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Refit with k=4 and report the four comparison metrics on the test split.
knn = KNeighborsRegressor(n_neighbors=4)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

knn_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', knn_mse)
print('Root Mean Squared Error:', np.sqrt(knn_mse))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

In [None]:
# Compare the two neighbour-weighting schemes at k=4, scoring both splits.
weights_opt = ['uniform', 'distance']
n_options = len(weights_opt)
train_accuracy = np.empty(n_options)
test_accuracy = np.empty(n_options)

for i in range(n_options):
    knn = KNeighborsRegressor(n_neighbors=4, weights=weights_opt[i])
    knn.fit(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
    train_accuracy[i] = knn.score(X_train, y_train)

plt.plot(weights_opt, test_accuracy, label='Testing Accuracy')
plt.plot(weights_opt, train_accuracy, label='Training accuracy')
plt.title('KNN Weights')
plt.xlabel('Weights')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Compare the neighbour-search algorithms at k=4 with distance weighting.
# NOTE(review): `algorithm` selects how neighbours are searched for; the scores are
# presumably (near-)identical across options — confirm against the plot.
alg = ['auto', 'ball_tree', 'kd_tree', 'brute']
n_options = len(alg)
train_accuracy = np.empty(n_options)
test_accuracy = np.empty(n_options)

for i in range(n_options):
    knn = KNeighborsRegressor(n_neighbors=4, weights='distance', algorithm=alg[i])
    knn.fit(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
    train_accuracy[i] = knn.score(X_train, y_train)

plt.plot(alg, test_accuracy, label='Testing Accuracy')
plt.plot(alg, train_accuracy, label='Training accuracy')
plt.title('KNN Algorithm')
plt.xlabel('Algorithm')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Sweep the ball-tree leaf size over 1..99 at k=4 with distance weighting.
# NOTE(review): scikit-learn documents leaf_size as affecting tree build/query speed
# and memory, so flat curves are expected here — confirm.
leaf = np.arange(1, 100)
train_accuracy = np.empty(leaf.shape[0])
test_accuracy = np.empty(leaf.shape[0])

for i, k in enumerate(leaf):
    knn = KNeighborsRegressor(
        n_neighbors=4,
        weights='distance',
        algorithm='ball_tree',
        leaf_size=k,
    )
    knn.fit(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
    train_accuracy[i] = knn.score(X_train, y_train)

plt.plot(leaf, test_accuracy, label='Testing Accuracy')
plt.plot(leaf, train_accuracy, label='Training accuracy')
plt.title('KNN Ball Tree Leaf Size')
plt.xlabel('Leaf Size')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Sweep the KD-tree leaf size over 1..99 at k=4 with distance weighting.
# NOTE(review): scikit-learn documents leaf_size as affecting tree build/query speed
# and memory, so flat curves are expected here — confirm.
leaf = np.arange(1, 100)
train_accuracy = np.empty(leaf.shape[0])
test_accuracy = np.empty(leaf.shape[0])

for i, k in enumerate(leaf):
    knn = KNeighborsRegressor(
        n_neighbors=4,
        weights='distance',
        algorithm='kd_tree',
        leaf_size=k,
    )
    knn.fit(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
    train_accuracy[i] = knn.score(X_train, y_train)

plt.plot(leaf, test_accuracy, label='Testing Accuracy')
plt.plot(leaf, train_accuracy, label='Training accuracy')
plt.title('KNN KD Tree Leaf Size')
plt.xlabel('Leaf Size')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Sweep the `p` distance parameter over 1..4 at k=4 with distance weighting.
p = np.arange(1, 5)
train_accuracy = np.empty(p.shape[0])
test_accuracy = np.empty(p.shape[0])

for i, k in enumerate(p):
    knn = KNeighborsRegressor(
        n_neighbors=4,
        weights='distance',
        algorithm='auto',
        p=k,
    )
    knn.fit(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
    train_accuracy[i] = knn.score(X_train, y_train)

plt.plot(p, test_accuracy, label='Testing Accuracy')
plt.plot(p, train_accuracy, label='Training accuracy')
plt.title('KNN Power Parameter')
plt.xlabel('Power Parameter')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Fit the chosen configuration (k=4, distance weighting, p=1) and report the four
# comparison metrics on the test split.
knn = KNeighborsRegressor(
    n_neighbors=4,
    weights='distance',
    algorithm='auto',
    p=1,
)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

knn_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', knn_mse)
print('Root Mean Squared Error:', np.sqrt(knn_mse))
print('R-squared Error:', metrics.r2_score(y_test, y_pred))

# Overall Accuracy of KNN Model After Testing

In [None]:
# Refit the chosen KNN configuration and print its test-split score as a percentage.
# NOTE(review): for scikit-learn regressors score() is documented as R^2, so this
# "accuracy" is presumably R^2 x 100 — confirm the wording.
knn = KNeighborsRegressor(
    n_neighbors=4,
    weights='distance',
    algorithm='auto',
    p=1,
)
knn.fit(X_train, y_train)
accuracy = knn.score(X_test, y_test)
print('Final Accuracy:', '{:.3f}%'.format(accuracy * 100))

# Linear Regression

In [None]:
# Imports grouped once at the top of the cell. The original imported `metrics`
# twice and imported `linear_model` without ever using it.
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Rebuild the design matrix / target and the seeded 80/20 split.
X = train[features]
y = train['SalePrice']

lr = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report the four comparison metrics, rounded to 4 decimals.
lr_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', round(metrics.mean_absolute_error(y_test, y_pred), 4))
print('Mean Squared Error:', round(lr_mse, 4))
print('Root Mean Squared Error:', round(np.sqrt(lr_mse), 4))
print('R-squared Error:', round(metrics.r2_score(y_test, y_pred), 4))

In [None]:
# Show the hyperparameters of the fitted linear regression.
lr.get_params()

In [None]:
#Plotting a heatmap to view the features
# Heatmap of absolute pairwise correlations among the selected model features.
feature_corr = X.corr().abs()
plt.figure(figsize=(12, 12))
sns.heatmap(feature_corr, cmap='YlGnBu')

In [None]:
#Droppig some of the features to improve the model
# Drop the ExterQual one-hot columns from the design matrix, then summarise what remains.
exterqual_cols = ['ExterQual_Fa', 'ExterQual_Gd', 'ExterQual_TA']
X = X.drop(columns=exterqual_cols)
X.describe().T

In [None]:
#no improvements found
# Refit the linear regression on the reduced feature set.
# Outcome recorded by the authors: no improvement found.
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1,
)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

lr_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', round(metrics.mean_absolute_error(y_test, y_pred), 4))
print('Mean Squared Error:', round(lr_mse, 4))
print('Root Mean Squared Error:', round(np.sqrt(lr_mse), 4))
print('R-squared Error:', round(metrics.r2_score(y_test, y_pred), 4))