In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from statsmodels.api import OLS
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import seaborn as sns

In [3]:
# Read the training CSV (the path is relative to the notebook's working
# directory) and display the resulting frame.
train_csv_path = "house-prices-advanced-regression-techniques/train.csv"
training_data = pd.read_csv(train_csv_path)
training_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [None]:
# Count missing values per column, order the counts from most to fewest, and
# show only the columns that actually contain missing values.
null_counts = training_data.isna().sum()
columns_has_null = null_counts.sort_values(ascending=False)
columns_has_null.loc[columns_has_null.gt(0)]

In [None]:
# Remove the listed columns (presumably the sparsest ones in the null summary
# above — confirm), then recompute the per-column missing-value counts.
# The frame is rebound instead of dropped in place, and errors='ignore' lets
# the cell be re-run after the columns are already gone without a KeyError.
training_data = training_data.drop(
    columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'],
    errors='ignore',
)
columns_has_null = training_data.isna().sum().sort_values(ascending=False)
columns_has_null[columns_has_null > 0]

In [None]:
# Features: every column except the last one (the last column is the target).
# .copy() makes X_train an independent frame, so writing encoded columns into
# it later neither raises SettingWithCopyWarning nor depends on whether pandas
# handed back a view of training_data.
X_train = training_data.iloc[:, :-1].copy()
X_train

In [None]:
def ConvertStringToInt(df):
    """Return a copy of ``df`` with its non-numeric columns label-encoded.

    Every column whose dtype is ``object`` or ``category`` is passed through
    ``LabelEncoder.fit_transform`` and replaced by the resulting integer codes.
    The encoder is re-fitted for each column, so codes are only meaningful
    within one column of one call; two separate calls (e.g. on a training and
    a test frame) are not guaranteed to assign the same code to the same label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns. Columns that could not be encoded
        are left unchanged and a message is printed for each of them.
    """
    # Work on a copy so that a slice or an alias passed by the caller is never
    # written to.
    encoded = df.copy()
    # Names of all columns holding non-numeric (object / category) values.
    columns_to_encode = list(encoded.select_dtypes(include=['category', 'object']))
    le = LabelEncoder()
    for feature in columns_to_encode:
        try:
            encoded[feature] = le.fit_transform(encoded[feature])
        except Exception as exc:
            # Best effort: keep going with the remaining columns, but report
            # why this one failed. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate, and the f-string also
            # works when the column label is not a string.
            print(f'Error encoding {feature}: {exc!r}')
    return encoded

In [None]:
# Label-encode the categorical columns, then mark every remaining missing value
# with -1. fillna is used instead of replace({np.NaN: ...}) because the np.NaN
# alias was removed in NumPy 2.0 and raises AttributeError there.
X_train = ConvertStringToInt(X_train).fillna(-1)
X_train

In [None]:
# Target vector: the last column of the training frame as a flat 1-D NumPy array.
y_train = training_data.iloc[:, -1].to_numpy()
y_train

In [None]:
# Read the test CSV (the path is relative to the notebook's working directory)
# and display the resulting frame.
test_csv_path = "house-prices-advanced-regression-techniques/test.csv"
testing_data = pd.read_csv(test_csv_path)
testing_data

In [None]:
# Remove the same columns that were removed from the training frame, then
# recompute the per-column missing-value counts for the test frame.
# The frame is rebound instead of dropped in place, and errors='ignore' lets
# the cell be re-run after the columns are already gone without a KeyError.
testing_data = testing_data.drop(
    columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'],
    errors='ignore',
)
columns_has_null = testing_data.isna().sum().sort_values(ascending=False)
columns_has_null[columns_has_null > 0]

In [None]:
# Label-encode the categorical columns of the test frame, then mark every
# remaining missing value with -1. fillna is used instead of
# replace({np.NaN: ...}) because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): ConvertStringToInt fits its encoding on whatever frame it is
# given, so the integer codes here are not guaranteed to match the codes that
# were assigned to the same labels in X_train.
X_test = ConvertStringToInt(testing_data).fillna(-1)
y_test = None  # no target available for this frame — presumably test.csv has no SalePrice; confirm

In [None]:
# Fit a random forest on the full training set. random_state pins the forest's
# internal randomness so that Restart & Run All reproduces the same model and
# the same scores; 1 matches the seed used for train_test_split in this notebook.
forest = RandomForestRegressor(random_state=1)
forest.fit(X_train, y_train)

In [None]:
# Score the model on the same rows it was fitted on (an in-sample figure).
train_score = forest.score(X_train, y_train)
print("train score:", train_score)

In [None]:
# Label-encode the whole training frame and fill remaining missing values with
# 0. fillna is used instead of replace({np.NaN: ...}) because the np.NaN alias
# was removed in NumPy 2.0.
training_data = ConvertStringToInt(training_data).fillna(0)
# Features: every column except the last one.
X = training_data.iloc[:, :-1]
# Target: the last column as a 1-D Series. A one-column DataFrame
# (iloc[:, -1:]) makes scikit-learn emit DataConversionWarning on fit;
# a Series still supports the .to_numpy() calls used by the plotting cells.
y = training_data.iloc[:, -1]

In [None]:
# Split the rows 75% / 25% into fitting and hold-out sets; random_state makes
# the split repeatable. This rebinds X_train, X_test, y_train and y_test, which
# earlier cells had already assigned.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=1,
)

In [None]:
# Display the current y_test (the hold-out targets once the split cell above has run).
y_test

In [None]:
# Fit a fresh forest on the 75% fitting split. random_state pins the forest's
# internal randomness so that the scores and plots below are reproducible; 1
# matches the seed used for train_test_split.
forest = RandomForestRegressor(random_state=1)
# np.ravel flattens the target to 1-D: a one-column DataFrame would make
# scikit-learn emit DataConversionWarning, and a Series/array passes unchanged.
forest.fit(X_train, np.ravel(y_train))

In [None]:
# Score on the rows used for fitting (in-sample).
fit_split_score = forest.score(X_train, y_train)
print("model score on training data:", fit_split_score)

In [None]:
# Score on the hold-out rows that the model did not see during fitting.
holdout_score = forest.score(X_test, y_test)
print("model score on test data:", holdout_score)

In [None]:
# Global seaborn styling; it applies to the figures drawn after this cell.
sns.set_context("paper")  # select the "paper" plotting-context preset
sns.set_style("ticks")  # select the "ticks" axes-style preset

In [None]:
fig, ax = plt.subplots()

# Observed and predicted prices for the hold-out rows, plotted against their
# position in the hold-out set.
observed_prices = y_test.to_numpy()
predicted_prices = forest.predict(X_test)
ax.plot(observed_prices, alpha=0.8, label=r"observed price")
ax.plot(predicted_prices, alpha=0.8, label=r"predicted price")

# Axis labels and legend.
ax.set_xlabel(r"Sample number")
ax.set_ylabel(r"house price")
ax.legend(loc="upper left")

# Restrict the view to sample positions 0..100.
ax.set_xlim(0, 100)

fig.tight_layout()
# To export the figure: fig.savefig("House_price_regression.png", dpi=600)
plt.show()

In [None]:
fig, ax = plt.subplots()

# One dot per hold-out row: predicted price on x, observed price on y.
predicted_prices = forest.predict(X_test)
observed_prices = y_test.to_numpy()
ax.plot(predicted_prices, observed_prices, ".", alpha=0.6)

ax.set_xlabel(r"predicted price")
ax.set_ylabel(r"observed price")

# Both axes are fixed to 0..500000; points outside that window are not shown.
ax.set_xlim(0, 500000)
ax.set_ylim(0, 500000)

fig.tight_layout()
# To export the figure: fig.savefig("predicted_house_price_regression_correlation.png", dpi=600)
plt.show()

In [None]:
# Pair each importance with its column name and rank the pairs from most to
# least important (ties fall back to comparing the names).
ranked_features = sorted(zip(forest.feature_importances_, X.columns), reverse=True)
# Unzip into two parallel lists that share the ranked order.
importances = [pair[0] for pair in ranked_features]
feature_names = [pair[1] for pair in ranked_features]

In [None]:
# feature_names / importances are already ordered from most to least
# important, so the leading slice holds the top-ranked features. Plotting only
# that slice replaces drawing every bar and cropping with xlim(0, 20), which
# cut the first (most important) bar in half.
top_n = 20
top_names = feature_names[:top_n]
top_importances = importances[:top_n]

fig, ax = plt.subplots()
ax.bar(top_names, top_importances)
# Anchor rotated labels at their right end so each one lines up with its bar.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel(r"feature importance")
# Keep the 0..0.6 range unless the tallest bar would be clipped by it.
ax.set_ylim(0, max(0.6, 1.05 * max(top_importances, default=0)))
fig.tight_layout()
plt.show()