# <font color='blue'>House Pricing:</font> <font color='red'>Advanced Regression Techniques</font> with a 98% training-set R² score using RandomForest Regression
* **Part 1 - Data Preprocessing**
   1. Importing libraries
   2. Importing the dataset
   3. Dataset information
   4. Dropping unnecessary columns
      - "Train" 
      - "Test" 
   5. Taking care of missing data
      - "Train" Numerical
      - "Train" Categorical
      - "Test" Numerical
      - "Test" Categorical
      - Updated info()
   6. Encoding categorical data
      - "Train"
      - "Test"
      - Updated head()
   7. Splitting the Train & Test datasets
   8. Feature Scaling  
   9. Dimensionality reduction
* **Part 2 - Training the Regression model**
   1. RandomForest 
   2. Other algorithms
   3. Accuracy score  
* **Part 3 - Creating a submission.csv**

# <font color='blue'>Part 1 - Data Preprocessing</font>

# Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.pyplot import figure

# Importing the dataset

In [2]:
# Read the competition's train and test CSV files from the notebook's ../input directory.
train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Dataset information

In [3]:
# Column dtypes and non-null counts of the training set (shows which columns have missing values)
train_df.info()

In [4]:
# Column dtypes and non-null counts of the test set
test_df.info()

In [5]:
# Summary statistics of the training set (the test set follows in the next cell)
train_df.describe()

In [6]:
# Summary statistics of the test set
test_df.describe()

In [7]:
# First five rows of the training set for a quick overview
train_df.head()

In [8]:
# First five rows of the test set
test_df.head()

In [9]:
# Distribution of the target variable.
# `sns.distplot` is deprecated (seaborn >= 0.11) and removed in later releases;
# `histplot` on a density scale with a KDE overlay is the documented replacement.
sns.histplot(train_df['SalePrice'], kde=True, stat='density');

In [10]:
# Correlation heatmap of the numeric features.
# NOTE: this sets the figure size globally, so it also applies to every later plot.
plt.rcParams['figure.figsize']=35,35
# Restrict to numeric columns explicitly: DataFrame.corr() raises on string
# columns in pandas >= 2.0 (older versions silently skipped them).
numeric_corr = train_df.select_dtypes(include='number').corr()
g = heatmap = sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG',fmt = ".1f")
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':10}, pad=12);

In [11]:
# Bar plot of SalePrice grouped by YearBuilt
sns.barplot(x='YearBuilt', y='SalePrice', data=train_df)

# Dropping unnecessary columns

### "Train"

In [12]:
# Drop the row identifier and the four unwanted columns in a single call
# (same result as dropping them one at a time).
train_df = train_df.drop(columns=["Id", "Alley", "PoolQC", "Fence", "MiscFeature"])

### "Test"

In [13]:
# Drop the same four columns from the test set in a single call.
# "Id" is kept here: it is needed later when building the submission.
test_df = test_df.drop(columns=["Alley", "PoolQC", "Fence", "MiscFeature"])

# Taking care of missing data

### "Train" Numerical

In [14]:
# Mean-impute the two continuous columns.
for mean_col in ("LotFrontage", "MasVnrArea"):
    train_df[mean_col] = train_df[mean_col].fillna(train_df[mean_col].mean())

# Missing garage build years get the fixed value 2001.
train_df["GarageYrBlt"] = train_df["GarageYrBlt"].fillna(2001)

### "Train" Categorical 

In [15]:
# Categorical training columns to impute with their most frequent value (mode).
c = ("GarageType", "GarageFinish", "GarageQual", "GarageCond", "BsmtFinType2", "BsmtCond", "BsmtQual", "BsmtExposure", "MasVnrType", "Electrical", "FireplaceQu", "BsmtFinType1")
for col in c:
  # Only string-typed columns are filled; anything else is left untouched.
  if train_df[col].dtype == "object":
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])

# Alternative (kept as a comment so it is not echoed as cell output):
# treat a missing value as its own category instead of using the mode.
#   for col in c:
#       train_df[col] = train_df[col].fillna('None')

### "Test" Numerical

In [16]:
# Columns filled with their own (test-set) mean.
# NOTE(review): these means come from the test data itself rather than from the
# training data; confirm that this is intended.
for mean_col in ("LotFrontage", "MasVnrArea", "GarageArea",
                 "BsmtFinSF1", "BsmtFinSF2", "TotalBsmtSF", "BsmtUnfSF"):
    test_df[mean_col] = test_df[mean_col].fillna(test_df[mean_col].mean())

# Columns filled with a fixed constant.
for const_col, fill_value in {"GarageYrBlt": 2001, "GarageCars": 0,
                              "BsmtFullBath": 0, "BsmtHalfBath": 0}.items():
    test_df[const_col] = test_df[const_col].fillna(fill_value)


### "Test" Categorical

In [17]:
# Categorical test columns to impute with their most frequent value (mode).
c = ("GarageType", "GarageFinish", "GarageQual", "GarageCond", "BsmtFinType2", "BsmtCond", "BsmtQual", "BsmtExposure", "MasVnrType", "Electrical","MSZoning","Utilities","Exterior1st","Exterior2nd","KitchenQual","Functional","FireplaceQu","SaleType", "BsmtFinType1")
for col in c:
  # Only string-typed columns are filled; anything else is left untouched.
  if test_df[col].dtype == "object":
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

# Alternative (kept as a comment so it is not echoed as cell output):
# treat a missing value as its own category instead of using the mode.
#   for col in c:
#       test_df[col] = test_df[col].fillna('None')

### Updated info()

In [18]:
# Re-check non-null counts after imputation; every remaining column should now be fully populated
train_df.info()

In [19]:
# Same check for the test set
test_df.info()

### Plotting the categorical data

In [20]:
# One box plot of SalePrice per categorical column, with the categories ordered
# by their mean SalePrice.
categorical_cols_df = train_df.select_dtypes(include='object')
cols = categorical_cols_df.columns

# Two plots per row; derive the row count from the data instead of hard-coding it.
n_rows = (len(cols) + 1) // 2
fig = plt.figure(figsize = (30,60))
fig.subplots_adjust(hspace=0.3)

# Subplot positions are 1-based while column positions are 0-based, so iterate
# over the columns themselves; indexing cols with the subplot number skipped
# the first categorical column.
for i, col in enumerate(cols, start=1):
    plt.subplot(n_rows, 2, i)
    order = train_df.groupby(col)['SalePrice'].mean().sort_values(ascending=True).index.values
    sns.boxplot(x=col, y='SalePrice', data= train_df, order = order)

# Encoding categorical data

In [21]:
# Ordinal-encode every categorical column: each category is replaced by its rank
# when the categories are sorted by mean SalePrice in the training set
# (0 = lowest mean price). The same mapping is applied to the test set.
categorical_cols_df = train_df.select_dtypes(include='object')
cols = categorical_cols_df.columns
for i in cols:
    # The ordering must be computed before the training column is overwritten.
    order = train_df.groupby(i)['SalePrice'].mean().sort_values(ascending=True).index.values
    codes = {category: rank for rank, category in enumerate(order)}
    # Assign the result back explicitly: `df[col].replace(..., inplace=True)`
    # acts on a temporary under pandas Copy-on-Write and would leave the frame
    # unchanged, and string-to-int `replace` relies on deprecated downcasting.
    train_df[i] = train_df[i].map(codes)
    # Categories that never occur in the training data become NaN here.
    test_df[i] = test_df[i].map(codes)

In [22]:
# Column dtypes of the training set after encoding
train_df.info()

### Updated head()

In [23]:
# Categorical columns now hold the integer codes assigned during encoding
train_df.head()

In [24]:
# Same preview for the test set
test_df.head()

In [25]:
# Missing values per test column, largest first; only columns that still have
# at least one missing value are listed.
missing_counts = test_df.isnull().sum()
missing_counts[missing_counts > 0].sort_values(ascending=False)

In [26]:
# Column dtypes of the test set after encoding
test_df.info()

# Splitting the Train & Test datasets

In [27]:
# Features are every column except the target; the target is SalePrice.
X_train = train_df.drop("SalePrice", axis=1)
Y_train = train_df["SalePrice"]
# "Id" is only an identifier, so it is excluded from the test features.
X_test  = test_df.drop("Id", axis=1).copy()

# Positional alternative (kept as a comment so it is not echoed as cell output).
# A DataFrame needs .iloc for positional slicing, and this form assumes that
# SalePrice is the last training column and Id the first test column:
#   X_train = train_df.iloc[:, :-1]
#   Y_train = train_df.iloc[:, -1]
#   X_test  = test_df.iloc[:, 1:]

# Feature Scaling

In [28]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [29]:
# Inspect the training features after scaling
print(X_train)

In [30]:
# Inspect the target values (SalePrice); these are not scaled
print(Y_train)

# Dimensionality Reduction

In [31]:
# Principal Component Analysis: fit the projection on the training features only
# and reuse it for the test features, keeping 10 components.
from sklearn.decomposition import PCA
pca = PCA(n_components = 10)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# <font color='blue'>Part 2 - Training the Regression model on the Training set</font>

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Sweep the number of trees and record the 5-fold cross-validated R² (in percent)
# for each setting. Scoring a forest on the same rows it was trained on mostly
# measures memorisation, so held-out folds are used to compare the settings.
# NOTE(review): the scaler and PCA above were fitted on the full training set,
# so the folds are not completely independent of the preprocessing.
score_RF_list = []
for i in range(10,250,10):
    RF = RandomForestRegressor(n_estimators = i, random_state = 0)
    cv_r2 = cross_val_score(RF, X_train, Y_train, cv=5, scoring='r2')
    score_RF = round(float(cv_r2.mean()) * 100, 2)
    score_RF_list.append(score_RF)

score_RF_list

In [33]:
# Tree counts tried in the sweep (same range as the training loop) plotted
# against the score recorded for each.
x_axis = list(range(10, 250, 10))
sns.lineplot(x=x_axis, y=score_RF_list)

In [43]:

# Final model: 50 trees with a fixed seed; fit() returns the estimator itself,
# so construction and training can be chained.
regressor = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_train, Y_train)
# Predicted sale prices for the test features.
Y_pred = regressor.predict(X_test)

### Other Algorithms 

In [35]:
# NOTE(review): the block below is a string literal, not executed code. It
# sketches an XGBoost alternative to the random forest and is echoed as this
# cell's output when the cell runs.
''' OR
from xgboost import XGBRegressor
regressor = XGBRegressor()
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)
'''

### Accuracy score

In [44]:
# R² of the final model on the data it was trained on, in percent.
# This is an in-sample score, so it overstates how well the model generalises.
# The result is stored under its own name: assigning it to `regressor` replaced
# the fitted model with a float, which broke re-running this cell.
# (`accuracy_score` is a classification metric and was never used, so it is
# no longer imported.)
train_r2_percent = round(regressor.score(X_train, Y_train) * 100, 2)
train_r2_percent

# <font color='blue'>Part 3 - Creating a submission.csv</font>

In [37]:
# Pair every test Id with its predicted sale price.
submission = pd.DataFrame({"Id": test_df["Id"], "SalePrice": Y_pred})

In [38]:
# Write the predictions to submission.csv without the DataFrame index, then display the frame.
submission.to_csv('submission.csv', index=False)
submission

# If you liked my work, please upvote. Thank you!