In [2]:
import pandas as pd
import numpy as np

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

Data Dictionary:-

1. X2=the house age (unit: year)
2. X3=the distance to the nearest MRT station (unit: meter)
3. X4=the number of convenience stores in the living circle on foot (integer)
4. X5=the geographic coordinate, latitude. (unit: degree)
5. X6=the geographic coordinate, longitude. (unit: degree)

The output is as follows -
Y= house price of unit area (10000 New Taiwan Dollar/Ping, where Ping is a local unit, 1 Ping = 3.3 meter squared)

In [3]:
# Read the CSV using its first column as the index, then move that index back
# into an ordinary column so the frame ends up with a default integer index.
df = (
    pd.read_csv('../input/real-estate/Real estate.csv', index_col=[0])
    .reset_index()
)
df

In [4]:
# Remove the 'No' column (presumably just a row counter — verify against the CSV).
# errors='ignore' keeps this cell safe to re-run once the column is already gone,
# whereas `del df['No']` raises KeyError on the second execution.
df = df.drop(columns=['No'], errors='ignore')


In [5]:
df.head()

In [6]:
df.isna().sum()

In [7]:
df.info()

### Exploratory Data Analysis

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
df['house age'].plot.hist()

In [10]:
df['house age'].plot.box()

1. About half of the houses (50% of them) are less than 15 years old.
2. The rest are more than 15 years old.


In [11]:
import plotly.express as px

In [12]:
# Histogram of the distance to the nearest MRT station, selected by column name.
px.histogram(
    df,
    x='distance to the nearest MRT station',
    width=600,
    height=500,
)

1. Most of the houses are less than 2 km from the nearest MRT station.

In [13]:
# Scatter of unit-area price against MRT distance, selected by column name.
px.scatter(
    df,
    x='distance to the nearest MRT station',
    y='house price of unit area',
)

1. Houses closest to the metro station have relatively higher prices per unit area: roughly 30–80 within 1 km of a station, followed by roughly 20–40 for houses 1–2 km away.

2. More generally, the price per unit area decreases as the distance between the house and the metro station increases.


In [14]:
# Treat the store count as a categorical variable for the frequency plots below.
# NOTE(review): the category dtype stays on df for the later correlation and VIF
# cells and is only cast back to int on X further down — confirm that is intended.
df['number of convenience stores']= df['number of convenience stores'].astype('category')


In [15]:
# Bar chart of how often each store count occurs, most frequent first.
# The title and x-label had typos ('Countof', 'convinenct'); fixed here because
# they are what the reader of the figure sees.
store_counts = df['number of convenience stores'].value_counts().sort_values(ascending=False)
store_counts.plot(kind='bar')
plt.title('Count of convenience stores')
plt.xlabel('Number of convenience stores')
plt.ylabel('Count');

In [16]:
df['number of convenience stores'].value_counts(normalize=True)

1. Around 50% of the houses have fewer than 5 convenience stores nearby.

### Target Variable

In [17]:
px.box(df['house price of unit area'])

### Removing outliers


In [18]:
# Tukey's rule: flag prices lying at or beyond 1.5 * IQR from the quartiles.
# Series.quantile takes the same 'midpoint' interpolation as before without
# relying on np.percentile's deprecated `interpolation=` keyword.
price = df['house price of unit area']
Q1 = price.quantile(0.25, interpolation='midpoint')
Q3 = price.quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1

print("Old Shape: ", df.shape)

# Boolean masks stay aligned with df's index labels.
upper_mask = price >= (Q3 + 1.5 * IQR)
lower_mask = price <= (Q1 - 1.5 * IQR)

# Positional indices of the flagged rows, kept under their original names.
upper = np.where(upper_mask)
lower = np.where(lower_mask)

# Removing the outliers.
# Filter with the masks instead of passing np.where positions to df.drop():
# drop() interprets its argument as index *labels*, so the old code was only
# correct while the index was an untouched 0..n-1 range and would remove the
# wrong rows (or raise KeyError) if this cell were run a second time.
df = df.loc[~(upper_mask | lower_mask)].copy()

print("New Shape: ", df.shape)

In [19]:
px.box(df['house price of unit area'])

### Checking the multicollinearity

In [20]:
import seaborn as sns

In [21]:
# Cast to float before correlating: 'number of convenience stores' was turned
# into a category earlier, and whether DataFrame.corr() keeps or drops
# non-numeric columns depends on the installed pandas version. The explicit
# cast makes the store count appear in the heatmap either way.
sns.heatmap(df.astype(float).corr(), annot=True)

In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF describes collinearity among the predictors, so the target column is left
# out. An explicit intercept column is added because variance_inflation_factor
# regresses each column on the others exactly as given; without a constant the
# auxiliary regressions have no intercept and the factors are inflated.
# astype(float) gives a purely numeric matrix (the store count is categorical).
vif_design = (
    df.drop(columns=['house price of unit area'])
    .astype(float)
    .assign(const=1.0)
)
vif_matrix = vif_design.values
predictors = [col for col in vif_design.columns if col != 'const']

vif = pd.DataFrame({
    "features": predictors,
    "vif_Factor": [
        round(variance_inflation_factor(vif_matrix, vif_design.columns.get_loc(col)), 1)
        for col in predictors
    ],
})
vif

In [23]:
df['longitude'].corr(df['latitude'])

In [24]:
# Separate the target column from the predictor columns.
target_col = 'house price of unit area'
Y = df[target_col]
X = df.drop(columns=[target_col])

In [25]:
# Convert the MRT distance (metres, per the data dictionary) to integers.
# NOTE(review): astype(int) discards the fractional part; nothing in this
# notebook shows why that is needed before standardisation — confirm or remove.
X['distance to the nearest MRT station']= X['distance to the nearest MRT station'].astype(int)

### Gradient Descent Algorithm

In [26]:
# Show the column dtypes before conversion (the store count is still categorical here).
X.info()
# The standardisation step calls mean()/std() on every column, which needs a
# numeric dtype, so cast the categorical store count back to integers.
X['number of convenience stores']= X['number of convenience stores'].astype(int)

In [27]:
def _zscore(column):
  """Centre one column on its mean and scale it by its standard deviation."""
  centred = column - column.mean()
  return centred / column.std()

# Standardise every feature column independently.
X = X.apply(_zscore)
X

### INITIALIZING THE VALUES FOR SLOPE AND INTERCEPT

In [28]:
import random
def initialize(dim, seed=None):
  """Draw random starting values for the intercept and the slopes.

  Parameters
  ----------
  dim : number of slope coefficients to create.
  seed : optional seed. When given, both values come from a dedicated
    ``np.random.default_rng(seed)`` so the starting point is reproducible.
    When omitted, the values are drawn from the global ``random`` and
    ``np.random`` generators exactly as before.

  Returns
  -------
  (b, theta) : a float in [0, 1) and a 1-D array of ``dim`` floats in [0, 1).
  """
  if seed is None:
    # Legacy path: unchanged behaviour for existing callers.
    return random.random(), np.random.rand(dim)
  rng = np.random.default_rng(seed)
  b = float(rng.random())
  theta = rng.random(dim)
  return b, theta
# Size theta from the feature matrix instead of hard-coding the column count,
# so the cell keeps working if a feature is added or removed.
b,theta=initialize(X.shape[1])
print("Intercept: ",b,"Slope: ",theta)

### Predicted values of Y

In [29]:
def predict_Y(b,theta,X):
  """Return the linear prediction: the dot product of X and theta, shifted by b."""
  weighted_sum = np.dot(X, theta)
  return b + weighted_sum
Y_hat=predict_Y(b,theta,X)
Y_hat

### Cost function 

### MSE VALUE FOR THE INITIAL SLOPE AND INTERCEPT VALUES

In [30]:
import math
def get_cost(Y,Y_hat):
  """Return the mean squared error between Y and Y_hat.

  Both arguments may be any array-like (list, ndarray, pandas Series). They are
  compared position by position, so a Series with a non-default index is
  handled the same way as a plain array.

  The previous version divided by ``len(Y-Y_resid)``, which built a second
  array only to read its length, and it could not accept plain lists.
  """
  resid = (np.asarray(Y, dtype=float) - np.asarray(Y_hat, dtype=float)).ravel()
  # Sum of squared residuals divided by the number of observations.
  return np.dot(resid, resid) / resid.size
Y_hat=predict_Y(b,theta,X)
get_cost(Y,Y_hat)

In [31]:
def update_theta(x,y,y_hat,b_0,theta_o,learning_rate):
  """Take one gradient-descent step on the mean-squared-error cost.

  x, y and y_hat are the features, targets and current predictions; b_0 and
  theta_o are the current intercept and slopes. Returns the updated
  (intercept, slopes) pair after moving against the gradient by learning_rate.
  """
  n_obs = len(y)
  error = y_hat - y
  # Partial derivatives of the MSE with respect to the intercept and the slopes.
  grad_b = (np.sum(error) * 2) / n_obs
  grad_theta = (np.dot(error, x) * 2) / n_obs
  return b_0 - learning_rate * grad_b, theta_o - learning_rate * grad_theta
# Take a single gradient step from the current (b, theta) and print both states.
print("After initialization -intercept: ",b,"slope: ",theta)
Y_hat=predict_Y(b,theta,X)
b,theta=update_theta(X,Y,Y_hat,b,theta,0.01)
print("After first update -intercept: ",b,"slope: ",theta)
# NOTE(review): Y_hat was computed before the update, so the value shown here is
# the cost of the pre-update parameters, not of the ones just printed — confirm
# that this is the intended comparison.
get_cost(Y,Y_hat)

In [32]:
def run_gradient_descent(X,Y,alpha,num_iterations,log_every=10):
  """Fit the intercept and slopes by batch gradient descent.

  Parameters
  ----------
  X : feature matrix; X.shape[1] determines how many slopes are fitted.
  Y : target values.
  alpha : learning rate handed to update_theta.
  num_iterations : number of gradient steps to take.
  log_every : record the cost every this many iterations (default 10, the
    previously hard-coded interval). Must be a positive integer.

  Returns
  -------
  (gd_iterations_df, b, theta) : gd_iterations_df has the columns 'iteration'
  and 'cost' and is indexed by the iteration number; each logged cost is the
  one measured before that iteration's update. b and theta are the final
  intercept and slopes.
  """
  b,theta=initialize(X.shape[1])
  # Collect the log in a list and build the frame once at the end; enlarging a
  # DataFrame with .loc on every logged step copies it each time.
  history=[]
  for iter_num in range(num_iterations):
    Y_hat=predict_Y(b,theta,X)
    if iter_num%log_every==0:
      # The cost is only needed on logged iterations.
      history.append((iter_num,get_cost(Y,Y_hat)))
    b,theta=update_theta(X,Y,Y_hat,b,theta,alpha)
  print("Final Estimate of b and slope :",b,theta)
  logged_iters=[it for it,_ in history]
  gd_iterations_df=pd.DataFrame(history,columns=['iteration','cost'],index=logged_iters)
  return gd_iterations_df,b,theta
# NOTE(review): with alpha=0.001 each step closes only a small fraction of the
# remaining gap, so 200 steps is unlikely to be enough for the intercept to reach
# its optimum — check that the cost curve has flattened before treating the
# result as converged.
gd_iterations_df,b,theta=run_gradient_descent(X,Y,alpha=0.001,num_iterations=200)

In [33]:
gd_iterations_df

In [34]:
%matplotlib inline
plt.plot(gd_iterations_df['iteration'],gd_iterations_df['cost'])
plt.xlabel("Number of iterations")
plt.ylabel("MSE");

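The MSE is still decreasing significantly after 175 iterations.

The run therefore ends with an MSE of 713.49 after 200 iterations. Because the cost is still falling at that point, treat this as the value reached at the end of the run rather than a confirmed global minimum of the linear regression cost; more iterations or a larger learning rate would be needed to verify convergence.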

The fitted equation, where x1–x5 are the standardized features in the column order of X, is: y = 12.837271267213714 + 0.36573071(x1) + 1.7400131(x2) + 2.507124069(x3) + 1.77342638(x4) + 2.20128138(x5)