# Indian House Prediction 

### Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Loading the Dataset

In [2]:
df = pd.read_csv('house.csv')
df.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29451 entries, 0 to 29450
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   POSTED_BY              29451 non-null  object 
 1   UNDER_CONSTRUCTION     29451 non-null  int64  
 2   RERA                   29451 non-null  int64  
 3   BHK_NO.                29451 non-null  int64  
 4   BHK_OR_RK              29451 non-null  object 
 5   SQUARE_FT              29451 non-null  float64
 6   READY_TO_MOVE          29451 non-null  int64  
 7   RESALE                 29451 non-null  int64  
 8   ADDRESS                29451 non-null  object 
 9   LONGITUDE              29451 non-null  float64
 10  LATITUDE               29451 non-null  float64
 11  TARGET(PRICE_IN_LACS)  29451 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 2.7+ MB


In [4]:
df.shape

(29451, 12)

### Dropping Columns

In [5]:
df.drop(columns = ['BHK_OR_RK', 'RERA', 'LONGITUDE', 'LATITUDE'], inplace=True)

In [6]:
df['ADDRESS'] = df['ADDRESS'].str.split(",").map(lambda x : x[-1])
df.rename(columns = {"ADDRESS" : "CITY"}, inplace=True)
df.tail()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,CITY,TARGET(PRICE_IN_LACS)
29446,Owner,0,3,2500.0,1,1,Agra,45.0
29447,Owner,0,2,769.230769,1,1,Vapi,16.0
29448,Dealer,0,2,1022.641509,1,1,Jaipur,27.1
29449,Owner,0,2,927.079009,1,1,Chennai,67.0
29450,Dealer,0,2,896.774194,1,1,Jaipur,27.8


In [7]:
df['POSTED_BY'].value_counts()

Dealer     18291
Owner      10538
Builder      622
Name: POSTED_BY, dtype: int64

In [8]:
df['POSTED_BY'] = df["POSTED_BY"].replace({"Owner":0, "Dealer":1, "Builder":2})
df.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,CITY,TARGET(PRICE_IN_LACS)
0,0,0,2,1300.236407,1,1,Bangalore,55.0
1,1,0,2,1275.0,1,1,Mysore,51.0
2,0,0,2,933.159722,1,1,Bangalore,43.0
3,0,0,2,929.921143,1,1,Ghaziabad,62.5
4,1,1,2,999.009247,0,1,Kolkata,60.5


In [9]:
df['CITY'].value_counts()

Bangalore      4340
Lalitpur       2993
Mumbai         2023
Pune           1991
Noida          1767
               ... 
Jagdalpur         1
Hajipur           1
Pudukkottai       1
Darbhanga         1
Hathras           1
Name: CITY, Length: 256, dtype: int64

### Encoding Categorical Data

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["CITY"] = le.fit_transform(df['CITY'])
df.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,CITY,TARGET(PRICE_IN_LACS)
0,0,0,2,1300.236407,1,1,21,55.0
1,1,0,2,1275.0,1,1,160,51.0
2,0,0,2,933.159722,1,1,21,43.0
3,0,0,2,929.921143,1,1,78,62.5
4,1,1,2,999.009247,0,1,134,60.5


### Finding Null Values

In [11]:
df.isnull().sum()

POSTED_BY                0
UNDER_CONSTRUCTION       0
BHK_NO.                  0
SQUARE_FT                0
READY_TO_MOVE            0
RESALE                   0
CITY                     0
TARGET(PRICE_IN_LACS)    0
dtype: int64

### Splitting the dataset into the Training set and Test set

In [12]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training the Random Forest Regression model on the Training set

In [15]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

### Predicting the Test set results

In [16]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 70.85 100.  ]
 [ 37.5   36.5 ]
 [ 49.01  80.  ]
 ...
 [ 33.5   32.  ]
 [ 29.15  15.  ]
 [ 75.39  67.5 ]]


### Evaluating the Model Performance

In [17]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9276224484406391

### Manual Testing

In [18]:
regressor.predict([[0.66, -0.47, -0.44, -0.01, 0.47, 0.27, 0.37]])

array([21.85])