### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### importing the data sets


In [2]:
df = pd.read_csv('housing_price_dataset.csv')

In [3]:
df.shape

(50000, 6)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SquareFeet    50000 non-null  int64  
 1   Bedrooms      50000 non-null  int64  
 2   Bathrooms     50000 non-null  int64  
 3   Neighborhood  50000 non-null  object 
 4   YearBuilt     50000 non-null  int64  
 5   Price         50000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.3+ MB


### Data pre processing


##### EDA

In [5]:
df.isna().sum()

SquareFeet      0
Bedrooms        0
Bathrooms       0
Neighborhood    0
YearBuilt       0
Price           0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


In [8]:

df["Neighborhood"] = df['Neighborhood'].map({'Rural':1, 'Suburb':2, 'Urban':3})
                                          

In [9]:
df['Neighborhood'].unique()

array([1, 2, 3], dtype=int64)

### Model Buillding

##### Splitting into input and output

In [10]:
# !pip install scikit-learn

In [11]:
from sklearn.model_selection import train_test_split


In [12]:
X = df.drop('Price',axis = 1)
y = df['Price']

###### spliting into training and testing data sets

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40000, 5), (10000, 5), (40000,), (10000,))

### Building the model

#### KNN

In [15]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score

In [16]:
knn = KNeighborsRegressor()

knn.fit(X_train,y_train)

y_pred = knn.predict(X_test)

######  model evaluation

In [17]:
np.sqrt(mean_squared_error(y_test,y_pred))



54247.93901533543

In [18]:
r2_score(y_test,y_pred)

0.48846054172419084

#### Linear regression

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:

lr = LinearRegression()

lr.fit(X_train,y_train)

y_pred = lr.predict(X_test)

In [21]:
print(np.sqrt(mean_squared_error(y_test,y_pred)),r2_score(y_test,y_pred))

49403.976927995944 0.5757357155494236


#### Support Vector Regression

In [22]:
from sklearn.svm import SVR

In [23]:
# sv = SVR()
# sv.fit(X_train,y_train)

In [24]:
y_pred = sv.predict(X_test)

NameError: name 'sv' is not defined

In [25]:
print(np.sqrt(mean_squared_error(y_test,y_pred)),r2_score(y_test,y_pred))

49403.976927995944 0.5757357155494236


#### Decision tree

In [26]:
from sklearn.tree import DecisionTreeRegressor

In [27]:

dt = DecisionTreeRegressor()

dt.fit(X_train,y_train)

y_pred = dt.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,y_pred)),r2_score(y_test,y_pred))

72368.83477136643 0.08963445394192815


In [28]:
lr.predict([[2500,6,2,3,1]])[0]



314508.6717056781

In [29]:
lr.coef_

array([  99.31308607, 5168.85659283, 2625.12603247,  752.26187692,
        -13.64933459])

In [30]:
lr.intercept_

27719.42860046134

In [31]:
import pickle


In [32]:
model = pickle.dump(lr,open("linear_reg.plk",'wb'))