In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.linear_model import LinearRegression

In [4]:
# Ordinary least-squares regressor; it is fitted further down on the training split.
model = LinearRegression()

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.metrics import mean_squared_error,r2_score

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
import matplotlib.pyplot as plt

In [9]:
# Load the raw listings from a CSV given by a relative path, then display them.
df = pd.read_csv('Banglore Housing Prices.csv')
df

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00
2,Uttarahalli,3 BHK,1440,2.0,62.00
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00
4,Kothanur,2 BHK,1200,2.0,51.00
...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00
13316,Richards Town,4 BHK,3600,5.0,400.00
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00


In [10]:
# Count missing values per column before any cleaning.
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [11]:
# Drop every row that has at least one missing value. Rebinding the name
# (instead of inplace=True) keeps the step chainable and leaves no frame
# half-mutated if the cell is interrupted.
df = df.dropna()

In [12]:
# Confirm that no missing values remain after the drop.
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [13]:
# Display the frame after rows with missing values were removed.
df

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00
2,Uttarahalli,3 BHK,1440,2.0,62.00
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00
4,Kothanur,2 BHK,1200,2.0,51.00
...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00
13316,Richards Town,4 BHK,3600,5.0,400.00
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00


In [14]:
# Keep only the leading number of entries such as "2 BHK" or "4 Bedroom":
# split on whitespace, take the first token, and cast it to int.
df['size'] = df['size'].str.split().str.get(0).astype(int)

In [15]:
# Display the frame to check the converted size column.
df

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056,2.0,39.07
1,Chikka Tirupathi,4,2600,5.0,120.00
2,Uttarahalli,3,1440,2.0,62.00
3,Lingadheeranahalli,3,1521,3.0,95.00
4,Kothanur,2,1200,2.0,51.00
...,...,...,...,...,...
13315,Whitefield,5,3453,4.0,231.00
13316,Richards Town,4,3600,5.0,400.00
13317,Raja Rajeshwari Nagar,2,1141,2.0,60.00
13318,Padmanabhanagar,4,4689,4.0,488.00


In [16]:
def convert_sqft(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Supported inputs:

    * a plain number, e.g. ``"1056"`` -> ``1056.0``;
    * a range written as ``"low - high"``, which is replaced by the midpoint
      of its two ends, e.g. ``"2100 - 2850"`` -> ``2475.0``;
    * a value that is already numeric, which is passed through ``float()``.

    Parameters
    ----------
    x : str or number
        The raw cell value.

    Returns
    -------
    float or None
        The parsed area, or ``None`` when the entry cannot be interpreted
        (for example when it carries a unit suffix such as "Sq. Meter").
        The function never raises for unparseable input.
    """
    if not isinstance(x, str):
        # Non-string cells (already numeric, or missing) have no .split();
        # let float() decide whether they are usable.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    text = x.strip()
    low, dash, high = text.partition('-')
    # Treat the entry as a range only when there is exactly one dash.
    if dash and '-' not in high:
        try:
            return (float(low) + float(high)) / 2
        except ValueError:
            # Not a numeric range (e.g. "1000 - 1200 Sq. Yards" or "1e-5");
            # fall through and try the entry as a single number.
            pass
    try:
        return float(text)
    except ValueError:
        return None
# Normalise total_sqft element by element; entries convert_sqft cannot parse come back as missing.
df['total_sqft'] = df['total_sqft'].apply(convert_sqft)

In [17]:
# Display the frame to check the converted total_sqft column.
df

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056.0,2.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,120.00
2,Uttarahalli,3,1440.0,2.0,62.00
3,Lingadheeranahalli,3,1521.0,3.0,95.00
4,Kothanur,2,1200.0,2.0,51.00
...,...,...,...,...,...
13315,Whitefield,5,3453.0,4.0,231.00
13316,Richards Town,4,3600.0,5.0,400.00
13317,Raja Rajeshwari Nagar,2,1141.0,2.0,60.00
13318,Padmanabhanagar,4,4689.0,4.0,488.00


In [18]:
# Derived ratio of price to area; it is used below only for the outlier filter.
df['price_per_sqft'] = df['price'].div(df['total_sqft'])

In [19]:
# Display the frame with the new price_per_sqft column.
df

Unnamed: 0,location,size,total_sqft,bath,price,price_per_sqft
0,Electronic City Phase II,2,1056.0,2.0,39.07,0.036998
1,Chikka Tirupathi,4,2600.0,5.0,120.00,0.046154
2,Uttarahalli,3,1440.0,2.0,62.00,0.043056
3,Lingadheeranahalli,3,1521.0,3.0,95.00,0.062459
4,Kothanur,2,1200.0,2.0,51.00,0.042500
...,...,...,...,...,...,...
13315,Whitefield,5,3453.0,4.0,231.00,0.066898
13316,Richards Town,4,3600.0,5.0,400.00,0.111111
13317,Raja Rajeshwari Nagar,2,1141.0,2.0,60.00,0.052585
13318,Padmanabhanagar,4,4689.0,4.0,488.00,0.104073


In [20]:
def detect_outliers(df, column, k=1.5):
    """Return the rows of ``df`` whose ``column`` value is not an IQR outlier.

    Despite its name, the function filters rather than flags: it keeps rows
    lying inside the fences ``[Q1 - k*IQR, Q3 + k*IQR]`` (both ends
    inclusive) and discards the rest. Rows where ``column`` is missing are
    discarded too, because a missing value is never inside the fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to test.
    k : float, default 1.5
        Fence multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, with the original index
        labels preserved.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    within_fences = values.between(lower_bound, upper_bound)
    # Hand back a real copy so later in-place edits by the caller neither
    # touch the input frame nor trigger SettingWithCopyWarning.
    return df[within_fences].copy()

In [21]:
# Keep only rows whose price_per_sqft lies inside the 1.5*IQR fences.
df = detect_outliers(df, 'price_per_sqft')

In [22]:
# Apply the same IQR filter to size, with quartiles computed on the already-filtered rows.
df = detect_outliers(df,'size')

In [23]:
# Re-check for missing values after the range filters (comparisons with NaN are
# False, so rows lacking price_per_sqft would not have passed them).
df.isnull().sum()

location          0
size              0
total_sqft        0
bath              0
price             0
price_per_sqft    0
dtype: int64

In [24]:
# Defensive drop of any remaining rows with missing values. Rebind rather than
# mutate in place: df comes from a filtered selection at this point.
df = df.dropna()

In [25]:
# Final confirmation that the modelling frame has no missing values.
df.isnull().sum()

location          0
size              0
total_sqft        0
bath              0
price             0
price_per_sqft    0
dtype: int64

In [26]:
# Feature matrix: only size and total_sqft are used as predictors.
X = df[['size','total_sqft']]

In [27]:
# Regression target.
y = df['price']

In [28]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

In [29]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,size,total_sqft
249,3,2420.0
2486,1,296.0
3906,2,1050.0
8712,3,1530.0
3222,2,1290.0


In [30]:
# Peek at the first rows of the held-out features.
X_test.head()

Unnamed: 0,size,total_sqft
2791,3,1525.0
1145,3,1590.0
1241,2,1408.0
1636,2,1100.0
1688,2,1150.0


In [31]:
# Display the training targets.
y_train

249     185.00
2486     22.89
3906     52.10
8712     40.00
3222     87.00
         ...  
398     120.00
95       90.00
9308     54.20
8052     50.00
6531    280.00
Name: price, Length: 8618, dtype: float64

In [32]:
# Display the held-out targets.
y_test

2791      96.00
1145      49.00
1241      85.00
1636      50.00
1688      46.00
          ...  
11696     76.00
11621     80.00
4785     118.00
10434     40.45
8385      35.00
Name: price, Length: 2873, dtype: float64

In [33]:
# Fit the linear model on the training split.
model.fit(X_train,y_train)

In [34]:
# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

In [35]:
# Display the predictions.
y_pred

array([ 99.7645926 , 102.50066552,  68.88261826, ..., 117.06499216,
        94.50291391,   5.84126373], shape=(2873,))

In [36]:
# Mean squared error of the predictions on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [37]:
# Report the held-out mean squared error (in squared units of price).
print("Mean Square Error is :",mse)

Mean Square Error is : 3727.474688641433


In [38]:
# Coefficient of determination on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [39]:
# NOTE(review): R² is the coefficient of determination (share of variance explained),
# not a classification accuracy; the "(Accuracy)" label in the message is misleading.
print("R² Score (Accuracy):", r2)

R² Score (Accuracy): 0.2829107896038109
