4 Steps in WebScraping :

-->Sending a HTTP GET request to the url of the webpage by using requests library

-->Fetching and Parsing the data using BeautifulSoup and maintain the data in some dicts/lists

-->Analyzing the HTML tags and their attributes

-->Output the data in any file format(.csv,.xlsx,.json)

In [None]:
#!pip install requests

In [None]:
#!pip install bs4

In [1]:
import requests
import bs4
#help(bs4)

In [2]:
from bs4 import BeautifulSoup
#connect to the url
url = "https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city"
data = requests.get(url)
data

<Response [200]>

In [3]:
#This is where BeautifulSoup comes into action by getting the data
soup = BeautifulSoup(data.content,'html')
#soup

In [4]:
#print(soup.prettify())

In [5]:
#extracting Property name
soup.find('div',attrs={'class':'title-line'}).text

'2 BHK Apartment in Himaja Yukta Avenue'

In [6]:
soup.find('a',attrs={'class':'typelink'}).span.text.strip()

'2'

In [7]:
soup.find('span',attrs={'itemprop':'addressLocality'}).text

'Kesarapalli'

In [8]:
#find the area and total price
soup.find('div',attrs={'data-type':'price-link'}).text.strip()

'35.83 L'

In [9]:
soup.find('td',attrs={'class':'size'}).text.strip()

'1175'

In [10]:
soup.find('td',attrs={'class':'val'}).text

'Ready to move'

In [11]:
soup.find('ul',attrs={'class':'listing-details'}).text

'2 - 3 years oldNew '

In [12]:
#now we will extract details from the entire first page
#find_all()
#soup.find_all('div',attrs={'class':'title-line'}).text

In [13]:
#as find_all() to be used for multiple entries we go for iterations
a = soup.find_all('div',attrs={'class':'title-line'})
names = [] #this list will have property names
for i in a:
    #print(i.text)
    names.append(i.text)
print(names)
print(len(names))

['2 BHK Apartment in Himaja Yukta Avenue', '2 BHK Apartment in Himaja Yukta Avenue', '2 BHK Apartment in Himaja Yukta Avenue', '3 BHK Apartment', 'Residential Plot', '2 BHK Apartment', '2 BHK Independent House', '2 BHK Apartment in Hycon Elite', '3 BHK Apartment', '3 BHK Independent Floor', '3 BHK Apartment', '2 BHK Apartment', '2 BHK Independent House', '3 BHK Independent Floor', '6 BHK Independent House', 'Residential Plot in Harivillu Fortune Legendary', 'Residential Plot', 'Residential Plot', '3 BHK Apartment in Pooja Tree Storey', '3 BHK Apartment in Sri Naga Sun Rise Enclave']
20


In [14]:
b = soup.find_all('span',attrs={'itemprop':'addressLocality'})
#optimized way -->List Comprehension
#syntax is --> [exprsn for var in collection/function]
places = [i.text for i in b]
print(places)
print(len(places))

['Kesarapalli', 'Kesarapalli', 'Kesarapalli', 'Poranki', 'Kankipadu', 'Poranki', 'Kankipadu', 'Poranki', 'Gannavaram', 'Gunadala', 'Gollapudi', 'Enikepadu', 'Benz Circle', 'Vidhyadharpuram', 'Penamaluru', 'Kankipadu', 'Poranki', 'Kankipadu', 'Benz Circle', 'Gannavaram']
20


In [15]:
#extracting price 
c = soup.find_all('div',attrs={'data-type':'price-link'})
d = [i.text.strip() for i in c]
print(d)
print(len(d))

['35.83 L', '37.24 L', '36.32 L', '55.5 L', '15.5 L', '48 L', '50 L', '40 L', '59.86 L', '56 L', '89 L', '39 L', '59 L', '95 L', '1.1 Cr', '16.83 L', '33 L', '20 L', '1.66 Cr', '72 L']
20


In [16]:
#Now we need to remove 'L' and 'Cr' from the above price
#As prices are in Lakhs and Crores we start removing them
Price = []
for i in c:
    i = i.text.strip()
    if "Cr" in i:
        i = i.replace(" Cr",'') #replacing with empty string
        i = float(i) * 100 #converting into lakhs
    else:
        i = i.replace(" L","")
        i = float(i)
    Price.append(i)
print(Price)
print(len(Price))

[35.83, 37.24, 36.32, 55.5, 15.5, 48.0, 50.0, 40.0, 59.86, 56.0, 89.0, 39.0, 59.0, 95.0, 110.00000000000001, 16.83, 33.0, 20.0, 166.0, 72.0]
20


In [17]:
d = soup.find_all('td',attrs={'class':'size'})
area = [i.text.strip() for i in d]
print(area)
print(len(area))

['1175', '1221', '1191', '1587', '1560', '1450', '1290', '1200', '1663', '1500', '2062', '1250', '1350', '1600', '3700', '1782', '1800', '1500', '2317', '1650']
20


In [18]:
#Getting the house type along with information about bathrooms/facing
property_types = ['Apartment','Builder Floor','Villa',
                  'Residential Plot','Independent House',
                  'Independent Floor','Studio Apartment']
f = soup.find_all('div',attrs={'class':'title-line'})
Type = []
for i in f:
    i = i.text
    for p_type in property_types:
        if p_type in i:
            Type.append(p_type)
print(Type)
print(len(Type))  

['Apartment', 'Apartment', 'Apartment', 'Apartment', 'Residential Plot', 'Apartment', 'Independent House', 'Apartment', 'Apartment', 'Independent Floor', 'Apartment', 'Apartment', 'Independent House', 'Independent Floor', 'Independent House', 'Residential Plot', 'Residential Plot', 'Residential Plot', 'Apartment', 'Apartment']
20


In [19]:
#Now for the number of BHK we will replace Residential plot with '0'
f = soup.find_all('div',attrs={'class':'title-line'})
Rooms = []
for i in f:
    i = i.span.text
    #print(i)
    i = i.replace("Residential Plot",'0').replace(' ','')
    Rooms.append(i)
print(Rooms)
print(len(Rooms))

['2', '2', '2', '3', '0', '2', '2', '2', '3', '3', '3', '2', '2', '3', '6', '0', '0', '0', '3', '3']
20


In [20]:
#checking status type
s = soup.find_all('td',attrs={'class':'val'})
status = [i.text for i in s]
print(status)
print(len(status))

['Ready to move', 'Ready to move', 'Ready to move', 'Ready to move', 'Resale', 'Ready to move', 'Under Construction', 'Ready to move', 'Ready to move', 'Ready to move', 'Ready to move', 'Ready to move', 'Ready to move', 'Under Construction', 'Ready to move', 'Resale', 'Resale', 'Resale', 'Under Construction', 'Ready to move']
20


In [21]:
#details of the property
d = soup.find_all('ul',attrs={'class':'listing-details'})
details = [i.text for i in d]
print(details)
print(len(details))

['2 - 3 years oldNew ', '2 - 3 years oldNew ', '2 - 3 years oldNew ', '3 BathroomsResale West facing', '7 - 8 years old2 open sidesEast facing', '2 BathroomsResale ', '2 BathroomsResale NorthEast facing', '2 BathroomsResale ', '3 BathroomsResale East facing', '3 BathroomsResale ', '3 BathroomsResale ', '2 BathroomsResale ', '2 BathroomsResale East facing', '3 BathroomsResale West facing', '7 - 8 years old7 BathroomsResale ', '1 - 2 years old', '1 open sidesWest facing', '7 - 8 years old', '3 BathroomsNew East facing', '2 - 3 years old3 BathroomsNew ']
20


In [22]:
#So we finally separate Bathrooms and Type of Facing listings
#Regular Expressions -->re 
import re #pattern matching
d = soup.find_all('ul',attrs={'class':'listing-details'})
Bathrooms = []
Facing = []
for i in d:
    i = i.text
    bathroom_count = re.findall(r'(\d+) Bathrooms',i)#\d -->digits
    if bathroom_count:
        Bathrooms.append(int(bathroom_count[0]))
    else:
        Bathrooms.append(0)
    facing_direction = re.findall(r'(North|South|East|West|NorthEast|NorthWest|SouthEast|SouthWest) facing',
                                  i)
    if facing_direction:
        Facing.append(facing_direction[0])
    else:
        Facing.append('None')
print(Facing)
print(Bathrooms)

['None', 'None', 'None', 'West', 'East', 'None', 'NorthEast', 'None', 'East', 'None', 'None', 'None', 'East', 'West', 'None', 'None', 'West', 'None', 'East', 'None']
[0, 0, 0, 3, 0, 2, 2, 2, 3, 3, 3, 2, 2, 3, 7, 0, 0, 0, 3, 3]


In [None]:
#now we will use the same above logic to extract for the entire page
#ipython -->Interactive Python
#!pip install ipython

In [None]:
base_url = "https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?page="
for i in range(1,41):
    url = base_url+str(i)
    print(url)

In [23]:
#pip install ipython
from IPython.display import clear_output
import time

## Scraping from multiple pages from the website

In [24]:
Bedrooms = [];Bathrooms = []
Location = [];Price = []
Size = [];Status = []
Facing = [];Type = []

property_types = ['Apartment', 'Independent House', 
                  'Independent Floor',
                  'Villa', 'Studio Apartment', 
                  'Residential Plot', 'Builder Floor']

for i in range(1, 54):
  url = f'https://www.makaan.com/vijayawada-residential-property/buy-property-in-vijayawada-city?propertyType=apartment,builder-floor,villa,residential-plot,independent-house,studio-apartment&page={i}'
  print(f'Scraping Info from Page --> {i}')
  clear_output(wait=True)
  time.sleep(3) #waiting time to move from one page to other

  d = requests.get(url)
  soup = BeautifulSoup(d.content, 'html')

  a = soup.find_all('a', attrs={'class':'typelink'})
  for i in a:
    i = i.span.text #you need only number which is present in start
    i = i.replace('Residential Plot', '0').replace(' ', '')
    Bedrooms.append(i)

  b = soup.find_all('span', attrs={'itemprop':'addressLocality'})
  for i in b:
    Location.append(i.text)

  c = soup.find_all('div', attrs={'data-type':'price-link'})
  for i in c:
    i = i.text.strip()
    if 'Cr' in i:
      i = i.replace(' Cr', '')
      i = float(i) * 100 #converting into Lakhs
    else:
      i = i.replace(' L', '')
      i = float(i)
    Price.append(i)
    
  #Extracting Area  
  d = soup.find_all('td', attrs={'class':'size'})
  for i in d:
    i = i.text.strip()
    Size.append(i)
    
  #Extracting construction status
  e = soup.find_all('td', attrs={'class':'val'})
  for i in e:
    i = i.text
    Status.append(i)
    
  #Extracting Bathrooms and Facing Type
  f = soup.find_all('ul', attrs={'class':'listing-details'})
  for i in f:
    i = i.text
    bathroom_count = re.findall(r'(\d+) Bathrooms', i)
    if bathroom_count:
      Bathrooms.append(int(bathroom_count[0]))
    else:
      Bathrooms.append('0')
    facing_direction = re.findall(r'(North|South|East|West|NorthEast|NorthWest|SouthEast|SouthWest) facing', i)
    if facing_direction:
      Facing.append(facing_direction[0])
    else:
      Facing.append('None')

  g = soup.find_all('div', attrs={'class':'title-line'})
  for i in g:
    i = i.text
    for p_type in property_types:
      if p_type in i:
        Type.append(p_type)

Scraping Info from Page --> 53


In [25]:
print(len(Bathrooms))
print(len(Bedrooms))
print(len(Status))
print(len(Size))
print(len(Location))
print(len(Price))
print(len(Facing))
print(len(Type))

1060
1060
1060
1060
1060
1060
1060
1070


In [27]:
#let's create a dataframe from above data
data_dict = {'Bedrooms':Bedrooms,
            'Bathrooms':Bathrooms,
            'Status':Status,
            'Size':Size,
            'Location':Location,
            'Price':Price,
            'Facing':Facing,
            'Type':Type}
#data_dict

In [30]:
#Now as we have different values in all columns we write a logic
#to create a dataframe keeping max_length as 1070 in above case
import pandas as pd
max_length = 1070 #dependng on your data scraped above

for key in data_dict.keys():
    data_dict[key] += [None] * (max_length - len(data_dict[key]))

data = pd.DataFrame(data_dict)
data
#data.isnull().sum()

Unnamed: 0,Bedrooms,Bathrooms,Status,Size,Location,Price,Facing,Type
0,2,0,Ready to move,1175,Kesarapalli,35.83,,Apartment
1,2,0,Ready to move,1221,Kesarapalli,37.24,,Apartment
2,2,0,Ready to move,1191,Kesarapalli,36.32,,Apartment
3,3,3,Ready to move,1587,Poranki,55.50,West,Apartment
4,0,0,Resale,1560,Kankipadu,15.50,East,Residential Plot
...,...,...,...,...,...,...,...,...
1065,,,,,,,,Residential Plot
1066,,,,,,,,Residential Plot
1067,,,,,,,,Residential Plot
1068,,,,,,,,Residential Plot


In [31]:
data.isnull().sum()

Bedrooms     10
Bathrooms    10
Status       10
Size         10
Location     10
Price        10
Facing       10
Type          0
dtype: int64

In [32]:
#we have created missing values in all other columns so we drop those values
data.dropna(inplace=True)

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1060 entries, 0 to 1059
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Bedrooms   1060 non-null   object 
 1   Bathrooms  1060 non-null   object 
 2   Status     1060 non-null   object 
 3   Size       1060 non-null   object 
 4   Location   1060 non-null   object 
 5   Price      1060 non-null   float64
 6   Facing     1060 non-null   object 
 7   Type       1060 non-null   object 
dtypes: float64(1), object(7)
memory usage: 74.5+ KB


In [41]:
#once we get the final data we perform descriptive statistical information and then check for 
#outliers
type(data)
#data.describe()
#data.describe(include="all")
data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Bedrooms,1060.0,7.0,0,729.0,,,,,,,
Bathrooms,1060.0,17.0,0,736.0,,,,,,,
Status,1060.0,4.0,New,699.0,,,,,,,
Size,1060.0,280.0,1080,65.0,,,,,,,
Location,1060.0,86.0,Edupugallu,317.0,,,,,,,
Price,1060.0,,,,50.627217,57.971419,1.95,25.2,35.56,54.36,570.0
Facing,1060.0,9.0,,438.0,,,,,,,
Type,1060.0,6.0,Residential Plot,719.0,,,,,,,


In [42]:
#before getting into removal of outliers let's have a 
#complete overview of all columns using plotly
import plotly.express as px
data.columns

Index(['Bedrooms', 'Bathrooms', 'Status', 'Size', 'Location', 'Price',
       'Facing', 'Type'],
      dtype='object')

In [44]:
#px.box(data) #As every column is having different type of data
#we need to convert those categorical values to numerical values

In [47]:
data['Status'].unique()

array(['Ready to move', 'Resale', 'Under Construction', 'New'],
      dtype=object)

In [49]:
#data['Location'].unique()
data['Location'].nunique()

86

In [50]:
data['Facing'].unique()

array(['None', 'West', 'East', 'NorthEast', 'North', 'South', 'SouthWest',
       'NorthWest', 'SouthEast'], dtype=object)

In [51]:
data['Type'].unique()

array(['Apartment', 'Residential Plot', 'Independent House',
       'Independent Floor', 'Villa', 'Studio Apartment'], dtype=object)

In [52]:
#we use sklearn for feature encoding
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

In [53]:
data['Location'] = label.fit_transform(data['Location'])

In [57]:
names = ['Saketh','Codegnan','Obaid','Kesava']
marks = [85,100,65,75]
d = dict(zip(names,marks))
d

{'Saketh': 85, 'Codegnan': 100, 'Obaid': 65, 'Kesava': 75}

In [59]:
#data['Location']

In [60]:
data['Location'] = label.fit_transform(data['Location'])
encoded_to_original = dict(zip(label.transform(label.classes_),
                               label.classes_))

for encoded_label, original_value in encoded_to_original.items():
  print(f'{encoded_label} - {original_value}')

0 - AGIRIPALLI
1 - Ajit Singh Nagar
2 - Andhra Prabha Colony Road
3 - Ashok Nagar
4 - Auto Nagar
5 - Ayyappa Nagar
6 - Bandar Road
7 - Benz Circle
8 - Bharathi Nagar
9 - Bhavanipuram
10 - Chennai Vijayawada Highway
11 - Currency Nagar
12 - Devi Nagar
13 - Edupugallu
14 - Enikepadu
15 - G Konduru
16 - Gandhi Nagar
17 - Gannavaram
18 - Gollapudi
19 - Gollapudi1
20 - Gosala
21 - Governor Peta
22 - Gudavalli
23 - Gunadala
24 - Guntupalli
25 - Guru Nanak Colony
26 - Ibrahimpatnam
27 - Jaggayyapet
28 - Kanchikacherla
29 - Kandrika
30 - Kanigiri Gurunadham Street
31 - Kankipadu
32 - Kanuru
33 - Kesarapalli
34 - LIC Colony
35 - Labbipet
36 - Madhuranagar
37 - Mangalagiri
38 - Milk Factory Road
39 - Moghalrajpuram
40 - Murali Nagar 2nd Cross Road
41 - Mylavaram
42 - MylavaramKuntamukkalaVellaturuVijayawada Road
43 - Nandigama
44 - Nidamanuru
45 - Nunna
46 - Nuzividu
47 - Nuzvid Road
48 - Nuzvid To Vijayawada Road
49 - PNT Colony
50 - Pamarru
51 - Patamata
52 - Payakapuram
53 - Pedapulipaka Tadi

In [61]:
data['Status'] = label.fit_transform(data['Status'])
encoded_to_original = dict(zip(label.transform(label.classes_),
                               label.classes_))

for encoded_label, original_value in encoded_to_original.items():
  print(f'{encoded_label} - {original_value}')

0 - New
1 - Ready to move
2 - Resale
3 - Under Construction


In [62]:
data['Facing'] = label.fit_transform(data['Facing'])
encoded_to_original = dict(zip(label.transform(label.classes_),
                               label.classes_))

for encoded_label, original_value in encoded_to_original.items():
  print(f'{encoded_label} - {original_value}')

0 - East
1 - None
2 - North
3 - NorthEast
4 - NorthWest
5 - South
6 - SouthEast
7 - SouthWest
8 - West


In [63]:
data['Type'] = label.fit_transform(data['Type'])
encoded_to_original = dict(zip(label.transform(label.classes_),
                               label.classes_))

for encoded_label, original_value in encoded_to_original.items():
  print(f'{encoded_label} - {original_value}')

0 - Apartment
1 - Independent Floor
2 - Independent House
3 - Residential Plot
4 - Studio Apartment
5 - Villa


In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1060 entries, 0 to 1059
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Bedrooms   1060 non-null   object 
 1   Bathrooms  1060 non-null   object 
 2   Status     1060 non-null   int32  
 3   Size       1060 non-null   object 
 4   Location   1060 non-null   int32  
 5   Price      1060 non-null   float64
 6   Facing     1060 non-null   int32  
 7   Type       1060 non-null   int32  
dtypes: float64(1), int32(4), object(3)
memory usage: 58.0+ KB


In [99]:
#Converting object dtype into int dtype
data['Bathrooms'] = data['Bathrooms'].astype('int64')
data['Bedrooms'] = data['Bedrooms'].astype('int64')
data['Size'] = data['Size'].astype('int64')

In [100]:
px.box(data)

In [72]:
#data.info()
px.box(data,x='Facing')

In [101]:
#Removing Outliers from the dataframe -->we will create a final dataframe
def remove_outliers(df, threshold=3):
    df_cleaned = df.copy() #copy of dataframe
    #The threshold value determines how far from the first and third quartiles a data point must be to be considered an outlier
    for col in df_cleaned.select_dtypes(include=['int64', 
                                               'float64']).columns:
        Q1 = df_cleaned[col].quantile(0.25) #25th percentile
        Q3 = df_cleaned[col].quantile(0.75) #75th percentile
        IQR = Q3 - Q1 #InterQuantile Range
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        #Filter the DataFrame to keep only the rows where the 
        #column values are within the calculated bounds, 
        #effectively removing outliers
        df_cleaned = df_cleaned[
            (df_cleaned[col] >= lower_bound) & 
            (df_cleaned[col] <= upper_bound)
        ]
    return df_cleaned

In [81]:
def remove_outliers(df, threshold=3):
    df_cleaned = df.copy()

    total_outliers = 0  # Initialize a variable to count total outliers.

    for col in df_cleaned.select_dtypes(include=['int64', 'float64']).columns:
        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)

        IQR = Q3 - Q1

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        outliers = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)

        total_outliers += outliers.sum()  # Count outliers in the current column.

        df_cleaned = df_cleaned[~outliers]

    percentage_outliers = (total_outliers / df.shape[0]) * 100  # Calculate the percentage of outliers.

    return df_cleaned, percentage_outliers


In [102]:
df = remove_outliers(data)
df

Unnamed: 0,Bedrooms,Bathrooms,Status,Size,Location,Price,Facing,Type
0,2,0,1,1175,33,35.83,1,0
1,2,0,1,1221,33,37.24,1,0
2,2,0,1,1191,33,36.32,1,0
3,3,3,1,1587,55,55.50,8,0
4,0,0,2,1560,31,15.50,0,3
...,...,...,...,...,...,...,...,...
1053,0,0,2,1314,2,45.26,8,3
1055,0,0,2,1728,13,30.72,0,3
1056,0,0,0,25,7,8.00,1,3
1057,0,0,2,1287,31,14.00,1,3


In [114]:
#We will create a csv file from the above data
#df.info()
df.to_csv("House.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 935 entries, 0 to 1058
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Bedrooms   935 non-null    int64  
 1   Bathrooms  935 non-null    int64  
 2   Status     935 non-null    int32  
 3   Size       935 non-null    int64  
 4   Location   935 non-null    int32  
 5   Price      935 non-null    float64
 6   Facing     935 non-null    int32  
 7   Type       935 non-null    int32  
dtypes: float64(1), int32(4), int64(3)
memory usage: 51.1 KB


In [96]:
df['Bathrooms'] = df['Bathrooms'].astype('int64')

In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 935 entries, 0 to 1058
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Bedrooms   935 non-null    int64  
 1   Bathrooms  935 non-null    int64  
 2   Status     935 non-null    int32  
 3   Size       935 non-null    int64  
 4   Location   935 non-null    int32  
 5   Price      935 non-null    float64
 6   Facing     935 non-null    int32  
 7   Type       935 non-null    int32  
dtypes: float64(1), int32(4), int64(3)
memory usage: 51.1 KB


In [117]:
data = pd.read_csv('House.csv')
#data
data.columns

Index(['Unnamed: 0', 'Bedrooms', 'Bathrooms', 'Status', 'Size', 'Location',
       'Price', 'Facing', 'Type'],
      dtype='object')

In [118]:
data.drop(columns=['Unnamed: 0'],inplace=True)

In [119]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Bedrooms   935 non-null    int64  
 1   Bathrooms  935 non-null    int64  
 2   Status     935 non-null    int64  
 3   Size       935 non-null    int64  
 4   Location   935 non-null    int64  
 5   Price      935 non-null    float64
 6   Facing     935 non-null    int64  
 7   Type       935 non-null    int64  
dtypes: float64(1), int64(7)
memory usage: 58.6 KB


In [160]:
#We will get into Modelling 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [161]:
#divide our data into dependent and independent variables
x = data.drop(columns=['Price'])
y = data['Price']

In [162]:
#let's understand about splitting the data by understanding 
#below example
import numpy as np
a,b = np.arange(10).reshape(5,2),range(5)

In [163]:
#print(a)
#print(b)

In [133]:
w,x,y,z=train_test_split(a,b) #as there is no consistency in data
print(w)
print(x)
print(y)
print(z)

[[2 3]
 [8 9]
 [4 5]]
[[0 1]
 [6 7]]
[1, 4, 2]
[0, 3]


In [136]:
w,x,y,z=train_test_split(a,b,random_state=1) # now there is consistency in data
print(w)
print(x)
print(y)
print(z)

[[8 9]
 [0 1]
 [6 7]]
[[4 5]
 [2 3]]
[4, 0, 3]
[2, 1]


In [164]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,
                                                random_state=3)

In [165]:
print(len(x_train))
print(len(x_test))

654
281


In [143]:
#x_train

In [166]:
#model creation
model = LinearRegression()

In [167]:
#fit the data to the model(training data)
model.fit(x_train,y_train)

LinearRegression()

In [168]:
#we will check the performance metrics -->R-SQuared value,mean_squared erro
from sklearn.metrics import mean_squared_error,r2_score

In [169]:
#before checking the scores let's evaluate the coefficients 
#print(dir(model))
linear_coeffs = pd.DataFrame({'Features':x.columns,
                             'Coefficients':model.coef_})
#linear_coeffs

In [170]:
#checking for predictions and performance metrics
y_pred = model.predict(x_test)

In [171]:
print("Linear Regression Metrics:")
print("R-Squared value is",r2_score(y_test,y_pred))
print("Mean Squared Error is",mean_squared_error(y_test,y_pred))

Linear Regression Metrics:
R-Squared value is 0.6202750302170772
Mean Squared Error is 242.45139702292877


In [172]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Bedrooms   935 non-null    int64  
 1   Bathrooms  935 non-null    int64  
 2   Status     935 non-null    int64  
 3   Size       935 non-null    int64  
 4   Location   935 non-null    int64  
 5   Price      935 non-null    float64
 6   Facing     935 non-null    int64  
 7   Type       935 non-null    int64  
dtypes: float64(1), int64(7)
memory usage: 58.6 KB


In [179]:
#let's pass some inputs and validate it then will create pickle file
#from our finetuned model
bed = int(input('Enter no of bedrooms:'))
bath = int(input('Enter no of bathrooms:'))
loc = int(input('Enter Location:'))
size = int(input('Enter size:'))
status = int(input('Enter status:'))
facing = int(input('Enter facing:'))
Type = int(input('Enter Type:'))

Enter no of bedrooms:2
Enter no of bathrooms:2
Enter Location:0
Enter size:1200
Enter status:1
Enter facing:1
Enter Type:0


In [180]:
#now we will test for the above data
import numpy as np
input_data = np.array([[bed,bath,loc,size,status,facing,Type]])
result = model.predict(input_data)[0]
print(f'House Price Predicted is {result} Lakhs')

House Price Predicted is 43.31681065071236 Lakhs



X does not have valid feature names, but LinearRegression was fitted with feature names



In [181]:
#we will create a pickle file for our finally trained model
import pickle

In [182]:
#open() function for creating our new files and also reading 
with open("House.pkl",'wb') as f:
    pickle.dump(model,f)

In [186]:
#we will use our pickle file for predictions
predicted_model = pickle.load(open('House.pkl','rb'))

In [187]:
predicted_model.predict(input_data)


X does not have valid feature names, but LinearRegression was fitted with feature names



array([43.31681065])

In [188]:
#No Free Lunch Theorem -->No one model perfectly fits the data
#Finally we will present our model-->we can use flask as webservice -->AWS,Pythonanywhere
#House Price Prediction-->  Streamlit directly
!pip install streamlit

Collecting streamlit


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Obtaining dependency information for streamlit from https://files.pythonhosted.org/packages/df/e2/e1c8e3abce0e819a4ef3c6a5af4ddbb1eadbca0cf2ebd5e58853edcd58bd/streamlit-1.28.1-py2.py3-none-any.whl.metadata
  Downloading streamlit-1.28.1-py2.py3-none-any.whl.metadata (8.1 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Obtaining dependency information for altair<6,>=4.0 from https://files.pythonhosted.org/packages/17/16/b12fca347ff9d062e3c44ad9641d2ec50364570a059f3078ada3a5119d7a/altair-5.1.2-py3-none-any.whl.metadata
  Downloading altair-5.1.2-py3-none-any.whl.metadata (8.6 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Obtaining dependency information for blinker<2,>=1.0.0 from https://files.pythonhosted.org/packages/fa/2a/7f3714cbc6356a0efec525ce7a0613d581072ed6eb53eb7b9754f33db807/blinker-1.7.0-py3-none-any.whl.metadata
  Using cached blinker-1.7.0-py3-none-any.whl.metadata (1.9 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Obtaining dependency information for cachet