# Web Scraping of Zimbabwe Housing - 8 March 2023

In [58]:
#importing our libraries

from bs4 import BeautifulSoup 
import pandas as pd 
import requests

## 1. Starting off by building code that scrapes one page. Let's name it SCRP

In [59]:
# Fetch the first results page of the listings and parse it into `soup`.

url = "https://www.classifieds.co.zw/zimbabwe-houses-for-sale?page=1"

# NOTE(review): verify=False turns off TLS certificate verification. It is kept
# because the original request relied on it; re-enable verification if the
# site's certificate validates.
# The timeout stops the cell from hanging forever if the server never answers.
page = requests.get(url, verify=False, timeout=30)

# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

In [60]:
# Collect every <div> with the class name below into the `sections` variable
listing_class = "details col-md-7 col-sm-7 col-xs-8"
sections = soup.find_all('div', class_=listing_class)

### 1.1 Starting with the basics:
#### The code below uses a for loop to scrape all HTML sections on page 1 that have the tag 'div' and the class 'details col-md-7 col-sm-7 col-xs-8'

In [61]:
# From each section we extract the title, price, area and location.
# Title and price have their own tags; the remaining details are all
# <li class="property"> items, read here by position from the end of the list.

data = []
for subs in sections:

    title = subs.find('h5', class_='listing-title').text.strip()
    price = subs.find('div', class_='pull-left usd-price-tooltip').text.strip()

    features = subs.find_all('li', class_='property')

    # Index the list once (no loop over it is needed) and only when both
    # entries exist: a listing with fewer than two properties would otherwise
    # raise, or silently reuse the previous listing's area and location.
    if len(features) >= 2:
        # NOTE(review): the second-to-last item is not always an area -- the
        # recorded output of this cell contains '1 bathrooms' in this slot.
        area = features[-2].text
        location = features[-1].text
    else:
        area = " "
        location = " "

    # Storing the information into variable data
    data.append([title, price, area, location])

data

[['Greystone Park - House', '$320,000', '4292 m²', 'harare north'],
 ['New Marlborough - House', '$140,000', '2000 m²', 'harare west'],
 ['Chitungwiza - House', '$30,000', '1 bathrooms', 'chitungwiza'],
 ['Malindela - House', '$110,000', '2839 m²', 'bulawayo south'],
 ['Madokero - House', '$110,000', '400 m²', 'harare west'],
 ['Greendale - House', '$230,000', '2600 m²', 'harare east'],
 ['Vainona - House', '$320,000', '2200 m²', 'harare north'],
 ['Glen Lorne - House', '$195,000', '4000 m²', 'harare north'],
 ['Hatfield - House', '$115,000', '2506 m²', 'harare south'],
 ['Arlington - House', '$130,000', '320 m²', 'harare south'],
 ['Borrowdale - House', '$380,000', '3000 m²', 'harare north'],
 ['Burnside - House', '$70,000', '4000 m²', 'bulawayo east'],
 ['Hatfield - House', '$130,000', '4133 m²', 'harare south'],
 ['Chitungwiza - House', '$35,000', '400 m²', 'chitungwiza'],
 ['Helensvale - House', '$650,000', '4695 m²', 'harare north'],
 ['Mabvuku - House', '$27,000', '300 m²', 'hara

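#### Our SCRP code works: as shown above, it gives us a list of the features of the houses as they are listed. Note that the area slot does not always hold an area (see the `'1 bathrooms'` entry for the first Chitungwiza listing).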

## 2. Modifying SCRP to scrape more pages, or a defined number of pages, from the site

In [67]:
# Ask the reader how many pages to scrape and keep the answer as an integer in `stop`

prompt = 'How many pages do you want to scrap\n\n'
stop = int(input(prompt))

How many pages do you want to scrap

160


In [68]:
# Scrape pages 1..stop of the listings and collect one
# [title, price, area, location] row per listing in `data`.
# The website now has 3,503 pages as it has been updated; it had 3,494 on
# 8 March 2023. For now we scrape 160 pages.

url_base = "https://www.classifieds.co.zw/zimbabwe-houses-for-sale?page="

# Creating an empty list to later store our data
data = []

# Page numbers that could not be fetched, so gaps in the data stay visible
failed_pages = []

# The number of pages scraped is set by `stop`; x is the page number
for x in range(1, stop + 1):
    new_url_base = f"{url_base}{x}"

    # NOTE(review): verify=False turns off TLS certificate verification. It is
    # kept because the original request relied on it; re-enable verification if
    # the site's certificate validates.
    # The timeout keeps one unresponsive page from stalling the whole run, and
    # catching the request error keeps the rows already collected.
    try:
        web = requests.get(new_url_base, verify=False, timeout=30)
    except requests.RequestException as err:
        print(f"Page {x}: request failed ({err})")
        failed_pages.append(x)
        continue

    # A status code of 200 means the request was successful, so we scrape the
    # page; anything else is recorded rather than skipped silently.
    if web.status_code != 200:
        print(f"Page {x}: unexpected status code {web.status_code}")
        failed_pages.append(x)
        continue

    web_page = BeautifulSoup(web.content, 'html.parser')

    # All listing sections of the page
    listings = web_page.find_all('div', class_="details col-md-7 col-sm-7 col-xs-8")

    for listing in listings:
        # Title and price each live in their own tag. find() returns None when
        # a tag is absent, so fall back to a blank (as done for area/location
        # below) instead of crashing the whole run on one odd listing.
        title_tag = listing.find('h5', class_='listing-title')
        price_tag = listing.find('div', class_='pull-left usd-price-tooltip')
        title = title_tag.text.strip() if title_tag is not None else " "
        price = price_tag.text.strip() if price_tag is not None else " "

        # Location, area, number of bedrooms and bathrooms all share the tag
        # 'li' and the class name 'property'. Some listings have no number of
        # bathrooms, so only area size and location are kept, read from the
        # end of the list.
        features = listing.find_all('li', class_='property')

        # Property features are expected to be at least 2 (location and area);
        # if that is not the case we leave those fields empty.
        if len(features) >= 2:
            area = features[-2].text
            location = features[-1].text
        else:
            area = " "
            location = " "

        # Appending our information to the list data
        data.append([title, price, area, location])
        

In [69]:
# Put the scraped rows into a pandas DataFrame so they display as a table and are easier to analyse

column_names = ['Title', 'Price', 'Area', 'Location']
prop_prices = pd.DataFrame(data=data, columns=column_names)
prop_prices.shape

(3200, 4)

## 3. Data cleaning so that we can draw some inferences from our data

In [70]:
# Store the scraped table in a CSV file, leaving out the DataFrame index

csv_path = 'housing_data.csv'
prop_prices.to_csv(csv_path, index=False)

In [71]:
# Load the saved CSV back into `data` and show its last five rows
data = pd.read_csv(filepath_or_buffer='housing_data.csv')
data.tail()

Unnamed: 0,Title,Price,Area,Location
3195,"Borrowdale - House, House","$200,000",4000 m²,harare north
3196,Glen Lorne - House,"$450,000",5100 m²,harare north
3197,Tynwald - House,"$55,000",400 m²,harare west
3198,Southwood - House,"$45,000",1575 m²,kwekwe
3199,"Mandara - House, Townhouse Complex","$270,000",2000 m²,harare east


In [72]:
# Checking the shape of our data (rows, columns) to establish how many house listings we have
data.shape

(3200, 4)

In [None]:
# Checking for duplicates, as a way of validating that our code is not scraping the same page over and over

# Count the fully duplicated rows rather than listing a boolean for every row
data.duplicated().sum()

## 4. Data cleaning so that we can draw some inferences from our data