<a href="https://colab.research.google.com/github/vard-uhi/Yerevan-Apartment-Price-Prediction/blob/master/Web_Scraping_Yerevan_Apartment_Price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Web scraping of the real estate company website www.akcern.am to build a dataset for apartment price prediction**


***Getting all links***

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import csv
import more_itertools


In [None]:
#build the URL of every search-result page (page numbers 1 through 175)
links = [
    f'https://akcern.am/en/realty/search-{page_number}.html'
    '?type=sale&property_type%5B1%5D=1&street='
    '&from_floor=&to_floor=&from_price=&to_price='
    '&rate=USD&from_area=&to_area=&code=&vidocode='
    for page_number in range(1, 176)
]

In [None]:
#getting each apartment link from every page
def get_apartment_links(url, delay=4, timeout=30):
    """Return the absolute URLs of all apartments listed on one search page.

    Parameters
    ----------
    url : str
        Address of a search-result page.
    delay : int or float, optional
        Seconds to pause after the request so the site is not hammered
        (default 4, the value that was previously hard-coded).
    timeout : int or float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        One 'https://akcern.am...' URL per ``<a class="address">`` element
        that carries an ``href``. An empty list is returned when the page
        cannot be fetched or contains no such anchors.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as error:
        # Best effort: one unreachable page must not abort the whole crawl.
        print(f'Skipping {url}: {error}')
        return []
    finally:
        # Pause after every request, successful or not.
        time.sleep(delay)

    page = BeautifulSoup(response.text, "html.parser")
    anchors = page.find_all('a', class_='address')

    # Anchors without an href are skipped; concatenating None would raise TypeError.
    return ['https://akcern.am' + anchor.get('href')
            for anchor in anchors
            if anchor.get('href')]

In [None]:
#fetch every search page in order; the result is a list with one inner list of apartment links per page
all_apartment_links = list(map(get_apartment_links, links))

In [None]:
#preview only the first two pages' links instead of printing the whole nested list
all_apartment_links[:2]

[['https://akcern.am/en/realty/sale/73374.html',
  'https://akcern.am/en/realty/sale/73375.html',
  'https://akcern.am/en/realty/sale/73376.html',
  'https://akcern.am/en/realty/sale/73377.html',
  'https://akcern.am/en/realty/sale/73378.html',
  'https://akcern.am/en/realty/sale/73384.html',
  'https://akcern.am/en/realty/sale/73386.html',
  'https://akcern.am/en/realty/sale/73390.html',
  'https://akcern.am/en/realty/sale/73391.html',
  'https://akcern.am/en/realty/sale/73392.html',
  'https://akcern.am/en/realty/sale/73394.html',
  'https://akcern.am/en/realty/sale/73397.html',
  'https://akcern.am/en/realty/sale/73403.html',
  'https://akcern.am/en/realty/sale/73415.html',
  'https://akcern.am/en/realty/sale/73416.html'],
 ['https://akcern.am/en/realty/sale/73417.html',
  'https://akcern.am/en/realty/sale/73418.html',
  'https://akcern.am/en/realty/sale/73420.html',
  'https://akcern.am/en/realty/sale/73421.html',
  'https://akcern.am/en/realty/sale/73423.html',
  'https://akcern.a

In [None]:
#flatten the per-page lists into one general list of apartment links
all_apartment_links = [*more_itertools.collapse(all_apartment_links)]

In [None]:
#preview the first 20 links of the flattened list rather than printing all of them
all_apartment_links[:20]

['https://akcern.am/en/realty/sale/73374.html',
 'https://akcern.am/en/realty/sale/73375.html',
 'https://akcern.am/en/realty/sale/73376.html',
 'https://akcern.am/en/realty/sale/73377.html',
 'https://akcern.am/en/realty/sale/73378.html',
 'https://akcern.am/en/realty/sale/73384.html',
 'https://akcern.am/en/realty/sale/73386.html',
 'https://akcern.am/en/realty/sale/73390.html',
 'https://akcern.am/en/realty/sale/73391.html',
 'https://akcern.am/en/realty/sale/73392.html',
 'https://akcern.am/en/realty/sale/73394.html',
 'https://akcern.am/en/realty/sale/73397.html',
 'https://akcern.am/en/realty/sale/73403.html',
 'https://akcern.am/en/realty/sale/73415.html',
 'https://akcern.am/en/realty/sale/73416.html',
 'https://akcern.am/en/realty/sale/73417.html',
 'https://akcern.am/en/realty/sale/73418.html',
 'https://akcern.am/en/realty/sale/73420.html',
 'https://akcern.am/en/realty/sale/73421.html',
 'https://akcern.am/en/realty/sale/73423.html',
 'https://akcern.am/en/realty/sale/73424

***Defining the features of interest***

In [None]:
#function to get all features of interest from the description of an apartment
def get_all_features(url, delay=4, timeout=30):
    """Scrape the listed attributes of a single apartment page.

    Parameters
    ----------
    url : str
        Address of one apartment listing.
    delay : int or float, optional
        Seconds to pause after the request (default 4).
    timeout : int or float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple
        ``(title, price, currency, total_area, room_count, flour, condition,
        building_type, balcony_type)``. Values are the text found on the
        page; ``condition`` and ``balcony_type`` keep only the last
        space-separated word, and ``building_type`` is the *list* of the
        last two space-separated words of its field.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched or the server answers with an error status.
    AttributeError
        If an expected element is missing from the page (``find`` returned None).
    ValueError
        If the price text does not split into exactly two space-separated parts.
    """
    response = requests.get(url, timeout=timeout)
    # This used to read `time = 4`, which only bound a local variable and
    # never paused; actually sleep so consecutive requests are throttled.
    time.sleep(delay)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    title = page.find('span', class_='desc col-xs-12 col-sm-12 col-md-12 col-lg-12').get_text()
    price_text = page.find('span', id='price_count').text.strip()
    price, currency = price_text.split(' ')
    total_area = page.find("span", title="Total area").get_text()
    room_count = page.find('span', title='Count of rooms').get_text()
    floor = page.find('span', title='Floor').get_text()
    condition = page.find('span', title='Condition').text.strip().split(' ')[-1]
    building_type = page.find('span', title='Building type').text.strip().split(' ')[-2:]
    # NOTE(review): the title attribute really is the literal 'balcony_label'
    # (looks like an untranslated label key on the site) -- confirm against the page.
    balcony_type = page.find('span', title='balcony_label').text.strip().split(' ')[-1]

    return title, price, currency, total_area, room_count, floor, condition, building_type, balcony_type

***Getting the data and storing it as a DataFrame***

In [None]:
#iterate over all links, get mentioned features and store them in the list
apartment_scraped_data = []
failed_apartment_links = []  # listings that could not be scraped, kept for inspection
for apartment_url in all_apartment_links:
    try:
        apartment_scraped_data.append(get_all_features(apartment_url))
    except Exception:
        # Best effort: skip listings that fail to download or parse.
        # Catching Exception (not a bare except) keeps the long crawl
        # interruptible with Ctrl-C / "Interrupt kernel".
        failed_apartment_links.append(apartment_url)

print(f'Scraped {len(apartment_scraped_data)} apartments, skipped {len(failed_apartment_links)}')

In [None]:
#turn the list of scraped tuples into a DataFrame with one named column per feature
apartment_scraped_data = pd.DataFrame(
    apartment_scraped_data,
    columns=[
        "title",
        "price",
        "currency",
        "total_area",
        "room_count",
        "flour",
        "condition",
        "building_type",
        "balcony_type",
    ],
)

In [None]:
#display up to the first 30 rows of the scraped DataFrame as a quick sanity check of the result
apartment_scraped_data.head(30)

Unnamed: 0,title,price,currency,total_area,room_count,flour,condition,building_type,balcony_type
0,For sale Apartment on Artsakh in Erebuni,52000,USD,66 m2,Count of rooms 2,Floor 4,repair,"[type, High-rise]",Large
1,For sale Apartment on Jrashat in Downtown,130000,USD,90 m2,Count of rooms 3,Floor 2,repair,"[type, Stone]",Large
2,For sale Apartment on Y.Koghbatsi in Downtown,80000,USD,42 m2,Count of rooms 1,Floor 3,Repairing,"[type, High-rise]",Large
3,For sale Apartment on Khorenatsi in Downtown,55000,USD,37 m2,Count of rooms 1,Floor 9,3-5y,"[type, High-rise]",Lodgia
4,For sale Apartment on Lvovyan in Nor Nork,47000,USD,63 m2,Count of rooms 2,Floor 8,repair,"[type, High-rise]",Large
5,For sale Apartment on Erebuni in Erebuni,30000,USD,30 m2,Count of rooms 1,Floor 5,Original,"[type, Stone]",Front
6,For sale Apartment on Davtashen 1 district in ...,82000,USD,97 m2,Count of rooms 3,Floor 4,Repairing,"[type, High-rise]",Large
7,For sale Apartment on Nar Dos in Downtown,80000,USD,88 m2,Count of rooms 3,Floor Б,repair,"[type, High-rise]",Lodgia
8,For sale Apartment on Leninakan in Ajapnyak,43000,USD,43 m2,Count of rooms 1,Floor 2,2y.,"[type, Khrushchev(2,75м)]",Large
9,For sale Apartment on Avan Charents in Avan,65000,USD,82 m2,Count of rooms 3,Floor 5,repair,"[type, High-rise]",Large


***Save as CSV and download***

In [None]:
#save DataFrame to csv and download file for further analysis
csv_filename = 'Yerevan_apartment_primary_data.csv'

# Write the file first so the data is saved even if the download step is unavailable.
apartment_scraped_data.to_csv(csv_filename, index=False)

try:
    from google.colab import files
except ImportError:
    # google.colab cannot be imported here, so there is no browser download;
    # the CSV written above is left where to_csv saved it.
    print(f'google.colab is not available; data saved to {csv_filename}')
else:
    files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>