# PARSING FROM 'KRISHA.KZ'

### 0 step. Import all libraries

In [None]:
#-----------------------------

import pandas as pd 
import numpy as np
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import time 
import re

### 1 step. Open Chrome with Selenium and load the page

In [None]:
#-----------------------------

# Listing page to scrape.
url = "https://krisha.kz/prodazha/kvartiry/kokshetau/"

# Start a Chrome session through Selenium and navigate to the listing page.
driver = webdriver.Chrome()
driver.get(url)

### 2 step. Parse all the necessary links and street names from the ads

In [None]:
#-----------------------------

from selenium.common.exceptions import NoSuchElementException, WebDriverException

# Accumulators filled across all result pages: the href of every ad card title
# and the text of every ad card subtitle (used later as the street/address).
links = []
streets = []

while True:
    # Parse whatever page the browser is currently showing.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # `find_all` is the current BeautifulSoup spelling of the legacy `findAll`.
    links.extend(
        anchor.get('href')
        for anchor in soup.find_all("a", class_="a-card__title")
    )
    print(f"Number of links: {len(links)}")

    streets.extend(
        subtitle.get_text(strip=True)
        for subtitle in soup.find_all("div", class_="a-card__subtitle")
    )
    # Report the street count itself; this line used to print len(links) again.
    print(f"Number of streets: {len(streets)}")

    # Move to the next result page; stop when there is no "Дальше" link left.
    # Only Selenium errors end the loop, so Ctrl+C is no longer swallowed and
    # misreported as a missing button.
    try:
        driver.find_element(By.LINK_TEXT, "Дальше").click()
    except NoSuchElementException:
        print("Button 'Дальше' not found, loop is stopped")
        break
    except WebDriverException as exc:
        print(f"Button 'Дальше' could not be clicked ({type(exc).__name__}), loop is stopped")
        break

print('Links and streets are ready!')

### 3 step. Remove unnecessary words from the list of streets and append "Кокшетау" to each street

In [None]:
#-----------------------------

def _normalize_street(raw_street):
    """Turn one scraped subtitle into a '<Street>, Кокшетау' string.

    Lower-cases the text, removes the micro-district abbreviations, keeps only
    the part before ' —', capitalizes it and appends the city name.
    """
    text = raw_street.lower()
    # Removal order matters: 'мкр.' must go before 'мкр' and 'м.'.
    for abbreviation in ('мкр.', 'мкр', 'м-н', 'м.'):
        text = text.replace(abbreviation, '')
    address_part = text.strip().split(' —')[0]
    return address_part.capitalize() + ', Кокшетау'


cleaned_streets = list(map(_normalize_street, streets))
cleaned_streets

### 4 step. Find the coordinates (latitude, longitude) of the streets parsed in step 2 and check whether they are in Kokshetau

In [None]:
#-----------------------------

from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="kokshetau_mapper")

# One record per entry of `cleaned_streets`, in the same order. Streets that
# cannot be geocoded keep latitude/longitude = None.
coordinates = []

for street in cleaned_streets:
    # Start from "not found"; overwritten only after a successful lookup.
    latitude = longitude = None
    try:
        location = geolocator.geocode(street)
        if location:
            latitude, longitude = location.latitude, location.longitude
            print(f"Street: {street}, latitude: {latitude}, longitude: {longitude}")
        else:
            print(f"We can't find coordinates for {street}")
    except Exception as e:
        # Best effort: a failed lookup must not abort the whole run.
        print(f"Fail to {street}: {e}")

    # Append exactly once per street. Appending inside the try block could add
    # a second record if something failed after the first append, which would
    # shift every following row.
    coordinates.append({
        "street": street,
        "latitude": latitude,
        "longitude": longitude
    })

    # Pause one second between requests (presumably to respect the geocoder's
    # rate limit; confirm against the service's usage policy).
    time.sleep(1)
    
def check_city(latitude, longitude):
    """Return True when the point lies inside the city's bounding box.

    The box is latitude 53.25..53.35 and longitude 69.35..69.45, with both
    bounds inclusive.

    Args:
        latitude: Latitude in decimal degrees, or None when unknown.
        longitude: Longitude in decimal degrees, or None when unknown.

    Returns:
        bool: True if both coordinates fall inside the box. False otherwise,
        including when either coordinate is None or NaN.
    """
    min_lat, max_lat = 53.25, 53.35
    min_lon, max_lon = 69.35, 69.45

    # A missing coordinate cannot be inside the box; comparing None with a
    # float would raise TypeError.
    if latitude is None or longitude is None:
        return False

    # NaN fails every comparison, so it yields False as well.
    return bool(
        min_lat <= latitude <= max_lat and min_lon <= longitude <= max_lon
    )
# Tabulate the geocoding records: one row per street with its coordinates.
df1 = pd.DataFrame(coordinates)


def _row_is_in_city(row):
    """Run check_city on the latitude/longitude of a single df1 row."""
    return check_city(row['latitude'], row['longitude'])


# Row-wise flag: does the geocoded point pass check_city?
df1['is_in_kokshetau'] = df1.apply(_row_is_in_city, axis=1)

print(df1)

### 5 step. Parse all the necessary data from the ads via their links

In [None]:
# Seconds to wait for an ad page before giving up; requests has no default timeout.
_REQUEST_TIMEOUT = 30


def _short_info(soup, data_name):
    """Return the raw text of a short-info value, or None when it is absent.

    Looks up ``<div data-name=data_name>`` and, inside it, the
    ``div.offer__advert-short-info`` element that holds the value.
    """
    container = soup.find('div', {'data-name': data_name})
    if container is None:
        return None
    value = container.find('div', class_="offer__advert-short-info")
    return None if value is None else value.text


def _dd_value(soup, data_name):
    """Return the stripped ``<dd>`` text after ``<dt data-name=data_name>``, or None."""
    term = soup.find('dt', {'data-name': data_name})
    if term is None:
        return None
    definition = term.find_next_sibling('dd')
    return None if definition is None else definition.text.strip()


def _or_nan(value):
    """Map a missing (None) value to NaN and pass everything else through."""
    return np.nan if value is None else value


def _stripped_or_nan(text):
    """Strip surrounding whitespace, mapping a missing (None) text to NaN."""
    return np.nan if text is None else text.strip()


def _parse_ad(soup):
    """Extract one row of values, in df2 column order, from a parsed ad page.

    Every field that cannot be found or parsed becomes NaN, so the returned
    list always has one entry per df2 column.
    """
    # Price: keep only the digits of the price text.
    price_tag = soup.find('div', class_="offer__price")
    if price_tag is None:
        clean_price = np.nan
    else:
        clean_price = ''.join(char for char in price_tag.text if char.isdigit())

    # Room count: taken from the "<N>-комнатная" part of the page heading.
    heading = soup.find("h1")
    room_match = None
    if heading is not None:
        room_match = re.search(r"(\d+)-комнатная", heading.text)
    room = int(room_match.group(1)) if room_match else np.nan

    # Area: drop the unit; when the kitchen area is appended, keep only the
    # part before the first comma.
    area_text = _short_info(soup, 'live.square')
    if area_text is None:
        area_text = np.nan
    else:
        area_text = area_text.replace(' м²', '').strip()
        if 'Площадь кухни' in area_text:
            area_text = area_text.split(',')[0].strip()

    flat_toilet_text = _stripped_or_nan(_short_info(soup, 'flat.toilet'))
    balcony_text = _stripped_or_nan(_short_info(soup, 'flat.balcony'))

    # Floor: "<current> из <total>"; anything else leaves both values NaN.
    current_floor = total_floors = np.nan
    floor_text = _short_info(soup, 'flat.floor')
    if floor_text is not None and ' из ' in floor_text.strip():
        try:
            current_floor, total_floors = map(
                int, floor_text.strip().split(' из ')[0:2]
            )
        except ValueError:
            current_floor = total_floors = np.nan

    # Ceiling height: first number (optionally with a decimal part) in the text.
    ceiling_text = _dd_value(soup, 'ceiling')
    ceiling_match = None
    if ceiling_text is not None:
        ceiling_match = re.search(r'\d+(\.\d+)?', ceiling_text)
    clean_ceiling = float(ceiling_match.group()) if ceiling_match else np.nan

    mortgage_tag = soup.find('div', class_='offer__parameters-mortgaged')
    mortgage_text = np.nan if mortgage_tag is None else mortgage_tag.text.strip()

    year_text = _stripped_or_nan(_short_info(soup, 'house.year'))

    # Building type: only the first line of the value.
    type_home_text = _short_info(soup, 'flat.building')
    if type_home_text is None:
        type_home_text = np.nan
    else:
        type_home_text = type_home_text.strip().split('\n')[0]

    dorm_text = _or_nan(_dd_value(soup, 'flat.priv_dorm'))
    condition_text = _stripped_or_nan(_short_info(soup, 'flat.renovation'))
    # NOTE(review): the 'repair_status' column is read from the
    # 'live.furniture' field; confirm this mapping is intended.
    repair_status_text = _or_nan(_dd_value(soup, 'live.furniture'))
    type_of_floor_text = _or_nan(_dd_value(soup, 'flat.flooring'))

    return [
        clean_price, room, area_text, flat_toilet_text, balcony_text,
        current_floor, total_floors, clean_ceiling, dorm_text, mortgage_text,
        year_text, type_home_text, condition_text, repair_status_text,
        type_of_floor_text
    ]


df2 = pd.DataFrame(columns=[
    'price', 'room', 'area', 'flat_toilets', 'balcony', 'current_floors',
    'total_floors', 'ceiling', 'dorm', 'mortgage', 'year', 'type_of_house',
    'condition', 'repair_status', 'type_of_floor'
])

count = 0

# Iterate over `links` from step 2. The previous name `links_part1` is not
# defined anywhere in this notebook, and step 6 joins df2 with df1 row by row,
# so there must be one df2 row per collected link.
for link in links:
    try:
        page_source = requests.get("https://krisha.kz/" + link, timeout=_REQUEST_TIMEOUT)
        html = page_source.text
    except requests.RequestException as exc:
        # Keep going: an empty page yields an all-NaN row, which preserves the
        # row order for this link.
        print(f"Request failed for {link}: {exc}")
        html = ''

    row = _parse_ad(BeautifulSoup(html, 'html.parser'))
    print(row)
    # Append at the end of df2. The previous `len(df)` used a frame that does
    # not exist until step 6.
    df2.loc[len(df2)] = row

    count += 1
    print(f"Scraped objects: {count}")

df2


### 6 step. Combine the coordinate data and the apartment data into one table

In [None]:
#-----------------------------

# Put the scraped ad fields (df2) and the geocoding results (df1) side by side;
# pandas aligns the two frames on their row index.
df = pd.concat(objs=[df2, df1], axis="columns")

### 7 step. Save all the scraped data to CSV and text files

In [None]:
#-----------------------------

# Dump the collected ad links, one per line.
with open('links_krisha_kokshetau.txt', 'w') as links_file:
    links_file.writelines(link + '\n' for link in links)

# Persist each table: coordinates only, raw ad data only, and the combined frame.
df1.to_csv(path_or_buf='coordinate_apartment_kokshetau.csv')
df2.to_csv(path_or_buf='dirty_krisha_data.csv')
df.to_csv(path_or_buf='dirty_krisha_data_with_coord_apartmnt.csv')
