In [3]:
import os
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from google.cloud import dataplex
import gspread
from oauth2client.service_account import ServiceAccountCredentials

from google.cloud import storage


# Matches a four-digit year in the range 1900-2029.
_YEAR_RE = re.compile(r"(?:[1][9][0-9][0-9]|[2][0][0-2][0-9])")


def _text_or_null(tag):
    """Return the stripped text of *tag*, or "NULL" when *tag* is None."""
    return tag.text.strip() if tag is not None else "NULL"


def _nth_text_or_null(tags, index):
    """Return the stripped text of ``tags[index]``, or "NULL" if it does not exist."""
    return tags[index].text.strip() if len(tags) > index else "NULL"


def getInfoFromPage(url, timeout=30):
    """Download one listing page and extract its fields.

    Every field that cannot be found on the page is reported as the string
    "NULL" instead of raising.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A 12-tuple ``(title, city, square, rooms, floor, builtIn, updateTime,
        uploadTime, price, pricePerMeter, description, url)`` of strings.

    Raises:
        requests.RequestException: If the page cannot be downloaded
            (including a timeout).
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    # The first four <b class="parameters__value"> elements are read by position.
    bold_values = soup.find_all("b", "parameters__value")

    title = _text_or_null(soup.find("h1", "sticker__title"))

    city = _nth_text_or_null(bold_values, 0)
    if city != "NULL":
        # Collapse runs of spaces left over from the page layout.
        city = re.sub(" +", " ", city)

    square = _nth_text_or_null(bold_values, 1)
    rooms = _nth_text_or_null(bold_values, 2)
    floor = _nth_text_or_null(bold_values, 3)

    # The build year has no fixed position: take the first year-like number
    # found in any parameter, but only when at least five parameters exist.
    builtIn = "NULL"
    if len(bold_values) >= 5:
        for tag in bold_values:
            year = _YEAR_RE.search(tag.text)
            if year is not None:
                builtIn = year.group(0)
                break

    div_values = soup.find_all("div", "parameters__value")
    updateTime = _nth_text_or_null(div_values, 0)
    # Index 1 needs two elements; the previous ">= 1" guard raised IndexError
    # on pages with a single <div class="parameters__value">.
    uploadTime = _nth_text_or_null(div_values, 1)

    price_tag = soup.find("span", "priceInfo__value")
    if price_tag is not None:
        price = re.sub(" +", " ", price_tag.text.replace("\n", '').strip())
    else:
        price = "NULL"

    pricePerMeter = _text_or_null(soup.find("span", "priceInfo__additional"))

    description_tag = soup.find("div", "description__container")
    if description_tag is not None:
        description = description_tag.text.replace("\n", '').replace("\r", '').strip()
    else:
        description = "NULL"

    return (title, city, square, rooms, floor, builtIn, updateTime, uploadTime, price, pricePerMeter, description, url)

def getLinksFromPage(url, timeout=30):
    """Download one search-result page and return the listing links on it.

    Args:
        url: Address of the search-result page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list with the ``href`` of each ``a.teaserLink`` found inside a
        ``div.listing__teaserWrapper``, in page order.

    Raises:
        requests.RequestException: If the page cannot be downloaded
            (including a timeout).
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    links = []
    for wrapper in soup.find_all("div", "listing__teaserWrapper"):
        anchor = wrapper.find('a', "teaserLink")
        # A wrapper without a teaser anchor, or an anchor without an href,
        # would fail on subscripting; skip it instead of aborting the scrape.
        if anchor is not None and anchor.has_attr('href'):
            links.append(anchor['href'])
    return links

def getMaxPage(url):
    """Fetch *url* and return the result of ``find('a', "")`` on the parsed page.

    NOTE(review): the empty class filter looks like an unfinished selector for
    the pagination link - confirm the intended class name before relying on
    this function.
    """
    response = requests.get(url)
    parsed = BeautifulSoup(response.content, "html.parser")
    anchor = parsed.find('a', "")
    return anchor

# Result table: one row per listing, columns in the order returned by
# getInfoFromPage.
df = pd.DataFrame(
    columns=[
        'Title',
        'City',
        'Square',
        'Rooms',
        'Floor',
        'Built In',
        'Update time',
        'Upload time',
        'Price',
        'Price per meter',
        'Description',
        'Link',
    ]
)

links = []
URL = "https://gratka.pl/nieruchomosci/mieszkania?page="
MaxPage = 4

# Collect listing links from result pages 1 .. MaxPage - 1 (range end is exclusive).
for i in range(1, MaxPage):
    links.extend(getLinksFromPage(URL + str(i)))
    print(f"Loading page {i}/{MaxPage - 1}")

# Scrape every listing and store it as row n of the table.
for n, link in enumerate(links):
    df.loc[n] = getInfoFromPage(link)
    print(f"Getting info from page {n + 1}/{len(links)}")

# Save as CSV
df.to_csv(r"output1.csv", index=True, header=True)
print("Saved to output1.csv file")

# Convert to JSON (one object per row) and save
df_json = df.to_json(orient='records')
with open('output1.json', 'w') as json_file:
    json_file.write(df_json)
print("Saved to output1.json file")


def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket and print a confirmation.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Name of the blob to create in the bucket.
    """
    # A new client is created on every call.
    client = storage.Client()
    target_blob = client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(f'File {source_file_name} uploaded to {destination_blob_name} in {bucket_name}.')

# Name of the CSV file written by the export step earlier in this script.
source_file_name = 'output1.csv'

#############
# Google Cloud Storage setup

# Set the credentials path before the client below is created.
# NOTE(review): 'key.json' is resolved relative to the working directory - confirm.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'key.json'

# Module-level client; upload_to_bucket() uses it as a global.
storage_client = storage.Client()
bucket_name = 'gratka_bucket'

# Check whether the bucket exists; if it does not, create a new one.
bucket = storage_client.bucket(bucket_name)
if not bucket.exists():
    bucket = storage_client.create_bucket(bucket, location='EU')

def upload_to_bucket(blob_name, file_path, bucket_name):
    """Upload the file at *file_path* to *bucket_name* as blob *blob_name*.

    Uses the module-level ``storage_client``.

    Returns:
        True once the upload call has returned.
    """
    target_bucket = storage_client.get_bucket(bucket_name)
    target_bucket.blob(blob_name).upload_from_filename(file_path)
    return True

# Upload the CSV written earlier in this script. It was saved under a name
# relative to the working directory, so resolve that same name instead of a
# machine-specific absolute path that may be missing or stale.
file_path = os.path.abspath(source_file_name)
upload_to_bucket('output1.csv', file_path, bucket_name)


Loading page 1/3
Loading page 2/3
Loading page 3/3
Getting info from page 1/96
Getting info from page 2/96
Getting info from page 3/96
Getting info from page 4/96
Getting info from page 5/96
Getting info from page 6/96
Getting info from page 7/96
Getting info from page 8/96
Getting info from page 9/96
Getting info from page 10/96
Getting info from page 11/96
Getting info from page 12/96
Getting info from page 13/96
Getting info from page 14/96
Getting info from page 15/96
Getting info from page 16/96
Getting info from page 17/96
Getting info from page 18/96
Getting info from page 19/96
Getting info from page 20/96
Getting info from page 21/96
Getting info from page 22/96
Getting info from page 23/96
Getting info from page 24/96
Getting info from page 25/96
Getting info from page 26/96
Getting info from page 27/96
Getting info from page 28/96
Getting info from page 29/96
Getting info from page 30/96
Getting info from page 31/96
Getting info from page 32/96
Getting info from page 33/96
G

True