In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [2]:
# URL of the website to scrape
URL = 'https://www.property24.co.ke/property-to-rent-in-nairobi-c1890'

# Custom headers to mimic a browser visit
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5'
}

# Fetching HTML content from the URL
response = requests.get(URL, headers=HEADERS)
html_content = response.content


In [3]:
def updated_scrape_property_listings(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    listings = soup.find_all('div', attrs={'class': "pull-left sc_listingTileContent"})
    scraped_data = []
    for listing in listings:
        listing_data = {}
        price_info = listing.find('div', attrs={'class': "sc_listingTilePrice primaryColor"})
        if price_info:
            listing_data['price'] = price_info.get_text(strip=True)
        icons = listing.find_all('img', attrs={'src': "/Content/Images/blank.gif?z=693fc8"})
        for icon in icons:
            span = icon.find_next_sibling('span')
            if span:
                if 'property24generic_icon_beds' in icon['class']:
                    listing_data['bedrooms'] = span.text.strip()
                elif 'property24generic_icon_baths' in icon['class']:
                    listing_data['bathrooms'] = span.text.strip()
                elif 'property24generic_icon_parking' in icon['class']:
                    listing_data['parking'] = span.text.strip()
        type_info = listing.find('div', attrs={'class': "sc_listingTileArea"})
        if type_info:
            listing_data['type'] = type_info.get_text(strip=True)
        location_info = listing.find('div', attrs={'class': "sc_listingTileAddress primaryColor"})
        if location_info:
            listing_data['location'] = location_info.get_text(strip=True)
        scraped_data.append(listing_data)
    return scraped_data


In [4]:
# Assuming the HTML content has been successfully fetched from the URL
scraped_data = updated_scrape_property_listings(html_content)

# Convert to DataFrame
df = pd.DataFrame(scraped_data)

# Save to CSV
csv_file_path = 'scraped_data.csv'
df.to_csv(csv_file_path, index=False)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,price,type,location,bathrooms,bedrooms,parking
0,KSh 120 000Per Month,Commercial Property to Rent in Upper Hill,"Upper Hill, Nairobi",,,
1,KSh 110 000Per Month,2 Bedroom Apartment / Flat to Rent in Kileleshwa,"Gatundu close, Kileleshwa, Nairobi",,,
2,KSh 70 000Per Month,3 Bedroom Apartment / Flat to Rent in Kilimani,"Kilimani, Nairobi",,,
3,KSh 300 000Per Month,5 Bedroom House to Rent in Runda,"Runda, Nairobi",,,
4,KSh 300 000Per Month,3 Bedroom Apartment / Flat to Rent in Westlands,"Peponi Rd Nairobi, Westlands, Nairobi",,,


In [5]:
# Load the scraped data
scraped_data_df = pd.read_csv('scraped_data.csv')

# Step 1: Separate price and payment frequency
scraped_data_df['payment_frequency'] = scraped_data_df['price'].str.extract(r'(Per .*)')
scraped_data_df['price'] = scraped_data_df['price'].str.extract(r'(\d[\d\s]*\d)').replace(r'\s+', '', regex=True).astype(float)

# Step 2: Split the 'type' field
type_split = scraped_data_df['type'].str.extract(r'(\d+) Bedroom (.*) to Rent in (.*)')
scraped_data_df['bedrooms'] = type_split[0].astype(float)  # Overwriting the existing bedrooms column
scraped_data_df['sub_type'] = type_split[1]
scraped_data_df['locality'] = type_split[2]

# Assuming the category is always "For Rent" and type is extracted from 'sub_type'
scraped_data_df['category'] = 'For Rent'
scraped_data_df['type'] = scraped_data_df['sub_type'].str.split().str[0]  # Extracting the first word as type

# Step 3: Handle missing values
scraped_data_df.fillna({'bathrooms': 0, 'parking': 0}, inplace=True)

# Step 4: Add missing columns
scraped_data_df['toilets'] = scraped_data_df['bathrooms']  # Assuming toilets are same as bathrooms
scraped_data_df['furnished'] = 0  # Defaulting to not furnished
scraped_data_df['serviced'] = 0  # Defaulting to not serviced
scraped_data_df['shared'] = 0  # Defaulting to not shared
scraped_data_df['state'] = 'Nairobi'  # Assuming state is Nairobi for all listings
scraped_data_df['list_year'] = 2024  # Assuming the current year for all listings
scraped_data_df['list_month'] = 1  # Assuming the current month for all listings

# Reordering columns to match the analytical table
ordered_columns = ['price', 'bedrooms', 'bathrooms', 'toilets', 'furnished', 'serviced', 'shared', 'parking', 'category', 'type', 'sub_type', 'state', 'locality', 'list_year', 'list_month']
scraped_data_df = scraped_data_df[ordered_columns]

# Removing "/ Flat" from the 'sub_type' column
scraped_data_df['sub_type'] = scraped_data_df['sub_type'].str.replace(' / Flat', '', regex=False)

# Removing decimals from 'price','bedrooms', 'bathrooms', 'toilets', and 'parking'
scraped_data_df['price'] = scraped_data_df['price'].fillna(0).astype(int)
scraped_data_df[['bedrooms', 'bathrooms', 'toilets', 'parking']] = scraped_data_df[['bedrooms', 'bathrooms', 'toilets', 'parking']].fillna(0).astype(int)


# Saving the transformed DataFrame
scraped_data_df.to_csv('transformed_scraped_data.csv', index=False)

# Display the first few rows of the reformatted DataFrame
print(scraped_data_df.head())

    price  bedrooms  bathrooms  toilets  furnished  serviced  shared  parking  \
0  120000         0          0        0          0         0       0        0   
1  110000         2          0        0          0         0       0        0   
2   70000         3          0        0          0         0       0        0   
3  300000         5          0        0          0         0       0        0   
4  300000         3          0        0          0         0       0        0   

   category       type   sub_type    state    locality  list_year  list_month  
0  For Rent        NaN        NaN  Nairobi         NaN       2024           1  
1  For Rent  Apartment  Apartment  Nairobi  Kileleshwa       2024           1  
2  For Rent  Apartment  Apartment  Nairobi    Kilimani       2024           1  
3  For Rent      House      House  Nairobi       Runda       2024           1  
4  For Rent  Apartment  Apartment  Nairobi   Westlands       2024           1  


In [None]:

import requests
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
import os
import pickle

# Function to download the credentials file from a Google Drive link
def download_file_from_drive(link, destination):
    r = requests.get(link, allow_redirects=True)
    open(destination, 'wb').write(r.content)


# Function to authenticate and create a Google Drive service
def create_drive_service(credentials_path):
    SCOPES = ['https://www.googleapis.com/auth/drive']
    creds = None

    # Check for token.json (user's access and refresh tokens)
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)

    # If credentials are not valid or do not exist, start the OAuth flow
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
            auth_url, _ = flow.authorization_url()
            print('Please go to this URL and authorize the application:')
            print(auth_url)
            code = input('Enter the authorization code: ')
            flow.fetch_token(code=code)
            creds = flow.credentials
            # Save the credentials
            with open('token.json', 'w') as token:
                token.write(creds.to_json())

    return build('drive', 'v3', credentials=creds)





# Function to upload file to Google Drive
def upload_to_drive(service, filename, filepath, mimetype='text/plain'):
    file_metadata = {'name': filename}
    media = MediaFileUpload(filepath, mimetype=mimetype)
    file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    print('File ID: %s' % file.get('id'))


# download the credentials file
credentials_link = 'https://drive.google.com/uc?export=download&id=1PuJtJ7eBgtyIcCcIqYlx24V_mJOg7kWW'
download_file_from_drive(credentials_link, 'credentials.json')


# Create the Google Drive service
service = create_drive_service('credentials.json')

# upload a file to Google Drive
upload_to_drive(service, 'transformed_scraped data.csv', r'C:\Users\yahya\Downloads\transformed_scraped_data.csv', 'text/csv')



Please go to this URL and authorize the application:
https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=514958244075-pqom2h7kbtdq8p121pjfqf4qc4h9n20m.apps.googleusercontent.com&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=d5s3QMBZvXVwOQKNCgHvNXIfQbnLlf&access_type=offline
