In [1]:
import pandas as pd

In [2]:
df_listing_rent = pd.read_csv("raw_data/bayut_listing_rent.csv")
df_listing_rent_marina = pd.read_csv("raw_data/bayut_listing_rent_dubai_marina.csv")
df_listing_sales_apartments = pd.read_csv("raw_data/bayut_listing_sales_apartments.csv")
df_listing_sales_marina = pd.read_csv("raw_data/bayut_listing_sales_dubai_marina.csv")
df_listing_sales_townhouses = pd.read_csv("raw_data/bayut_listing_sales_townhouses.csv")
df_listing_sales_villas = pd.read_csv("raw_data/bayut_listing_sales_villas.csv")


In [3]:
df_transactions_sales_marina = pd.read_csv("raw_data/bayut_transactions_buy_dubai_marina_last12_20_page.csv")
df_transactions_sales_1 = pd.read_csv("raw_data/bayut_transactions_buy_last12_49_page.csv")
df_transactions_sales_2 = pd.read_csv("raw_data/bayut_transactions_buy_last12_50_page.csv")
df_transactions_rent_marina = pd.read_csv("raw_data/bayut_transactions_rent_dubai_marina_last12_20_page.csv")

In [4]:
df_listing_rent.shape

(168, 14)

In [None]:
df_transactions_rent_marina.head(1)

Unnamed: 0,START DATE,LOCATION,DURATION,Type,Beds,BEDS,AREA (SQFT),FLOOR
0,1 Jun\n2025,Dream Tower 2\nDubai Marina\nDream Towers,12 months\nRENEWAL,65000,Apartment,2,869,4


In [6]:
df_transactions_rent_marina = df_transactions_rent_marina.rename(columns={
    'Type': 'Price',  # Rename price column to match rent dataframe
    'Beds': 'Type'  # Rename area column to match rent dataframe
})


In [7]:
df_transactions_sales_marina.head(1)

Unnamed: 0,DATE,LOCATION,PRICE (AED),Type,Beds,BUILT-UP AREA,FLOOR
0,18 Apr\n2025,The Torch\nDubai Marina,"1,680,000\nVacant at time of sale",Apartment,2,1258,39


In [8]:
df_listing_rent_merged = pd.concat([df_listing_rent, df_listing_rent_marina], ignore_index=True)
df_listing_buy_merged = pd.concat([df_listing_sales_apartments, df_listing_sales_marina,df_listing_sales_townhouses,df_listing_sales_villas], ignore_index=True)

df_transactions_rent_merged = pd.concat([df_transactions_rent_marina], ignore_index=True)
df_transactions_buy_merged = pd.concat([df_transactions_sales_marina, df_transactions_sales_1,df_transactions_sales_2], ignore_index=True)

In [9]:
df_listing_rent_merged = df_listing_rent_merged.drop_duplicates(subset=['price','type','location'])
df_listing_buy_merged = df_listing_buy_merged.drop_duplicates(subset=['price','type','location'])

df_transactions_rent_merged = df_transactions_rent_merged.drop_duplicates(subset=['Type','LOCATION','FLOOR'])
df_transactions_buy_merged = df_transactions_buy_merged.drop_duplicates(subset=['Type','LOCATION','FLOOR'])

In [10]:
print("Listing of rents",df_listing_rent_merged.shape)
print("Listing of buy",df_listing_buy_merged.shape)

print("Transactions of rents",df_transactions_rent_merged.shape)
print("Transaction of buy",df_transactions_buy_merged.shape)

Listing of rents (299, 14)
Listing of buy (1468, 14)
Transactions of rents (356, 8)
Transaction of buy (928, 9)


In [11]:
df_transactions_rent_merged.head(1)

Unnamed: 0,START DATE,LOCATION,DURATION,Price,Type,BEDS,AREA (SQFT),FLOOR
0,1 Jun\n2025,Dream Tower 2\nDubai Marina\nDream Towers,12 months\nRENEWAL,65000,Apartment,2,869,4


In [12]:
df_transactions_buy_merged.head(1)

Unnamed: 0,DATE,LOCATION,PRICE (AED),Type,Beds,BUILT-UP AREA,FLOOR,BUILT-UP,PLOT
0,18 Apr\n2025,The Torch\nDubai Marina,"1,680,000\nVacant at time of sale",Apartment,2,1258,39,,


In [13]:
def process(df):
    df[['building_name', 'street_name', 'area_name']] = df['location'].str.split(',', n=2, expand=True)

    df['building_name'] = df['building_name'].str.strip()
    df['street_name'] = df['street_name'].str.strip()
    df['area_name'] = df['area_name'].str.strip()

    df['verified_date'] = df['verified_plan'].str.replace('on ', '', regex=False)
    df['verified_date'] = pd.to_datetime(df['verified_date'], format='%dst of %B %Y', errors='coerce')
    df['area'] = df['area'].str.replace('sqft', '', regex=False)
    df.drop(columns=['verified_plan'],inplace=True)
    df.drop(columns=['Unnamed: 0'],inplace=True)
    df[['location', 'building_name', 'street_name', 'area_name']].head()

In [14]:
def process_locations(df):
    location_split = df['LOCATION'].str.split('\n', expand=True)
    # If only one column is returned (no split occurred)
    if location_split.shape[1] == 1:
        df['building_name'] = None
        df['area_name'] = location_split[0]
    else:
        # If we have more than 2 columns, prioritize building_name and area_name
        if location_split.shape[1] > 2:
            df['building_name'] = location_split[0]
            df['area_name'] = location_split[1]
        else:
            df['building_name'] = location_split[0]
            df['area_name'] = location_split[1]

In [15]:
process_locations(df_transactions_buy_merged)

In [16]:
price_split = df_transactions_buy_merged['PRICE (AED)'].str.split('\n', expand=True)
if price_split.shape[1] == 1:
    df_transactions_buy_merged['PRICE'] = price_split[0]
else:
    df_transactions_buy_merged['PRICE'] = price_split[0]
    df_transactions_buy_merged['Info'] = price_split[1]

In [17]:
df_transactions_buy_merged.drop(columns=["PRICE (AED)"],inplace=True)

In [18]:
df_transactions_buy_merged['DATE'] = df_transactions_buy_merged['DATE'].str.replace('\n', ' ', regex=False)
df_transactions_buy_merged['DATE'] = pd.to_datetime(df_transactions_buy_merged['DATE'], format='%d %b %Y', errors='coerce')

In [19]:
process(df_listing_rent_merged)
process(df_listing_buy_merged)

In [20]:
df_transactions_rent_merged.head(1)

Unnamed: 0,START DATE,LOCATION,DURATION,Price,Type,BEDS,AREA (SQFT),FLOOR
0,1 Jun\n2025,Dream Tower 2\nDubai Marina\nDream Towers,12 months\nRENEWAL,65000,Apartment,2,869,4


In [21]:
df_transactions_rent_merged["DURATION"].unique()

array(['12 months\nRENEWAL', '12 months\nNEW', '36 months\nRENEWAL',
       '6 months\nRENEWAL', '5 months\nRENEWAL', '13 months\nRENEWAL',
       '2 months\nRENEWAL', '24 months\nNEW', '14 months\nNEW',
       '4 months\nRENEWAL', '3 months\nNEW', '3 months\nRENEWAL',
       '6 months\nNEW', '38 months\nNEW', '13 months\nNEW',
       '36 months\nNEW', '10 months\nRENEWAL', '1 months\nRENEWAL'],
      dtype=object)

In [22]:
df_transactions_rent_merged['START DATE'] = df_transactions_rent_merged['START DATE'].str.replace('\n', ' ', regex=False)
df_transactions_rent_merged['START DATE'] = pd.to_datetime(df_transactions_rent_merged['START DATE'], format='%d %b %Y', errors='coerce')

In [23]:
process_locations(df_transactions_rent_merged)

In [24]:
duration_split = df_transactions_rent_merged['DURATION'].str.split('\n', expand=True)
df_transactions_rent_merged['DURATION(Months)'] = duration_split[0].str.replace('months','',regex=False)
df_transactions_rent_merged['Info'] = duration_split[1]

In [25]:
df_transactions_rent_merged.drop(columns=["DURATION"],inplace=True)

In [26]:
h_split = df_listing_buy_merged['handover'].str.split(' ', expand=True)
df_listing_buy_merged['handover quater'] = h_split[0]
df_listing_buy_merged['handover year'] = h_split[1]

In [27]:
df_listing_buy_merged["offplan"].unique()

array(['Off-Plan\n|\nInitial Sale', 'Off-Plan', nan,
       'Off-Plan\n|\nResale'], dtype=object)

In [28]:
o_split = df_listing_buy_merged['offplan'].str.split('|', expand=True)
if o_split.shape[1] == 1:
    df_listing_buy_merged['offplan type'] = o_split[0].str.replace('\n','',regex=False)
else:
    df_listing_buy_merged['offplan type'] = o_split[0].str.replace('\n','',regex=False)
    df_listing_buy_merged['type of sale'] = o_split[1].str.replace('\n','',regex=False)

In [29]:
df_listing_buy_merged.drop(columns=["handover"],inplace=True)

In [30]:
df_listing_buy_merged.drop(columns=["offplan"],inplace=True)

In [31]:
df_listing_rent_merged.drop(columns=["handover","payment","offplan"],inplace=True)

In [32]:
df_listing_rent_merged["area_name"]=df_listing_rent_merged["area_name"].str.replace("Dubai","")


In [33]:
df_listing_buy_merged["area_name"]=df_listing_buy_merged["area_name"].str.replace("Dubai","")
df_listing_buy_merged["area_name"] = df_listing_buy_merged["area_name"].str.replace(r',\s*$', '', regex=True)

In [34]:
df_listing_rent_merged["area_name"] = df_listing_rent_merged["area_name"].str.replace(r',\s*$', '', regex=True)

In [None]:
df_listing_rent_merged.head(1)

Unnamed: 0,title,price,type,beds,studio,baths,area,location,agent_plan,building_name,street_name,area_name,verified_date
0,PRIME LOCATION | VACANT | MODERN LAYOUT,68000,Apartment,1.0,,2,731,"AKA Residence, JVC District 11, Jumeirah Villa...",TruBroker™,AKA Residence,JVC District 11,Jumeirah Village Circle (JVC),2025-04-21


In [None]:
df_listing_buy_merged.head(1)

Unnamed: 0,title,price,type,beds,studio,baths,area,location,payment,agent_plan,building_name,street_name,area_name,verified_date,handover quater,handover year,offplan type,type of sale
0,Sheikh Zayed Road View | 🏗️ Limited Availabili...,565000,Apartment,,Studio,1,358,"Azizi Arian, Downtown Jebel Ali, Jebel Ali, Dubai",50/50,,Azizi Arian,Downtown Jebel Ali,Jebel Ali,NaT,Q1,2028,Off-Plan,Initial Sale


In [37]:
df_transactions_rent_merged.head(1)

Unnamed: 0,START DATE,LOCATION,Price,Type,BEDS,AREA (SQFT),FLOOR,building_name,area_name,DURATION(Months),Info
0,2025-06-01,Dream Tower 2\nDubai Marina\nDream Towers,65000,Apartment,2,869,4,Dream Tower 2,Dubai Marina,12,RENEWAL


In [38]:
df_transactions_buy_merged.head(1)

Unnamed: 0,DATE,LOCATION,Type,Beds,BUILT-UP AREA,FLOOR,BUILT-UP,PLOT,building_name,area_name,PRICE,Info
0,2025-04-18,The Torch\nDubai Marina,Apartment,2,1258,39,,,The Torch,Dubai Marina,1680000,Vacant at time of sale


In [39]:
df_listing_buy_merged = df_listing_buy_merged.copy()
df_listing_rent_merged = df_listing_rent_merged.copy()
df_transactions_buy_merged = df_transactions_buy_merged.copy()
df_transactions_rent_merged = df_transactions_rent_merged.copy()
# Connect to a database (using SQLite for simplicity)
import sqlite3
conn = sqlite3.connect('real_estate.db')

# Write the DataFrame to a SQL table
df_listing_buy_merged.to_sql('listings_buy', conn, if_exists='replace', index=False)
df_listing_rent_merged.to_sql('listings_rent', conn, if_exists='replace', index=False)
df_transactions_buy_merged.to_sql('transactions_buy', conn, if_exists='replace', index=False)
df_transactions_rent_merged.to_sql('transactions_rent', conn, if_exists='replace', index=False)

# Verify the table was created
#query = "SELECT name FROM sqlite_master WHERE type='table';"
#pd.read_sql(query, conn)

# Preview the data in the SQL table
#pd.read_sql("SELECT * FROM listings_buy LIMIT 5", conn)

# Close the connection
conn.close()

In [8]:
import sqlite3

conn = sqlite3.connect('real_estate.db')

pd.read_sql("SELECT title, price, building_name, REPLACE(area_name, 'Dubai', '') AS area_name FROM listings_rent;", conn)
#
conn.close()

In [46]:
import sqlite3

conn = sqlite3.connect('real_estate.db')

pd.read_sql("SELECT title, price, building_name, area_name FROM listings_buy;", conn)
#
#conn.close()

Unnamed: 0,title,price,building_name,area_name
0,Sheikh Zayed Road View | 🏗️ Limited Availabili...,565000,Azizi Arian,Jebel Ali
1,💫30/70 Plan | Furnished🛋️ Smart Home 🚧2026 Han...,935000,Forest City Tower,
2,🚇 Right next to Metro💰Lowest Price Ever 💳 Best...,599000,Azizi Arian,Jebel Ali
3,Payment Plan | Large Layout | Golf Course,5200000,399 Hills Park B,Hills Estate
4,Vacant on Transfer | Fully Upgraded | Palm Views,2875000,Princess Tower,
...,...,...,...,...
1463,Fully Upgraded | Motivated Seller | VOT,15999000,Mirador,
1464,EXTERNAL MAIDS ROOM | ONE OF A KIND | 4E,4375000,The Springs 14,
1465,Exclusive | Fully Upgraded and Extended,9700000,The Meadows 9,
1466,Upgraded | VOT | Genuine Listing,3050000,Joy,


In [None]:
# First, remove 'sqft' from the area column if present
df_merged['area'] = df_merged['area'].str.replace('sqft', '', regex=False)

# Remove any commas that might be in the numbers (e.g., "1,500")
df_merged['area'] = df_merged['area'].str.replace(',', '', regex=False)

# Strip any whitespace
df_merged['area'] = df_merged['area'].str.strip()

# Convert to float, coercing any errors to NaN
df_merged['area'] = pd.to_numeric(df_merged['area'], errors='coerce')

# Check the result
print(df_merged['area'].dtype)
df_merged['area'].head()

In [None]:
df_merged.info()

In [None]:
# Convert price to decimal (float)
# First, remove any non-numeric characters (like commas and currency symbols)
df_merged['price'] = df_merged['price'].str.replace(',', '', regex=False)
df_merged['price'] = df_merged['price'].str.replace('AED', '', regex=False).str.strip()
# Convert to float
df_merged['price'] = pd.to_numeric(df_merged['price'], errors='coerce')

# Convert beds to nullable integer
# pandas uses Int64 type for nullable integers
df_merged['beds'] = pd.to_numeric(df_merged['beds'], errors='coerce')
df_merged['beds'] = df_merged['beds'].astype('Int64')  # Capital 'I' for nullable integer

# Convert studio to nullable integer
# First, map text values to numbers if needed (e.g., 'Y' to 1, 'N' to 0)
# Assuming 'Y' means it's a studio and 'N' means it's not
studio_mapping = {'Y': 1, 'N': 0}
df_merged['studio'] = df_merged['studio'].map(studio_mapping)
df_merged['studio'] = pd.to_numeric(df_merged['studio'], errors='coerce')
df_merged['studio'] = df_merged['studio'].astype('Int64')

# Convert baths to nullable integer
df_merged['baths'] = pd.to_numeric(df_merged['baths'], errors='coerce')
df_merged['baths'] = df_merged['baths'].astype('Int64')

# Check the updated data types
df_merged[['price', 'beds', 'studio', 'baths']].dtypes

In [None]:
df_merged.drop(columns=['verified_plan'],inplace=True)