In [1]:
# Step 0: Import libraries and load dataset
import pandas as pd

file_path = "F:\InternShip\\\Booking.com-Data-Scraping\\property_booking.csv"  
df = pd.read_csv(file_path)

print("Initial Dataset Shape:", df.shape)
print(df.head())


  file_path = "F:\InternShip\\\Booking.com-Data-Scraping\\property_booking.csv"


Initial Dataset Shape: (1286, 8)
      City Property Type                                               Name  \
0  Karachi     apartment                                      Room for rent   
1  Karachi     apartment  Prestige comfort apartments separate lounge wi...   
2  Karachi     apartment                                           YS Rooms   
3  Karachi     apartment                                   Prestige Comfort   
4  Karachi     apartment                                  Prestige comforts   

   Rating  Reviews Count Location  \
0     NaN            NaN  Karachi   
1     NaN            NaN  Karachi   
2     NaN            NaN  Karachi   
3     NaN            NaN  Karachi   
4     NaN            NaN  Karachi   

                                           Image URL  \
0  https://cf.bstatic.com/xdata/images/hotel/squa...   
1  https://cf.bstatic.com/xdata/images/hotel/squa...   
2  https://cf.bstatic.com/xdata/images/hotel/squa...   
3  https://cf.bstatic.com/xdata/images/hotel/

In [2]:
# inspect Dataset
print('\dataset info')
print(df.info())

\dataset info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1286 entries, 0 to 1285
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   City           1286 non-null   object 
 1   Property Type  1286 non-null   object 
 2   Name           1286 non-null   object 
 3   Rating         816 non-null    float64
 4   Reviews Count  816 non-null    float64
 5   Location       1286 non-null   object 
 6   Image URL      1286 non-null   object 
 7   Image Path     1286 non-null   object 
dtypes: float64(2), object(6)
memory usage: 80.5+ KB
None


  print('\dataset info')


In [4]:

print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
City               0
Property Type      0
Name               0
Rating           470
Reviews Count    470
Location           0
Image URL          0
Image Path         0
dtype: int64


In [5]:
# Remove Duplicaate
df = df.drop_duplicates()
print('\nAfter removing duplicates:', df.shape)


After removing duplicates: (1286, 8)


In [None]:
# Handle missing values
# Fill numeric columns with median
df['Rating'] = df['Rating'].fillna(df['Rating'].median())
df['Reviews Count'] = df['Reviews Count'].fillna(df['Reviews Count'].median())

# Fill categorical columns with mode 
for col in ['City', 'Property Type', 'Name', 'Location']:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])


In [7]:
# Standardize text formatting
text_columns = ['City', 'Property Type', 'Name', 'Location']
for col in text_columns:
    df[col] = df[col].str.strip().str.title()


In [8]:
#  Ensure valid numeric ranges
# Keep Rating between 0 and 10
df = df[(df['Rating'] >= 0) & (df['Rating'] <= 10)]

# Keep Reviews Count non-negative
df = df[df['Reviews Count'] >= 0]


In [9]:
#  Validate image fields (keep both URL & Path for training)
# Replace invalid URLs with NaN (instead of dropping rows)
df['Image URL'] = df['Image URL'].apply(lambda x: x if str(x).startswith("http") else None)

# Replace empty paths with NaN
df['Image Path'] = df['Image Path'].apply(lambda x: x if isinstance(x, str) and len(x.strip()) > 0 else None)



In [10]:
# Reset index
df = df.reset_index(drop=True)
print("\nFinal Cleaned Dataset Shape:", df.shape)
print(df.head())



Final Cleaned Dataset Shape: (1286, 8)
      City Property Type                                               Name  \
0  Karachi     Apartment                                      Room For Rent   
1  Karachi     Apartment  Prestige Comfort Apartments Separate Lounge Wi...   
2  Karachi     Apartment                                           Ys Rooms   
3  Karachi     Apartment                                   Prestige Comfort   
4  Karachi     Apartment                                  Prestige Comforts   

   Rating  Reviews Count Location  \
0     8.1            8.0  Karachi   
1     8.1            8.0  Karachi   
2     8.1            8.0  Karachi   
3     8.1            8.0  Karachi   
4     8.1            8.0  Karachi   

                                           Image URL  \
0  https://cf.bstatic.com/xdata/images/hotel/squa...   
1  https://cf.bstatic.com/xdata/images/hotel/squa...   
2  https://cf.bstatic.com/xdata/images/hotel/squa...   
3  https://cf.bstatic.com/xdata/images

In [11]:
# Step 8: Save cleaned dataset (overwrite original file)
output_path = "property_booking.csv"
df.to_csv(output_path, index=False, encoding="utf-8")
print("\n✅ Cleaned dataset saved back to:", output_path)



✅ Cleaned dataset saved back to: property_booking.csv
