# Airbnb Data Preprocessing

This notebook combines listings, neighbourhoods, reviews, and calendar data from multiple cities (Los Angeles, Portland, Salem, San Diego) into separate preprocessed dataset files.

In [17]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import glob

In [18]:
# Define base directory for data
BASE_DIR = Path('../Airbnb Data')

# List all city directories
cities = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d))]
print(f"Cities found: {cities}")

Cities found: ['Los Angeles', 'Portland', 'Salem', 'San Diego']


In [19]:
# Function to load and combine data from all cities
def load_data_from_all_cities(file_type):
    """Load specific data file from all cities and combine into one dataframe"""
    all_data = []
    
    for city in cities:
        file_path = BASE_DIR / city / f"{file_type}.csv"
        if file_path.exists():
            df = pd.read_csv(file_path)
            # Add city column to identify source
            df['city'] = city
            all_data.append(df)
        else:
            print(f"Warning: {file_path} not found")
    
    if all_data:
        return pd.concat(all_data, ignore_index=True)
    else:
        return pd.DataFrame()

# Load listings from all cities
all_listings = load_data_from_all_cities('listings')
print(f"Total listings loaded: {len(all_listings)}")
print(f"Columns in listings: {len(all_listings.columns)}")

Total listings loaded: 62771
Columns in listings: 80


In [20]:
# Load neighbourhoods data
all_neighbourhoods = load_data_from_all_cities('neighbourhoods')
print(f"Neighbourhoods data loaded: {len(all_neighbourhoods)} records")

# Load reviews data
all_reviews = load_data_from_all_cities('reviews')
print(f"Reviews data loaded: {len(all_reviews)} records")

# Load calendar data
all_calendar = load_data_from_all_cities('calendar')
print(f"Calendar data loaded: {len(all_calendar)} records")

Neighbourhoods data loaded: 481 records
Reviews data loaded: 3003791 records
Reviews data loaded: 3003791 records


  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


Calendar data loaded: 22902530 records


## Data Preprocessing

Now let's process and clean the data from all sources before saving to separate files.

In [21]:
# Process all calendar data instead of filtering for 2024 only
if not all_calendar.empty:
    # Convert date to datetime
    all_calendar['date'] = pd.to_datetime(all_calendar['date'])
    
    # Use all calendar data without filtering
    processed_calendar_data = all_calendar
    print(f"Total calendar entries: {len(processed_calendar_data)}")
else:
    processed_calendar_data = pd.DataFrame()
    print("No calendar data available")

Total calendar entries: 22902530


In [22]:
# Process the listings data
def process_listings(df):
    if df.empty:
        print("No listings data available.")
        return pd.DataFrame()
    
    # Select important columns from listings
    essential_columns = [
        'id', 'name', 'description', 'neighbourhood_cleansed', 
        'latitude', 'longitude', 'property_type', 'room_type', 
        'accommodates', 'bathrooms_text', 'bedrooms', 'beds', 
        'price', 'minimum_nights', 'maximum_nights', 'city',
        'number_of_reviews', 'review_scores_rating', 'host_id', 'host_name',
        'host_since', 'host_response_rate', 'host_is_superhost',
        'calculated_host_listings_count'
    ]
    
    # Select columns that exist in the dataframe
    existing_columns = [col for col in essential_columns if col in df.columns]
    processed_df = df[existing_columns].copy()
    
    # Process price - remove $ and convert to numeric
    if 'price' in processed_df.columns:
        processed_df['price'] = processed_df['price'].replace('[\$,]', '', regex=True)
        # Convert to numeric, errors='coerce' will set invalid parsing values to NaN
        processed_df['price'] = pd.to_numeric(processed_df['price'], errors='coerce')
    
    # Fill missing numeric values appropriately
    numeric_cols = processed_df.select_dtypes(include=[np.number]).columns
    processed_df[numeric_cols] = processed_df[numeric_cols].fillna(0)
    
    return processed_df

# Process neighborhoods data
def process_neighbourhoods(df):
    if df.empty:
        print("No neighbourhoods data available.")
        return pd.DataFrame()
    
    # Ensure we have consistent columns
    required_cols = ['neighbourhood', 'city']
    optional_cols = ['neighbourhood_group']
    
    # Create a new dataframe with required columns
    processed_df = df[['neighbourhood', 'city']].copy()
    
    # Add optional columns if they exist
    for col in optional_cols:
        if col in df.columns:
            processed_df[col] = df[col]
    
    # Remove duplicates
    processed_df = processed_df.drop_duplicates()
    
    return processed_df

# Process reviews data
def process_reviews(df):
    if df.empty:
        print("No reviews data available.")
        return pd.DataFrame()
    
    # Select important columns
    essential_columns = [
        'listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 
        'comments', 'city'
    ]
    
    # Select columns that exist in the dataframe
    existing_columns = [col for col in essential_columns if col in df.columns]
    processed_df = df[existing_columns].copy()
    
    # Add review length as a feature
    if 'comments' in processed_df.columns:
        processed_df['review_length'] = processed_df['comments'].str.split().str.len()
    
    return processed_df

# Process calendar data
def process_calendar(df):
    if df.empty:
        print("No calendar data available.")
        return pd.DataFrame()
    
    # Make a copy to avoid modifying the original
    processed_df = df.copy()
    
    # Process price column if it exists
    if 'price' in processed_df.columns:
        # Check if price is a string (it might have $ signs)
        if processed_df['price'].dtype == 'object':
            processed_df['price'] = processed_df['price'].replace('[\$,]', '', regex=True)
            # Convert to numeric
            processed_df['price'] = pd.to_numeric(processed_df['price'], errors='coerce')
    
    # Convert 't'/'f' to True/False in 'available' column
    if 'available' in processed_df.columns:
        processed_df['available'] = processed_df['available'].map({'t': True, 'f': False})
    
    return processed_df

# Process all datasets
processed_listings = process_listings(all_listings)
processed_neighbourhoods = process_neighbourhoods(all_neighbourhoods)
processed_reviews = process_reviews(all_reviews)
processed_calendar = process_calendar(processed_calendar_data)

print(f"Processed listings shape: {processed_listings.shape}")
print(f"Processed neighbourhoods shape: {processed_neighbourhoods.shape}")
print(f"Processed reviews shape: {processed_reviews.shape}")
print(f"Processed calendar shape: {processed_calendar.shape}")

Processed listings shape: (62771, 24)
Processed neighbourhoods shape: (481, 3)
Processed reviews shape: (3003791, 8)
Processed calendar shape: (22902530, 8)


In [23]:
# Save the preprocessed data to separate files
def save_datasets():
    # Create output directory if it doesn't exist
    output_dir = BASE_DIR
    
    # Save listings
    listings_path = output_dir / 'preprocessed_listings.csv'
    processed_listings.to_csv(listings_path, index=False)
    print(f"Preprocessed listings saved to {listings_path}")
    
    # Save neighbourhoods
    neighbourhoods_path = output_dir / 'preprocessed_neighbourhoods.csv'
    processed_neighbourhoods.to_csv(neighbourhoods_path, index=False)
    print(f"Preprocessed neighbourhoods saved to {neighbourhoods_path}")
    
    # Save reviews
    reviews_path = output_dir / 'preprocessed_reviews.csv'
    processed_reviews.to_csv(reviews_path, index=False)
    print(f"Preprocessed reviews saved to {reviews_path}")
    
    # Save calendar
    calendar_path = output_dir / 'preprocessed_calendar.csv'
    processed_calendar.to_csv(calendar_path, index=False)
    print(f"Preprocessed calendar saved to {calendar_path}")

# Save all datasets
save_datasets()

Preprocessed listings saved to ../Airbnb Data/preprocessed_listings.csv
Preprocessed neighbourhoods saved to ../Airbnb Data/preprocessed_neighbourhoods.csv
Preprocessed reviews saved to ../Airbnb Data/preprocessed_reviews.csv
Preprocessed reviews saved to ../Airbnb Data/preprocessed_reviews.csv
Preprocessed calendar saved to ../Airbnb Data/preprocessed_calendar.csv
Preprocessed calendar saved to ../Airbnb Data/preprocessed_calendar.csv


In [24]:
# Data summary statistics
print("Summary of preprocessed data:")
print("\nListings:")
print(f"Total records: {len(processed_listings)}")
print(f"Records by city:\n{processed_listings['city'].value_counts()}")

print("\nNeighbourhoods:")
print(f"Total records: {len(processed_neighbourhoods)}")
print(f"Records by city:\n{processed_neighbourhoods['city'].value_counts()}")

print("\nReviews:")
print(f"Total records: {len(processed_reviews)}")
print(f"Records by city:\n{processed_reviews['city'].value_counts()}")

print("\nCalendar:")
print(f"Total records: {len(processed_calendar)}")
print(f"Records by city:\n{processed_calendar['city'].value_counts()}")
# Display date range in calendar data
if not processed_calendar.empty and 'date' in processed_calendar.columns:
    print(f"\nCalendar date range: {processed_calendar['date'].min()} to {processed_calendar['date'].max()}")

Summary of preprocessed data:

Listings:
Total records: 62771
Records by city:
city
Los Angeles    45031
San Diego      12844
Portland        4542
Salem            354
Name: count, dtype: int64

Neighbourhoods:
Total records: 481
Records by city:
city
Los Angeles    270
San Diego      108
Portland        95
Salem            8
Name: count, dtype: int64

Reviews:
Total records: 3003791
Records by city:
city
Los Angeles    1673605
San Diego       839296
Portland        470428
Salem            20462
Name: count, dtype: int64

Calendar:
Total records: 22902530
Records by city:
city
Los Angeles    16428742
San Diego       4687135
Portland        1657443
Salem            129210
Name: count, dtype: int64
Records by city:
city
Los Angeles    1673605
San Diego       839296
Portland        470428
Salem            20462
Name: count, dtype: int64

Calendar:
Total records: 22902530
Records by city:
city
Los Angeles    16428742
San Diego       4687135
Portland        1657443
Salem            129210
N