In [13]:
# Pull in dependencies
import pandas as pd
import numpy as np

### Load the Data

In [14]:
# Read the raw Airbnb listings CSV, parsing 'neighbourhood_group' as text
raw_listings_path = 'Resources/raw_airbnb_listings_usa.csv'
airbnb_data = pd.read_csv(
    raw_listings_path,
    dtype={'neighbourhood_group': str},
)

# Preview the top of the frame as the cell output
airbnb_data.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Resources/airbnb_listings_usa.csv'

### Inspect the Data

In [None]:
# Print a summary of the frame (info() writes to stdout)
airbnb_data.info()

# Count missing entries per column; shown as the cell output
airbnb_data.isna().sum()


### Handling Missing Values

In [None]:
# Fill missing values in 'reviews_per_month' with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# emits a FutureWarning on pandas 2.x and leaves the DataFrame unchanged
# when Copy-on-Write is enabled.
airbnb_data['reviews_per_month'] = airbnb_data['reviews_per_month'].fillna(0)

# Drop rows with missing values in crucial columns (e.g., 'name', 'host_id').
# Reassigning (rather than inplace=True) keeps the step explicit about which
# name holds the cleaned frame.
airbnb_data = airbnb_data.dropna(subset=['name', 'host_id'])

# Verify missing values handling (per-column null counts as the cell output)
airbnb_data.isnull().sum()


### Format Consistency

In [None]:
# Convert 'last_review' to datetimes; unparseable entries are coerced to NaT
parsed_last_review = pd.to_datetime(airbnb_data['last_review'], errors='coerce')
airbnb_data['last_review'] = parsed_last_review

# Show the dtype of every column as the cell output
airbnb_data.dtypes


### Remove Duplicates

In [None]:
# Check for duplicates. Only the final expression of a cell is displayed, so
# the pre-removal count is printed explicitly instead of being computed and
# silently discarded.
duplicate_count = airbnb_data.duplicated().sum()
print(f"Duplicate rows before removal: {duplicate_count}")

# Remove fully duplicated rows (first occurrence is kept by default)
airbnb_data = airbnb_data.drop_duplicates()

# Verify removal of duplicates; this is the cell output and should be 0
airbnb_data.duplicated().sum()


### Remove Irrelevant Columns

In [None]:
# Columns retained for the analysis, in the order they should appear
relevant_columns = [
    'id',
    'name',
    'host_id',
    'neighbourhood_group',
    'neighbourhood',
    'latitude',
    'longitude',
    'room_type',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'last_review',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Keep only those columns
airbnb_data = airbnb_data[relevant_columns]

# Print the frame summary to confirm the remaining columns
airbnb_data.info()


### Filter Data based on Number of Reviews

In [None]:
# Minimum number of reviews a listing needs to be kept
min_reviews_threshold = 100

# Boolean mask of listings meeting the threshold, then an independent copy
# of the matching rows
has_enough_reviews = airbnb_data['number_of_reviews'] >= min_reviews_threshold
filtered_airbnb_data = airbnb_data[has_enough_reviews].copy()

# Print the summary of the filtered frame
filtered_airbnb_data.info()

# Preview the top of the filtered frame as the cell output
filtered_airbnb_data.head()


### Sort Data

In [None]:
# Sort the data based on 'number_of_reviews' in descending order.
# Reassigning instead of inplace=True keeps the step explicit about which
# name holds the sorted frame.
filtered_airbnb_data = filtered_airbnb_data.sort_values(
    by='number_of_reviews', ascending=False
)

# Display the first few rows of the sorted data. head() is used so the cell
# shows a short preview, as intended, rather than the entire frame.
filtered_airbnb_data.head()

In [None]:
# Write the cleaned, filtered, and sorted listings to CSV, omitting the index
cleaned_listings_path = 'Resources/cleaned_airbnb_listings_usa.csv'
filtered_airbnb_data.to_csv(cleaned_listings_path, index=False)
