# AIRBNB DATA ANALYSIS FOR TORONTO/CANADA

In [7]:
# IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### BUSINESS UNDERSTANDING



Our exploration will focus on the 2019 Airbnb Toronto dataset obtained from Inside Airbnb (URL: http://insideairbnb.com/get-the-data.html). 
This dataset provides valuable insights related to prices, neighbourhoods, availability, and room types in the city.

People come to Toronto for various reasons - study, work, or simply travel. Each of these groups is searching for a suitable place to stay, 
be it for the short or long term. In this analysis, we aim to provide an extensive understanding of Airbnb rental options in Toronto
to aid people in making an informed decision.




In [8]:
# LOADING DATASETS

In [9]:
# Raw Airbnb listings for Toronto (one row per listing)
listing = pd.read_csv('data/listings.csv')

# Neighbourhood lookup table that ships alongside the listings
neighbourhoods = pd.read_csv('data/neighbourhoods.csv')

### KEY QUESTIONS TO ANALYZE:


1) What districts offer the most diverse styles of houses, indicating the clusters of Airbnb guests? 

2) Which districts are the most popular among users? 

3) How does the city map look when divided by districts?
Can we determine locations in the two most popular districts with the most reviews and those that are cheaper than the rest? 

4) How are room types distributed across the different districts? 

5) Can we analyze the mean prices according to room types district by district? 

6) What are the most expensive and popular months for Airbnb rentals?

With these questions in mind, let's proceed with preparing the datasets loaded above.



### DEFINE THE NEIGHBOURHOOD GROUPS AS A DICTIONARY

#### The neighbourhood group (district) information is missing from the dataset, so I collected it from Wikipedia.
#### Because the groups have to be defined manually, a dictionary mapping each group to its neighbourhoods is the most convenient representation.

In [10]:
# Preview the first rows of the neighbourhood table to inspect the 'neighbourhood_group' column
neighbourhoods.head()

Unnamed: 0,neighbourhood_group,neighbourhood
0,,Agincourt North
1,,Agincourt South-Malvern West
2,,Alderwood
3,,Annex
4,,Banbury-Don Mills


In [11]:

# Manually curated mapping: district (neighbourhood group) -> list of neighbourhood names.
#
# Each list mixes colloquial names with hyphenated official-style names; only exact string
# matches are assigned. Names that appear under more than one district (e.g. 'Queen Street West',
# 'East Danforth', 'Rustic') resolve to the district listed first, because the lookup in
# assign_neighbourhood_group walks this dict in insertion order and returns on the first hit.
#
# Corrections relative to the earlier version of this mapping:
#   * 'Yonge-Eglinton' and 'Yonge-St.Clair' moved from 'Etobicoke' to 'North End'
#     (their en-dash / spaced variants were already listed under 'North End').
#   * 'Bay Street Corridor' moved from 'East End' to 'Downtown Core'.
#   * Repeated entries inside a single list were removed (no effect on lookups).
neighbourhood_groups = {
    'Downtown Core': [
        'Alexandra Park', 'Annex', 'Baldwin Village', 'Cabbagetown', 'CityPlace', 'Chinatown',
        'Church and Wellesley', 'Corktown', 'Discovery District', 'Distillery District',
        'The Entertainment District', 'East Bayfront', 'Fashion District', 'Financial District',
        'Garden District', 'Grange Park', 'Harbord Village', 'Harbourfront', 'Kensington Market',
        'Little Japan', 'Moss Park', 'Old Town', 'Quayside', 'Queen Street West', 'Regent Park',
        'South Core', 'St. James Town', 'St. Lawrence', 'Toronto Islands', 'Trefann Court',
        'University (includes Huron–Sussex)', 'Yorkville',
        'Waterfront Communities-The Island',
        'Church-Yonge Corridor', 'Kensington-Chinatown', 'Cabbagetown-South St.James Town',
        'University', 'North St.James Town', 'Bay Street Corridor'
    ],
    'East End': [
        'The Beaches', 'The Beach', 'East Chinatown', 'East Danforth', 'Gerrard Street East',
        'Gerrard India Bazaar', 'Little India', 'Greektown', 'Danforth',
        'Leslieville', 'Main Square', 'Playter Estates', 'Port Lands', 'Villiers Island',
        'Riverdale', 'Upper Beaches',
        'Woodbine Corridor', 'South Riverdale', 'East Riverdale', 'Danforth East York',
        'Playter Estates-Danforth', 'North Riverdale', 'Blake-Jones', 'East End-Danforth'
    ],
    'North End': [
        'Bedford Park', 'Casa Loma', 'Chaplin Estates', 'Davisville Village', 'Deer Park',
        'Yonge', 'St. Clair',
        'Forest Hill', 'Forest Hill Village', 'Upper Village', 'Lawrence Park', 'Lytton Park',
        'Midtown', 'Moore Park', 'North Toronto', 'Rosedale', 'South Hill', 'Rathnelly',
        'Summerhill', 'Uptown', 'Wanless Park', 'Wychwood Park', 'Yonge–Eglinton',
        'Midtown Toronto',
        'Rosedale-Moore Park', 'Mount Pleasant West',
        'Mount Pleasant East', 'Lawrence Park South', 'Lawrence Park North',
        'Forest Hill South', 'Forest Hill North',
        'Wychwood', 'Bedford Park-Nortown',
        'Yonge-Eglinton', 'Yonge-St.Clair'
    ],
    'West End': [
        'Beaconsfield Village', 'Bloor West Village', 'Bracondale Hill', 'Brockton Village',
        'Carleton Village', 'Corso Italia', 'Davenport', 'Dovercourt Park', 'Dufferin Grove',
        'Earlscourt', 'Fort York', 'High Park',
        'The Junction', 'West Toronto', 'Dundas Street', 'Little Malta', 'Junction Triangle',
        'Koreatown', 'Liberty Village', 'Little Italy', 'Little Portugal', 'Little Tibet',
        'Mirvish Village',
        'Niagara', 'Palmerston', 'Parkdale', 'Queen Street West', 'Regal Heights',
        'Roncesvalles', 'Runnymede', 'Seaton Village', 'Swansea', 'Trinity–Bellwoods',
        'Wallace Emerson',
        'South Parkdale', 'Palmerston-Little Italy',
        'Dovercourt-Wallace Emerson-Junction', 'Corso Italia-Davenport', 'High Park-Swansea',
        'Trinity-Bellwoods',
        'Weston-Pellam Park', 'Runnymede-Bloor West Village', 'Junction Area'
    ],
    'East York': [
        'Broadview North', 'Crescent Town', 'East Danforth', 'Pape Village', 'Woodbine Heights',
        'Bermondsey', 'Governor s Bridge', 'Leaside', 'O Connor–Parkview', 'Thorncliffe Park',
        'Greenwood-Coxwell', 'Woodbine-Lumsden', 'Taylor-Massey', 'Old East York',
        "O'Connor-Parkview", 'Leaside-Bennington'
    ],
    'Etobicoke': [
        'Alderwood', 'Centennial Park', 'Clairville', 'Eatonville', 'Etobicoke West Mall',
        'The Elms',
        'Eringate', 'Humber Bay', 'Humber Heights – Westmount', 'Humber Valley Village',
        'Humberwood',
        'Islington–City Centre West', 'Kingsview Village', 'The Westway', 'The Kingsway',
        'Long Branch',
        'Markland Wood', 'Mimico', 'New Toronto', 'Princess Gardens', 'Rexdale', 'Richview',
        'Smithfield',
        'Stonegate-Queensway', 'Sunnylea', 'Thistletown', 'Thorncrest Village',
        'West Humber-Clairville',
        'West Deane Park', 'Willowridge',
        'Islington-City Centre West', 'Elms-Old Rexdale', 'Rexdale-Kipling',
        'Willowridge-Martingrove-Richview', 'Thistletown-Beaumond Heights', 'Princess-Rosethorn',
        'Mount Olive-Silverstone-Jamestown', 'Mimico (includes Humber Bay Shores)',
        'Kingsway South',
        'Kingsview Village-The Westway', 'Humber Heights-Westmount',
        'Eringate-Centennial-West Deane', 'Edenbridge-Humber Valley'
    ],
    'North York': [
        'Amesbury', 'Armour Heights', 'Bathurst Manor', 'Bayview Village',
        'Bayview Woods-Steeles', 'Bermondsey',
        'Black Creek', 'The Bridle Path', 'Clanton Park', 'Wilson Heights', 'Don Mills',
        'Don Valley Village',
        'Downsview', 'Flemingdon Park', 'Glen Park', 'Englemount', 'Marlee Village',
        'Henry Farm', 'Hillcrest Village', 'Hoggs Hollow', 'Humber Summit', 'Humbermede',
        'Emery',
        'Jane and Finch', 'University Heights', 'Elia', 'Lansing', 'Lawrence Heights',
        'Lawrence Manor',
        'Ledbury Park', 'Maple Leaf', 'Newtonbrook', 'North York City Centre', 'Parkway Forest',
        'Parkwoods',
        'The Peanut', 'Pelmo Park', 'Humberlea', 'Pleasant View', 'Uptown Toronto',
        'Victoria Village', 'Westminster–Branson',
        'Willowdale', 'York Mills', 'York University Heights', 'Village at York',
        'Banbury-Don Mills', 'Yorkdale-Glen Park',
        'Willowdale East', 'Willowdale West', 'Westminster-Branson', 'St.Andrew-Windfields',
        'Pelmo Park-Humberlea',
        'Parkwoods-Donalda', 'Newtonbrook West', 'Newtonbrook East', 'Lansing-Westgate',
        'Glenfield-Jane Heights',
        'Englemount-Lawrence', 'Rustic',
        'Bridle Path-Sunnybrook-York Mills', 'Brookhaven-Amesbury', 'Downsview-Roding-CFB'
    ],
    'Scarborough': [
        'Agincourt', 'Armadale', 'Bendale', 'Cedarbrae', 'Birch Cliff', 'Birch Cliff Heights',
        'Brown s Corners',
        'Clairlea', 'Cliffside', 'Cliffcrest', 'Dorset Park', 'Eglinton East', 'Golden Mile',
        'Guildwood',
        'Highland Creek', 'Ionview', "L'Amoreaux", 'Malvern', 'Maryvale', 'Milliken',
        'Morningside', 'Morningside Heights', 'Oakridge',
        'Port Union', 'Centennial Scarborough', 'Rouge', 'Scarborough City Centre',
        'Scarborough Junction',
        'Scarborough Village', 'Steeles', 'Tam O Shanter', 'Sullivan', 'West Hill', 'West Rouge',
        'Wexford', 'Woburn',
        'Wexford/Maryvale', 'Agincourt North', 'Agincourt South-Malvern West',
        # NOTE(review): 'Tam OShanter-Sullivan' has no apostrophe. If the dataset spells the
        # neighbourhood "Tam O'Shanter-Sullivan", this entry never matches and those rows fall
        # through to the 'Scarborough' fallback applied later -- confirm against the CSV.
        'Tam OShanter-Sullivan', 'Kennedy Park', 'Birchcliffe-Cliffside', 'Clairlea-Birchmount'
    ],
    'York': [
        'Baby Point',
        'Briar Hill', 'Belgravia',
        'Eglinton West', 'Little Jamaica',
        'Fairbank', 'Caledonia',
        'Humewood–Cedarvale', 'Upper Village', 'Forest Hill',
        'Lambton',
        'Mount Dennis',
        'Oakwood–Vaughan', 'Oakwood Village', 'Five Points', 'Northcliffe',
        'Old Mill',
        'Rockcliffe–Smythe',
        'Silverthorn', 'Keelesdale',
        'Tichester',
        'Weston',
        'Briar Hill-Belgravia',
        'Rockcliffe-Smythe',
        'High Park North', 'Rustic',
        'Lambton Baby Point',
        'Keelesdale-Eglinton West', 'Humewood-Cedarvale', 'Beechborough-Greenbrook',
        'Caledonia-Fairbank'
    ]
}

In [12]:
# Function to assign the neighborhood group based on the neighborhood name

def assign_neighbourhood_group(neighbourhood, groups=None):
    """Return the district (neighbourhood group) that contains ``neighbourhood``.

    Parameters
    ----------
    neighbourhood : str
        Neighbourhood name to look up; matching is by exact string equality.
    groups : dict, optional
        Mapping of group name -> collection of neighbourhood names. When omitted,
        the notebook-level ``neighbourhood_groups`` dictionary is used.

    Returns
    -------
    str or int
        The name of the first group (in the mapping's iteration order) whose
        collection contains ``neighbourhood``, or the sentinel ``0`` when no
        group matches. Downstream cells test for this ``0`` to find unassigned rows.
    """
    if groups is None:
        groups = neighbourhood_groups

    # 'members' (rather than 'neighbourhoods') avoids shadowing the notebook's
    # neighbourhoods DataFrame name inside the function.
    for group, members in groups.items():
        if neighbourhood in members:
            return group
    return 0


In [13]:
# Look up the group of every listing's neighbourhood (0 marks "no match")
listing['neighbourhood_group'] = listing['neighbourhood'].map(assign_neighbourhood_group)

# Same lookup for the neighbourhood reference table
neighbourhoods['neighbourhood_group'] = neighbourhoods['neighbourhood'].map(assign_neighbourhood_group)

## DATA WRANGLING:

### Neighbourhood Dataset

In [14]:
# The 'neighbourhood_group' column is called 'district' from here on
neighbourhoods = neighbourhoods.rename({'neighbourhood_group': 'district'}, axis='columns')

# How many neighbourhoods are still unassigned (district == 0)?
(neighbourhoods['district'] == 0).sum()

1

In [15]:
# Assign 'Scarborough' to every neighbourhood that is still unassigned (district == 0).
# A boolean mask is used instead of the hard-coded row label 114 so the fix keeps working
# if the row order of neighbourhoods.csv changes, and never creates a new row by accident.
neighbourhoods.loc[neighbourhoods['district'] == 0, 'district'] = 'Scarborough'

### Listing Dataset

In [16]:
# Rename 'neighbourhood_group' to 'district' and discard the free-text columns
# ('name', 'host_name') that the analysis does not use
listing = (
    listing
    .rename({'neighbourhood_group': 'district'}, axis='columns')
    .drop(columns=['name', 'host_name'])
)

# Number of listings whose neighbourhood matched no district (sentinel 0)
(listing['district'] == 0).sum()

102

In [17]:
# Blank out the 0 sentinel (it becomes NaN), then label every missing district 'Scarborough'
listing['district'] = (
    listing['district']
    .mask(listing['district'] == 0)
    .fillna('Scarborough')
)

# Sanity check: no listing should carry the 0 sentinel any more
(listing['district'] == 0).sum()

0

In [18]:
# Share of missing values per column; keep the columns where that share is exactly zero
null_share = listing.isnull().mean()
no_nulls = set(null_share[null_share == 0].index)
print(no_nulls)

{'minimum_nights', 'calculated_host_listings_count', 'district', 'longitude', 'number_of_reviews', 'neighbourhood', 'latitude', 'price', 'id', 'availability_365', 'host_id', 'room_type'}


In [19]:
# Fraction (0-1) of listings that have no 'last_review' value
nan_values = listing['last_review'].isnull().mean()
print(nan_values)

0.18763089284951062


In [20]:
# Keep only listings with at least 2 reviews. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not raise SettingWithCopyWarning.
listing = listing[listing['number_of_reviews'] >= 2].copy()

# Parse 'last_review' once and derive the year, month and day columns from it
last_review_dates = pd.to_datetime(listing['last_review'])
listing['year'] = last_review_dates.dt.year
listing['month'] = last_review_dates.dt.month
listing['day'] = last_review_dates.dt.day

# Keep only listings whose most recent review is from 2016 or later
listing = listing[listing['year'] > 2015]

# Persist the processed tables (the DataFrame index is written as the first CSV column)
listing.to_csv('data/listing_etl_processed.csv')
neighbourhoods.to_csv('data/neighbourhood_etl_processed.csv')

### THE GIVEN CODE BELOW CAN BE USED AS A PIPELINE 

In [21]:
# NOTE: RUN THE CELLS THAT DEFINE THE NEIGHBOURHOOD DICTIONARY AND THE ASSIGN FUNCTION FIRST;
# ONLY THEN CAN THE FUNCTIONS BELOW BE USED.

In [7]:
def fix_neighbourhood_data(neighbourhoods):
    """Rename the group column to 'district' and fill unassigned districts.

    Parameters
    ----------
    neighbourhoods : pandas.DataFrame
        Neighbourhood table with a 'neighbourhood_group' column in which
        unassigned rows carry the integer sentinel 0.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) whose 'district' column holds
        the former 'neighbourhood_group' values, with every 0 replaced by
        'Scarborough'.
    """
    # rename() returns a new frame, so the caller's DataFrame is not modified
    fixed = neighbourhoods.rename(columns={'neighbourhood_group': 'district'})

    # Every unassigned row (district == 0) is labelled 'Scarborough'. The previous extra
    # assignment to the hard-coded row label 114 is dropped: the mask already covers any
    # unassigned row, and .loc on a label that does not exist would append a bogus row.
    fixed.loc[fixed['district'].isin([0]), 'district'] = 'Scarborough'

    return fixed

In [8]:
def fix_listing_data(listing):
    """Clean the listings table for analysis.

    Steps, in order: rename 'neighbourhood_group' to 'district'; replace unassigned
    districts (the 0 sentinel, as well as missing values) with 'Scarborough'; drop the
    'name' and 'host_name' columns; keep listings with at least 2 reviews; add 'year',
    'month' and 'day' columns taken from 'last_review'; keep listings whose last review
    year is later than 2015.

    Parameters
    ----------
    listing : pandas.DataFrame
        Listings with at least the columns 'neighbourhood_group', 'name',
        'host_name', 'number_of_reviews' and 'last_review'.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame; the input is not modified.
    """
    # rename() returns a new frame, so the caller's DataFrame is not modified
    cleaned = listing.rename(columns={'neighbourhood_group': 'district'})

    # Turn the 0 sentinel into NaN, then label every missing district 'Scarborough'
    cleaned['district'] = cleaned['district'].mask(cleaned['district'] == 0).fillna('Scarborough')

    # Columns not needed for the analysis
    cleaned = cleaned.drop(['name', 'host_name'], axis=1)

    # Keep listings with at least 2 reviews; .copy() detaches the result from the
    # filtered slice so the column assignments below do not raise SettingWithCopyWarning
    cleaned = cleaned[cleaned['number_of_reviews'] >= 2].copy()

    # Parse 'last_review' once and derive the date parts from it
    last_review_dates = pd.to_datetime(cleaned['last_review'])
    cleaned['year'] = last_review_dates.dt.year
    cleaned['month'] = last_review_dates.dt.month
    cleaned['day'] = last_review_dates.dt.day

    # Keep only listings whose most recent review is from 2016 or later
    cleaned = cleaned[cleaned['year'] > 2015]

    return cleaned

In [None]:
def main():
    """Run the full ETL pipeline: load, assign districts, clean, and save.

    Reads the raw CSV files from the 'data' folder, assigns a neighbourhood group
    to every row with ``assign_neighbourhood_group``, applies the two fix functions
    and writes the processed tables as CSV files (without the index column) to the
    current working directory.
    """
    # Load data (same input files as the exploratory cells at the top of the notebook)
    neighbourhoods = pd.read_csv('data/neighbourhoods.csv')
    listing = pd.read_csv('data/listings.csv')

    # Assign the neighbourhood groups first: the fix functions below look for the
    # 0 sentinel that assign_neighbourhood_group returns for unmatched names. Without
    # this step fix_listing_data would label every row with a missing group 'Scarborough'.
    neighbourhoods['neighbourhood_group'] = neighbourhoods['neighbourhood'].apply(assign_neighbourhood_group)
    listing['neighbourhood_group'] = listing['neighbourhood'].apply(assign_neighbourhood_group)

    # Extract-Transform-Load
    neighbourhoods = fix_neighbourhood_data(neighbourhoods)
    listing = fix_listing_data(listing)

    # Save updated data
    # NOTE(review): the exploratory cells write to 'data/listing_etl_processed.csv' and
    # 'data/neighbourhood_etl_processed.csv' (with the index column); confirm which
    # location and layout downstream notebooks expect.
    neighbourhoods.to_csv('neighbourhoods_etl_processed.csv', index=False)
    listing.to_csv('listing_etl_processed.csv', index=False)

In [None]:
# Run the ETL pipeline only when this code is executed as the top-level program
if __name__ == '__main__':
    main()