In [1]:
import pandas as pd

# Airbnb Tokyo 2021 Calendar Dataset 

## Import Dataset 
- Tokyo-calendar-2021-cleaned.csv
- Tokyo-calendar_JAN_FEB_MAR_2021.csv

In [None]:
df1 = pd.read_csv(r"..\data\Tokyo-calendar-2021-cleaned.csv") # Wrong path

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887404 entries, 0 to 887403
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   listing_id      887404 non-null  int64         
 1   date            887404 non-null  datetime64[ns]
 2   available       887404 non-null  object        
 3   price           887404 non-null  object        
 4   adjusted_price  887404 non-null  object        
 5   minimum_nights  887308 non-null  float64       
 6   maximum_nights  887308 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 47.4+ MB


In [None]:
df2 = pd.read_csv(r"..\data\Tokyo-calendar_JAN_FEB_MAR_2021.csv") # Wrong path
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901961 entries, 0 to 901960
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   listing_id      901961 non-null  int64 
 1   date            901961 non-null  object
 2   available       901961 non-null  object
 3   price           901766 non-null  object
 4   adjusted_price  901766 non-null  object
 5   minimum_nights  901961 non-null  int64 
 6   maximum_nights  901961 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 48.2+ MB


## Converting to datetime type

In [7]:
df1['date'] = pd.to_datetime(df1['date'])

In [16]:
def dateRange(df):
    startDate = min(df['date'])
    endDate = max(df['date'])
    return f'between {startDate} and {endDate}'

In [17]:
print(f'df1 date range {dateRange(df1)}')

df1 date range between 2021-02-25 00:00:00 and 2021-12-31 00:00:00


In [None]:
df2['date'] = pd.to_datetime(df2['date'])
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901961 entries, 0 to 901960
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   listing_id      901961 non-null  int64         
 1   date            901961 non-null  datetime64[ns]
 2   available       901961 non-null  object        
 3   price           901766 non-null  object        
 4   adjusted_price  901766 non-null  object        
 5   minimum_nights  901961 non-null  int64         
 6   maximum_nights  901961 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 48.2+ MB


In [18]:
print(f'df2 date range {dateRange(df2)}')

df2 date range between 2021-01-01 00:00:00 and 2021-03-02 00:00:00


### Filter the Correct Date for `df2`

In [49]:
# Convert 'date' column to datetime (if not already done)
df2['date'] = pd.to_datetime(df2['date'])

# Filter df2 to keep only rows where date <= 2021-02-25
df2 = df2[df2['date'] <= '2021-02-24']

In [50]:
df2.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,35303,2021-01-01,f,"$4,200.00","$4,200.00",28,1125
1,35303,2021-01-02,f,"$4,200.00","$4,200.00",28,1125
2,35303,2021-01-03,f,"$4,200.00","$4,200.00",28,1125
3,35303,2021-01-04,f,"$4,200.00","$4,200.00",28,1125
4,35303,2021-01-05,f,"$4,200.00","$4,200.00",28,1125


In [51]:
df2.tail()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
901953,41388320,2021-02-20,f,"$1,000,000.00","$1,000,000.00",1,30
901954,41388320,2021-02-21,f,"$1,000,000.00","$1,000,000.00",1,30
901955,41388320,2021-02-22,f,"$1,000,000.00","$1,000,000.00",1,30
901956,41388320,2021-02-23,f,"$1,000,000.00","$1,000,000.00",1,30
901957,41388320,2021-02-24,f,"$1,000,000.00","$1,000,000.00",1,30


## Check for listing_id

In [19]:
df1['listing_id'].nunique()

2874

In [52]:
df2['listing_id'].nunique()

15551

### Step 1: Check if all `listing_id`s from `df1` and `df2`

In [53]:
# Get the unique listing_ids from df1
unique_df1 = set(df1['listing_id'].unique())

# Get the unique listing_ids from df2
unique_df2 = set(df2['listing_id'].unique())

# Check if all listing_ids from df1 exist in df2
all_df1_in_df2 = unique_df1.issubset(unique_df2)

# Count the number of listing_ids from df1 that exist in df2
exist_df1_in_df2 = len(unique_df1.intersection(unique_df2))

# Count the number of listing_ids from df1 that do not exist in df2
non_exist_df1_in_df2 = len(unique_df1 - unique_df2)

print(f"All listing_ids from df1 exist in df2: {all_df1_in_df2}")
print(f"Number of listing_ids from df1 that exist in df2: {exist_df1_in_df2}")
print(f"Number of listing_ids from df1 that do not exist in df2: {non_exist_df1_in_df2}")

All listing_ids from df1 exist in df2: False
Number of listing_ids from df1 that exist in df2: 2685
Number of listing_ids from df1 that do not exist in df2: 189


### Step 2: Check if all `listing_id`s from `df2` exist in `df1`

In [54]:
# Check if all listing_ids from df2 exist in df1
all_df2_in_df1 = unique_df2.issubset(unique_df1)

# Count the number of listing_ids from df2 that exist in df1
exist_df2_in_df1 = len(unique_df2.intersection(unique_df1))

# Count the number of listing_ids from df2 that do not exist in df1
non_exist_df2_in_df1 = len(unique_df2 - unique_df1)

print(f"All listing_ids from df2 exist in df1: {all_df2_in_df1}")
print(f"Number of listing_ids from df2 that exist in df1: {exist_df2_in_df1}")
print(f"Number of listing_ids from df2 that do not exist in df1: {non_exist_df2_in_df1}")

All listing_ids from df2 exist in df1: False
Number of listing_ids from df2 that exist in df1: 2685
Number of listing_ids from df2 that do not exist in df1: 12866


### Step 3: Merge Existing `listing_id`s on both `df1` and `df2`
#### Step 1: Find Shared `listing_id`s

In [55]:
shared_listing_ids = set(df1['listing_id']).intersection(df2['listing_id'])

#### Step 2: Filter Rows from Both DataFrames

In [56]:
df1_shared = df1[df1['listing_id'].isin(shared_listing_ids)]
df2_shared = df2[df2['listing_id'].isin(shared_listing_ids)]

#### Step 3: Combine the Filtered Rows

In [58]:
merged_df = pd.concat([df1_shared, df2_shared], ignore_index=True)

#### Step 4: Verify the Results

In [59]:
print(f"Number of rows in merged_df: {len(merged_df)}")
print(f"Number of unique listing_ids: {merged_df['listing_id'].nunique()}")
print(f"Columns: {merged_df.columns.tolist()}")

Number of rows in merged_df: 976710
Number of unique listing_ids: 2685
Columns: ['listing_id', 'date', 'available', 'price', 'adjusted_price', 'minimum_nights', 'maximum_nights']


In [63]:
merged_df

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,4391546,2021-02-25,f,"$9,500.00","$9,500.00",30.0,120.0
1,4606903,2021-02-25,f,"$20,000.00","$20,000.00",3.0,30.0
2,6308465,2021-02-25,f,"$50,000.00","$50,000.00",1.0,90.0
3,6428968,2021-02-25,f,"$5,000.00","$5,000.00",1.0,60.0
4,1526132,2021-02-25,t,"$6,980.00","$6,980.00",2.0,30.0
...,...,...,...,...,...,...,...
976705,28347068,2021-02-20,f,"$11,250.00","$11,250.00",1.0,30.0
976706,28347068,2021-02-21,f,"$10,500.00","$10,500.00",1.0,30.0
976707,28347068,2021-02-22,f,"$10,500.00","$10,500.00",1.0,30.0
976708,28347068,2021-02-23,f,"$10,500.00","$10,500.00",1.0,30.0


In [62]:
print(f"Final merged_df shape: {merged_df.shape}")  # Should have 7 columns
print(f"Date range in merged_df: {merged_df['date'].min()} to {merged_df['date'].max()}")
print(f"Unique listing_ids: {merged_df['listing_id'].nunique()}")

Final merged_df shape: (976710, 7)
Date range in merged_df: 2021-01-01 00:00:00 to 2021-12-31 00:00:00
Unique listing_ids: 2685


## Export CSV

In [None]:
merged_df.to_csv('..\data\Airbnb_Tokyo_2021\Tokyo_2021_calendar.csv')

# Airbnb Listings Tokyo 2021

## Import Dataset
- Tokyo_202_calendar.csv
- Tokyo_listings.csv

In [None]:
df3 = pd.read_csv(r"..\data\Airbnb_Tokyo_2021\Tokyo_2021_calendar.csv")
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 976710 entries, 0 to 976709
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   listing_id      976710 non-null  int64  
 1   date            976710 non-null  object 
 2   available       976710 non-null  object 
 3   price           976655 non-null  object 
 4   adjusted_price  976655 non-null  float64
 5   minimum_nights  976623 non-null  float64
 6   maximum_nights  976623 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 52.2+ MB


In [80]:
df4 = pd.read_csv(r"C:\Users\thaop\Desktop\OneDrive - University of the Pacific\2024 Fall\MSBA 286\project\Cleaned Dataset - 2_12_2025\Tokyo 2021 cleaned by Thao Pham\Tokyo-listings.csv")
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11308 entries, 0 to 11307
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            11308 non-null  int64  
 1   listing_url                                   11308 non-null  object 
 2   scrape_id                                     11308 non-null  float64
 3   last_scraped                                  11308 non-null  object 
 4   name                                          11308 non-null  object 
 5   description                                   11306 non-null  object 
 6   neighborhood_overview                         8822 non-null   object 
 7   picture_url                                   11308 non-null  object 
 8   host_id                                       11308 non-null  int64  
 9   host_url                                      11308 non-null 

## EDA

In [81]:
df3['listing_id'].nunique()

2685

In [82]:
df4['id'].nunique()

11308

## Checking for Unique Values

In [89]:
# Get the unique ids from df3
unique_df3 = set(df3['listing_id'].unique())

# Get the unique listing_ids from df4
unique_df4 = set(df4['id'].unique())

In [None]:
# Check if all listing_ids from df3 exist in df4
all_df3_in_df4 = unique_df3.issubset(unique_df4)

# Count the number of listing_ids from df3 that exist in df4
exist_df3_in_df4 = len(unique_df3.intersection(unique_df4))

# Count the number of listing_ids from df3 that do not exist in df4
non_exist_df3_in_df4 = len(unique_df3 - unique_df4)

print(f'All listing_ids from df3 exist in df4: {all_df3_in_df4}')
print(f'Number of listing_ids from df3 that exist in df4: {exist_df3_in_df4}')
print(f'Number of listing_ids from df3 that do not exist in df3: {non_exist_df3_in_df4}')

All listing_ids from df3 exist in df3: True
Number of listing_ids from df3 that exist in df4: 2685
Number of listing_ids from df3 that do not exist in df3: 0


In [None]:
# Check if all listing_ids from df4 exist in df3
all_df4_in_df3 = unique_df4.issubset(unique_df3)

# Count the number of listing_ids from df4 that exist in df3
exist_df4_in_df3 = len(unique_df4.intersection(unique_df3))

# Count the number of listing_ids from df4 that do not exist in df3
non_exist_df4_in_df3 = len(unique_df4 - unique_df3)

print(f"All ids from df4 exist in df3: {all_df4_in_df3}")
print(f"Number of ids from df4 that exist in df3: {exist_df4_in_df3}")
print(f"Number of ids from df4 that do not exist in df3: {non_exist_df4_in_df3}")

All listing_ids from df4 exist in df3: False
Number of listing_ids from df4 that exist in df3: 2685
Number of listing_ids from df4 that do not exist in df3: 8623


## Filtering Rows for `df4`
### Step 1: Extract Unique `listing_id`s from `df3`/`merge_df`

In [None]:
# valid_ids = merged_df['listing_id'].unique()

### Step 2: Filter `df4` to Keep Rosw with Matching `id`s

In [None]:
df4_filtered = df4[df4['id'].isin(unique_df3)] # alter: df4[df4['id'].isin(valid_ids)]

### Step 3: Verify the Results

In [94]:
print(f"Number of rows in df3_filtered: {len(df4_filtered)}")
print(f"Number of unique ids in df3_filtered: {df4_filtered['id'].nunique()}")

Number of rows in df3_filtered: 2685
Number of unique ids in df3_filtered: 2685


In [95]:
df4_filtered.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,197677,https://www.airbnb.com/rooms/197677,20210230000000.0,2/26/2021,Oshiage Holiday Apartment,<b>The space</b><br />We are happy to welcome ...,,https://a0.muscache.com/pictures/38437056/d27f...,964081,https://www.airbnb.com/users/show/964081,...,10.0,9.0,10.0,M130003350,f,1,1,0,0,1.44
2,899003,https://www.airbnb.com/rooms/899003,20210230000000.0,2/27/2021,"Classy room @Shinjuku, Takadanoba",..*+;.*:' Popular room in Airbnb@Tokyo *;+:..*...,,https://a0.muscache.com/pictures/20005274/1df3...,4799233,https://www.airbnb.com/users/show/4799233,...,9.0,9.0,9.0,Other reasons | 【マンスリー契約】最低30泊以上の一時使用賃貸借契約を結びま...,f,2,2,0,0,0.95
3,1016831,https://www.airbnb.com/rooms/1016831,20210230000000.0,2/27/2021,WOMAN ONLY LICENSED ! Cosy & Cat behnd Shibuya,female travellers here only.<br />Sorry no boy...,The location is walkable distance to famous Sh...,https://a0.muscache.com/pictures/20134416/5c34...,5596383,https://www.airbnb.com/users/show/5596383,...,10.0,10.0,10.0,M130001107,f,1,0,1,0,2.19
4,1033276,https://www.airbnb.com/rooms/1033276,20210230000000.0,2/26/2021,private room @Senju area,Our house is new and clean <br />Big hub stati...,There are shopping mall near Senjuohashi stati...,https://a0.muscache.com/pictures/71577415/b060...,5686404,https://www.airbnb.com/users/show/5686404,...,10.0,9.0,9.0,M130007760,f,2,0,2,0,0.76
5,1096292,https://www.airbnb.com/rooms/1096292,20210230000000.0,2/26/2021,Home stay with a Japanese language teacher fam...,★*:.★*:. Must Read .:*★.:*★<br />*:.Maximum 5...,This area Shinjuku is most convenient town in ...,https://a0.muscache.com/pictures/miso/Hosting-...,6018145,https://www.airbnb.com/users/show/6018145,...,10.0,10.0,10.0,M130002814,f,4,0,4,0,2.78


In [96]:
df4_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2685 entries, 0 to 2946
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            2685 non-null   int64  
 1   listing_url                                   2685 non-null   object 
 2   scrape_id                                     2685 non-null   float64
 3   last_scraped                                  2685 non-null   object 
 4   name                                          2685 non-null   object 
 5   description                                   2685 non-null   object 
 6   neighborhood_overview                         2176 non-null   object 
 7   picture_url                                   2685 non-null   object 
 8   host_id                                       2685 non-null   int64  
 9   host_url                                      2685 non-null   object

## Export CSV

In [97]:
df4_filtered.to_csv('Tokyo_2021_listing.csv')

## Check for Duplicates

### Duplicates in `df1`

In [98]:
# Check duplicates in df1
total_rows_df1 = len(df1)
unique_listing_ids_df1 = df1['listing_id'].nunique()
duplicate_rows_df1 = total_rows_df1 - unique_listing_ids_df1
duplicated_ids_df1 = df1['listing_id'].value_counts()[df1['listing_id'].value_counts() > 1]

print("Results for df1:")
print(f"- Total rows: {total_rows_df1}")
print(f"- Unique listing_ids: {unique_listing_ids_df1}")
print(f"- Total duplicate rows: {duplicate_rows_df1}")
print(f"- Number of listing_ids with duplicates: {len(duplicated_ids_df1)}")
if not duplicated_ids_df1.empty:
    print("- Example duplicated listing_ids (ID: count):")
    print(duplicated_ids_df1.head().to_string())

Results for df1:
- Total rows: 887404
- Unique listing_ids: 2874
- Total duplicate rows: 884530
- Number of listing_ids with duplicates: 2874
- Example duplicated listing_ids (ID: count):
listing_id
6750472     310
28209969    310
27994110    310
28269901    310
11049003    310


### Duplicates in `df2`

In [99]:
# Check duplicates in df2
total_rows_df2 = len(df2)
unique_listing_ids_df2 = df2['listing_id'].nunique()
duplicate_rows_df2 = total_rows_df2 - unique_listing_ids_df2
duplicated_ids_df2 = df2['listing_id'].value_counts()[df2['listing_id'].value_counts() > 1]

print("\nResults for df2:")
print(f"- Total rows: {total_rows_df2}")
print(f"- Unique listing_ids: {unique_listing_ids_df2}")
print(f"- Total duplicate rows: {duplicate_rows_df2}")
print(f"- Number of listing_ids with duplicates: {len(duplicated_ids_df2)}")
if not duplicated_ids_df2.empty:
    print("- Example duplicated listing_ids (ID: count):")
    print(duplicated_ids_df2.head().to_string())


Results for df2:
- Total rows: 901961
- Unique listing_ids: 15551
- Total duplicate rows: 886410
- Number of listing_ids with duplicates: 15551
- Example duplicated listing_ids (ID: count):
listing_id
38526633    61
370759      58
5527486     58
3871442     58
3885341     58


### Duplicates in `df4`

In [None]:
# Check duplicates in df4
total_rows_df4 = len(df4)
unique_ids_df4 = df4['id'].nunique()
duplicate_rows_df4 = total_rows_df4 - unique_ids_df4
duplicated_ids_df4 = df4['id'].value_counts()[df4['id'].value_counts() > 1]

print("\nResults for df4:")
print(f"- Total rows: {total_rows_df4}")
print(f"- Unique ids: {unique_ids_df4}")
print(f"- Total duplicate rows: {duplicate_rows_df4}")
print(f"- Number of ids with duplicates: {len(duplicated_ids_df4)}")
if not duplicated_ids_df4.empty:
    print("- Example duplicated ids (ID: count):")
    print(duplicated_ids_df4.head().to_string())


Results for df4:
- Total rows: 11308
- Unique ids: 11308
- Total duplicate rows: 0
- Number of ids with duplicates: 0


### Duplicates in `merged_df`

In [102]:
# Check duplicates in merged_df
total_rows_merged_df = len(merged_df)
unique_listing_ids_merged_df = merged_df['listing_id'].nunique()
duplicate_rows_merged_df = total_rows_merged_df - unique_listing_ids_merged_df
duplicated_ids_merged_df = merged_df['listing_id'].value_counts()[merged_df['listing_id'].value_counts() > 1]

print("\nResults for merged_df:")
print(f"- Total rows: {total_rows_merged_df}")
print(f"- Unique listing_ids: {unique_listing_ids_merged_df}")
print(f"- Total duplicate rows: {duplicate_rows_merged_df}")
print(f"- Number of listing_ids with duplicates: {len(duplicated_ids_merged_df)}")
if not duplicated_ids_merged_df.empty:
    print("- Example duplicated listing_ids (ID: count):")
    print(duplicated_ids_merged_df.head().to_string())


Results for merged_df:
- Total rows: 976710
- Unique listing_ids: 2685
- Total duplicate rows: 974025
- Number of listing_ids with duplicates: 2685
- Example duplicated listing_ids (ID: count):
listing_id
22590719    365
27843553    365
28021551    365
28035202    365
27853052    365


In [104]:
merged_df.query('listing_id == 22590719')

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
250,22590719,2021-02-25,f,"$10,500.00","$3,675.00",1.0,1125.0
1554,22590719,2021-02-26,f,"$10,500.00","$3,675.00",1.0,1125.0
3977,22590719,2021-02-27,f,"$10,500.00","$3,675.00",1.0,1125.0
6662,22590719,2021-02-28,f,"$10,500.00","$3,675.00",1.0,1125.0
9347,22590719,2021-03-01,f,"$10,500.00","$3,675.00",1.0,1125.0
...,...,...,...,...,...,...,...
914445,22590719,2021-02-20,f,"$10,000.00","$10,000.00",1.0,1125.0
914446,22590719,2021-02-21,f,"$10,000.00","$10,000.00",1.0,1125.0
914447,22590719,2021-02-22,f,"$10,000.00","$10,000.00",1.0,1125.0
914448,22590719,2021-02-23,f,"$10,000.00","$10,000.00",1.0,1125.0
