| Input Data | Output Data |
| --- | --- |
| Region | Tokyo - city/ward, district/town/village |
| Price range (Korean standards) | Accurate price (price) |
| Room Type | Private room, Entire home/apt |
| Accommodates: number of guests | Host-written accommodation description (neighborhood_overview) |
| Bathrooms (bathrooms_text) | Host identity verification (host_identity_verified) |
| Bedrooms (bedrooms) | Customer-written accommodation description (description) |
| Beds: number of beds | Minimum number of nights (minimum_nights) |
| Rating (review_scores_rating) range | Overall rating score (review_scores_rating), Number of reviews in the last 30 days (number_of_reviews_l30d), Date of the last review (last_review), Date calendar was last updated (calendar_updated) |

In [19]:
import pandas as pd
import numpy as np
import random

In [20]:
# Absolute path to the gzip-compressed Tokyo Airbnb listings CSV.
data_file = '/Users/genie/Documents/COLLABORATION/AirbnbWise/Tokyo_Airbnb/data/listings.csv.gz'

# Parse the compressed CSV into a DataFrame; the first row holds the column
# names, fields are comma-separated and quoted with double quotes.
listing_gz = pd.read_csv(
    data_file,
    sep=',',
    header=0,
    quotechar='"',
    compression='gzip',
)

# Print the column names, non-null counts and dtypes as a quick sanity check.
listing_gz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11177 entries, 0 to 11176
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            11177 non-null  int64  
 1   listing_url                                   11177 non-null  object 
 2   scrape_id                                     11177 non-null  int64  
 3   last_scraped                                  11177 non-null  object 
 4   source                                        11177 non-null  object 
 5   name                                          11177 non-null  object 
 6   description                                   11176 non-null  object 
 7   neighborhood_overview                         7721 non-null   object 
 8   picture_url                                   11177 non-null  object 
 9   host_id                                       11177 non-null 

### User Class

In [21]:
class User:
    """Plain record holding the attributes of one user entry.

    Every constructor argument is stored unchanged on the instance under the
    same name; no validation or type conversion is performed.

    Args:
        user_id: Identifier assigned to this user entry.
        host_id: Host identifier taken from the listing data.
        review_scores_rating: Rating value associated with the entry.
        bedrooms: Number of bedrooms.
        beds: Number of beds.
        bathrooms: Bathroom description or count.
        region: Region / neighbourhood name (optional).
        price: Price value (optional).
        accommodates: Number of guests accommodated (optional).
        minimum_nights: Minimum number of nights (optional).
        id: Optional extra identifier. The name shadows the ``id`` builtin
            inside ``__init__`` but is kept for backward compatibility.
        password: Optional password value.
    """

    def __init__(self, user_id, host_id, review_scores_rating, bedrooms, beds, bathrooms, region = None, price = None, accommodates = None, minimum_nights = None, id = None, password = None):
        self.user_id = user_id
        self.host_id = host_id
        self.review_scores_rating = review_scores_rating
        self.bedrooms = bedrooms
        self.beds = beds
        self.bathrooms = bathrooms
        # Previously accepted but silently discarded; keep it so callers that
        # pass ``id=...`` can read the value back.
        self.id = id
        self.password = password
        self.region = region
        self.price = price
        self.accommodates = accommodates
        self.minimum_nights = minimum_nights
        

In [22]:
# # Randomly sample Airbnb listing ids (original note says 200; the code below draws 100 and repeats them 5 times)
# random_airbnb_id = random.sample(listing_gz.index.tolist(), 100)

# random_airbnb_id = random_airbnb_id * 5
# random.shuffle(random_airbnb_id)
# random_airbnb_id


In [23]:
import random

# Listing columns carried over into the user table.
selected_columns = [
    'id',
    'host_id',
    'review_scores_rating',
    'bedrooms',
    'beds',
    'bathrooms_text',
    'neighbourhood_cleansed',
    'price',
    'accommodates',
    'minimum_nights',
]

# Take a copy of just those columns, order the listings from the highest
# review score downwards, and renumber the rows 0..n-1 in that order.
sample = (
    listing_gz[selected_columns]
    .sort_values(by='review_scores_rating', ascending=False)
    .reset_index(drop=True)
)
sample


Unnamed: 0,id,host_id,review_scores_rating,bedrooms,beds,bathrooms_text,neighbourhood_cleansed,price,accommodates,minimum_nights
0,922833637376956356,490272024,5.0,1.0,1.0,1 bath,Koto Ku,"$9,600.00",2,1
1,727140790042920016,481595902,5.0,1.0,2.0,1 bath,Sumida Ku,"$11,536.00",4,2
2,729355347138793951,481729050,5.0,1.0,1.0,1 shared bath,Shinjuku Ku,"$21,600.00",3,1
3,30049203,68181338,5.0,1.0,6.0,1 private bath,Arakawa Ku,"$25,000.00",7,1
4,30047622,68181338,5.0,1.0,2.0,0 shared baths,Arakawa Ku,"$9,000.00",2,1
...,...,...,...,...,...,...,...,...,...,...
11172,922873759575839340,294322772,,1.0,3.0,1 bath,Shinjuku Ku,"$12,000.00",4,2
11173,922998222764343634,518343589,,1.0,2.0,1 bath,Shinjuku Ku,"$16,000.00",3,2
11174,923008447744339896,518343589,,1.0,2.0,1 bath,Shinjuku Ku,"$16,000.00",4,2
11175,923011844205437846,518343589,,3.0,6.0,1 bath,Shinjuku Ku,"$40,000.00",9,2


In [24]:
import csv
import os

# Build one User per row of `sample`, numbering users by row position.
user_list = []

# itertuples walks the rows positionally, so this no longer relies on the
# DataFrame index labels being exactly 0..n-1.
for i, row in enumerate(sample.itertuples(index=False)):
    price = row.price
    # Prices arrive as text such as "$9,600.00": strip the currency symbol and
    # thousands separators. Non-string values (e.g. a missing price read as
    # NaN) have no .replace and are passed through unchanged.
    if isinstance(price, str):
        price = price.replace('$', '').replace(',', '')
    user = User(
        user_id=i,
        host_id=row.host_id,
        review_scores_rating=row.review_scores_rating,
        bedrooms=row.bedrooms,
        beds=row.beds,
        bathrooms=row.bathrooms_text,
        region=row.neighbourhood_cleansed,
        price=price,
        accommodates=row.accommodates,
        minimum_nights=row.minimum_nights,
    )
    user_list.append(user)

# Write the data to a CSV file
csv_filename = 'csv/user_totaldb.csv'
# Make sure the target directory exists so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(csv_filename), exist_ok=True)
# Explicit UTF-8 keeps the output independent of the platform default encoding.
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['user_id', 'host_id','review_scores_rating', 'bedrooms', 'beds', 'bathrooms','region', 'price', 'accommodates', 'minimum_nights']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write the CSV header row
    writer.writeheader()

    # Write one row per user; every field name matches a User attribute name.
    writer.writerows(
        {name: getattr(user, name) for name in fieldnames}
        for user in user_list
    )

print(f'사용자 데이터가 {csv_filename} 파일에 성공적으로 저장되었습니다.')

사용자 데이터가 csv/user_totaldb.csv 파일에 성공적으로 저장되었습니다.


In [25]:
# Read the user CSV (same path the export cell writes to) back into a
# DataFrame and display it.
user_db = pd.read_csv('csv/user_totaldb.csv')
user_db

#* Each id appears only once, so rows are ordered by the highest-rated listings first

Unnamed: 0,user_id,host_id,review_scores_rating,bedrooms,beds,bathrooms,region,price,accommodates,minimum_nights
0,0,490272024,5.0,1.0,1.0,1 bath,Koto Ku,9600.0,2,1
1,1,481595902,5.0,1.0,2.0,1 bath,Sumida Ku,11536.0,4,2
2,2,481729050,5.0,1.0,1.0,1 shared bath,Shinjuku Ku,21600.0,3,1
3,3,68181338,5.0,1.0,6.0,1 private bath,Arakawa Ku,25000.0,7,1
4,4,68181338,5.0,1.0,2.0,0 shared baths,Arakawa Ku,9000.0,2,1
...,...,...,...,...,...,...,...,...,...,...
11172,11172,294322772,,1.0,3.0,1 bath,Shinjuku Ku,12000.0,4,2
11173,11173,518343589,,1.0,2.0,1 bath,Shinjuku Ku,16000.0,3,2
11174,11174,518343589,,1.0,2.0,1 bath,Shinjuku Ku,16000.0,4,2
11175,11175,518343589,,3.0,6.0,1 bath,Shinjuku Ku,40000.0,9,2
