## 에어비앤비 데이터 정리
### Inside_Airbnb 데이터 정보 : <a href = 'https://docs.google.com/spreadsheets/d/1iWCNJcSutYqpULSQHlNyGInUvHg2BoUGoNRIGa6Szc4/edit#gid=150111846'>Inside Airbnb Data Dictionary</a>



In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np

## 데이터 살펴보기

### reviews.csv.gz : 리뷰

In [None]:
!pip install xlsxwriter


In [None]:
# colab
# base_path = '/content/drive/MyDrive/Colab Notebooks/Data Project/Data/'

#vscode 
base_path = '../data/'
review_gz = pd.read_csv(base_path+'reviews.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')
review_gz.head()



#### neighbourhoods.geojson

In [None]:
import json, pandas as pd

In [None]:
with open(base_path+'neighbourhoods.geojson') as f:
    json_f = json.loads(f.read()) # json 라이브러리 이용

df = pd.DataFrame(json_f)
df = pd.read_json(base_path+'neighbourhoods.geojson') # pd.read_json 이용


In [None]:
df.head()

#### reviews.csv

In [None]:
review = pd.read_csv(base_path+'reviews.csv')
review

#### neighbourhoods.csv

In [None]:
neighbor = pd.read_csv(base_path+'neighbourhoods.csv')
neighbor

### 가장 정보가 많은 데이터 

#### listings.csv.gz



In [None]:
listing_gz = pd.read_csv(base_path+'listings.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')
listing_gz.head()

In [None]:
listing_gz.columns

In [None]:
# Select the candidate model-input columns. Every label appears exactly once:
# repeating a label in the selection list yields a duplicated column.
input_data = listing_gz[['neighbourhood', 'price', 'amenities', 'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds', 'review_scores_rating', 'review_scores_accuracy']]
input_data

In [None]:
listing_gz.columns

#### listings.csv

In [None]:
listings = pd.read_csv(base_path+'listings.csv')
listings.tail()

#### calendar.csv.gz

In [None]:
calendar_gz = pd.read_csv(base_path+'calendar.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')
calendar_gz.head()

In [None]:
listing_gz.iloc[0,:]

## 에어비앤비 주요 지역 탐색

### 주요지역 추출에 도움이 될만한 컬럼 선택

- listing_gz :  latitude, longitude, neighbourhood_cleansed

- listings :  latitude, longitude, **neighbourhood**


In [None]:
listing_gz.loc[0,['latitude', 'longitude', 'neighbourhood', 'neighbourhood_cleansed']]

In [None]:
listings.loc[0,['latitude', 'longitude', 'neighbourhood']]

In [None]:
listings.info()

In [None]:
print(len(listings['neighbourhood'].unique()))
listings['neighbourhood'].unique()

In [None]:
listings['neighbourhood'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
plt.figure(figsize=(14,6))
sns.countplot(x='neighbourhood',data=listings,
              order=listings['neighbourhood'].value_counts().head(10).index,
              palette='RdPu',
              edgecolor=sns.color_palette('dark',10))
plt.xticks(rotation=50)
plt.ylabel('Count of District', weight='bold',size = 15)
plt.xlabel('District', weight='bold', size = 15)
plt.title('TOP 10 District', weight='bold' ,size = 20)
plt.grid(True)
plt.show()

In [None]:
import geopandas as gpd
import folium
import json

In [None]:
neighbor_cnt = listings['neighbourhood'].value_counts()

### 도쿄 지역 에어비앤비 분포 시각화

In [None]:
# Load the neighbourhood boundaries (GeoJSON). The file is read as UTF-8
# explicitly so the platform's default encoding does not matter.
geojson_path = base_path + 'neighbourhoods.geojson'

with open(geojson_path, encoding='utf-8') as f:
    geo_data = json.load(f)

# Create the Folium map. It is bound to `tokyo_map` rather than `map` so the
# builtin `map` stays usable in later cells.
tokyo_map = folium.Map(location=[35.6894875, 139.6917064], zoom_start=11, width='80%', height='80%')

# Choropleth layer coloured by the number of listings per neighbourhood
folium.Choropleth(
    geo_data=geo_data,
    data=neighbor_cnt,
    columns=[neighbor_cnt.index, neighbor_cnt.values],
    key_on='feature.properties.neighbourhood',
    fill_color='YlGnBu',
    fill_opacity=0.5,
    legend_name='Airbnb density scale'
).add_to(tokyo_map)


# One marker per feature, labelled with the neighbourhood name
for feature in geo_data['features']:
    neighborhood = feature['properties']['neighbourhood']
    # NOTE(review): [0][0] presumably selects the outer ring of the first
    # polygon of a MultiPolygon geometry - confirm against the GeoJSON file.
    boundary = feature['geometry']['coordinates'][0][0]
    # Marker position is the plain average of the ring's vertices. Each vertex
    # is indexed as [1] for the first map coordinate and [0] for the second.
    center = [sum(coord[1] for coord in boundary) / len(boundary),
              sum(coord[0] for coord in boundary) / len(boundary)]
    icon = folium.Icon(icon='home', color='gray')  # home icon

    folium.Marker(location=center, popup=neighborhood, icon=icon).add_to(tokyo_map)


# Extra tile layer. 'CartoDB positron' is used instead of 'Stamen Toner',
# which recent folium releases no longer ship as a built-in tileset.
folium.TileLayer('CartoDB positron').add_to(tokyo_map)

# Save the map and display it
tokyo_map.save('map.html')

tokyo_map

### 에어비앤비 지역 관련 컬럼 

In [None]:
listing_gz[['neighbourhood_cleansed','host_neighbourhood','neighborhood_overview']]

#### 에어비앤비 동네 후기

In [None]:
listing_gz['neighborhood_overview']

In [None]:
listing_gz['neighborhood_overview'].isnull().sum() # 11177 중 3456 널값 
listing_gz['neighborhood_overview'].isnull().groupby(listing_gz['neighbourhood_cleansed']).sum().sort_values(ascending=False)

### 입력, 출력 데이터 정리


#### 입력 데이터 

- 지역 : 도쿄 -  시/구 , 동/읍/면,  
- 가격 : 범위 (한국 기준)
- 룸 타입 : Private room, Entire home/apt 등 (room_type)
- 수용 인원 : (accommodates)
- 화장실 : (bathrooms_text) 
- 침실 (bedrooms )
- 침대 개수 (beds)
- 별점(review_scores_rating) : 범위 
#### 출력 데이터 


<b>에어비앤비에서 제공하는 데이터</b>
- 에어비앤비 링크 (listing_url)
- 호스트가 작성한 숙소 설명 (description)
- 호스트가 작성한 동네 설명 (neighborhood_overview)
- 호스트 설명 (host_about)
- 호스트신원 여부 (host_identity_verified)
- 정확한 가격 (price)
- 최소 숙박일 수 (minimum_nights)
- 캘린더가 마지막으로 업데이트 된 날 (calendar_updated)
- 30일간 리뷰 수 (number_of_reviews_l30d)
- 목록의 마지막 리뷰 날짜 (last_review)
- 전체 평점 점수 (review_scores_rating)
---------------------

<b>딥러닝을 통해 제공할 데이터 </b>
- 치안 정보 : 오시마랜드와 절도 관련 오픈데이터 이용 (ex. 주변에서 날치기가 빈번하게 발생, 투신 자살 이슈가 있는 아파트)
- 에어비앤비 치안 포함 별점 : 치안 정보를 포함한 알고리즘 생성 

In [None]:
import pandas as pd

base_path = '../data/'

listing_gz = pd.read_csv(base_path+'listings.csv.gz', compression='gzip', header=0, sep=',', quotechar='"')
listing_gz.head()

In [None]:
input_cols = ['neighbourhood_cleansed', 'price', 'room_type', 'accommodates', 'bathrooms_text', 'bedrooms' , 'beds', 'review_scores_rating', 'latitude', 'longitude']
airbnb_df = listing_gz[input_cols]
airbnb_df.head()

### 오시마랜드 데이터

In [None]:
import pandas as pd
oshima_yun = pd.read_csv('../../Oshimaland_data/yunyoung/selenium_tokyo_data.csv', index_col=0)
oshima_ji = pd.read_csv('../../Oshimaland_data/jieun/oshimaland_dataset_final.csv')

In [None]:
oshima_yun.tail()

In [None]:
oshima_ji.tail()

In [None]:
oshima_yun = oshima_yun.reindex(columns=['district', 'address', 'content', 'date'])
oshima_yun.rename(columns={'content': 'info'}, inplace=True)
oshima_yun.head()

In [None]:
oshima_ji_rename = oshima_ji.reindex(columns=[ 'address', 'info', 'occurred_date'])
oshima_ji_rename.rename(columns={'occurred_date': 'date'}, inplace=True)
oshima_ji_rename.head()

#### 중복데이터 확인

In [None]:
oshima_yun.duplicated().sum()

In [None]:
oshima_yun = oshima_yun.drop_duplicates()
oshima_yun.reset_index(drop=True, inplace=True)
oshima_yun

In [None]:
oshima_ji_rename.duplicated().sum()

In [None]:
oshima_ji_rename = oshima_ji_rename.drop_duplicates()
oshima_ji_rename.reset_index(drop=True, inplace=True)
oshima_ji_rename

#### 결측치 확인 

In [None]:
oshima_yun.isnull().sum()

In [None]:
oshima_yun.dropna(subset=['info'], inplace=True)
print(oshima_yun.isnull().sum())
oshima_yun

In [None]:
oshima_ji_rename.isnull().sum()

#### district 컬럼 생성

In [None]:
jp_district_list = [
    '新宿区', '台東区', '墨田区', '豊島区', '渋谷区',
    '港区', '世田谷区', '大田区', '中野区', '中央区',
    '北区', '葛飾区', '杉並区', '江東区', '江戸川区',
    '文京区', '荒川区', '板橋区', '品川区', '千代田区',
    '足立区', '目黒区', '練馬区', '府中市', '八王子市',
    '日野市', '国分寺市', '町田市', '青梅市', '調布市',
    '武蔵野市', 'あきる野市', '三鷹市', '小金井市', '東村山市',
    '多摩市', '国立市', '狛江市', '西東京市', '立川市',
    '小平市', '羽村市', '武蔵村山市', '奥多摩町', '昭島市',
    '福生市'
]

In [None]:
oshima_ji_rename = oshima_ji_rename.reindex(columns = ['district', 'address', 'info', 'date'])
oshima_ji_rename

In [None]:
# Derive `district` from the address: the first entry of jp_district_list that
# occurs in the address wins; an address with no match keeps the full address
# as its district. Non-string addresses (e.g. NaN) are passed through unchanged
# instead of raising TypeError on the substring test.
# (The former discarded `reindex(...)` call was a no-op and has been removed.)
oshima_ji_rename['district'] = oshima_ji_rename['address'].apply(
    lambda x: next((district for district in jp_district_list if district in x), x)
    if isinstance(x, str) else x
)
oshima_ji_rename

In [None]:
# 공통 컬럼 
same_oshima = oshima_yun.merge(oshima_ji_rename, how='inner')
same_oshima

In [None]:
# 데이터 합치기 

combined_df = pd.concat([oshima_yun, oshima_ji_rename], ignore_index=True)
combined_df

### EDA 추가 -230825

- Dataset_Processing 폴더에서 위도·경도 정보가 추가된 에어비앤비 데이터 가져오기

In [None]:
import pandas as pd 

airbnb_data= pd.read_csv('../../Dataset_Processing/yunyoung/data/airbnb_data_add_latlng.csv', index_col=0)
airbnb_data

### 결측치 확인 
- 각 컬럼은 입력 데이터이므로 결측치가 있는 데이터는 제거해준다.

In [None]:
airbnb_data.isna().sum()

### 결측치 제거 후 11177 -> 7863 데이터만 존재

In [None]:
airbnb_data.dropna(inplace = True)
airbnb_data.reset_index(drop=True, inplace=True)
airbnb_data

In [None]:
airbnb_data['room_type'].unique()

In [None]:
airbnb_data.columns

In [None]:
from collections import Counter

neighbourhood_counts = Counter(airbnb_data['neighbourhood_cleansed'])
print('neighbourhood_counts : ',neighbourhood_counts)

room_type_counts = Counter(airbnb_data['room_type'])
print('room_type_counts : ',room_type_counts)

accommodates_counts = Counter(airbnb_data['accommodates'])
print('accommodates_counts : ',accommodates_counts)

bathrooms_counts = Counter(airbnb_data['bathrooms_text'])
print('bathrooms_counts : ',bathrooms_counts)

bedrooms_counts = Counter(airbnb_data['bedrooms'])
print('bedrooms_counts : ',bedrooms_counts)

beds_counts = Counter(airbnb_data['beds'])
print('beds_counts : ',beds_counts)

bedroomstext_counts = Counter(airbnb_data['bedrooms'])
print('bedroomstext_counts : ',bedroomstext_counts)


### 데이터 시각화

In [None]:
 !pip install chart_studio

In [None]:
# `import chart_studio.plotly as py` binds only the name `py`, so the package
# name `chart_studio` used below must be imported explicitly.
import chart_studio.tools
import chart_studio.plotly as py

username = 'username' # your username
api_key = 'api_key' # your api key - go to profile > settings > regenerate key
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [None]:
import plotly.express as px

def create_bar_chart(x_data, y_data, x_title, y_title, chart_title, isMark = True):
    """Draw a bar chart with plotly express, show it and return the figure.

    Args:
        x_data: Bar labels, one per bar.
        y_data: Bar heights, in the same order as ``x_data``; also used as the
            bar colour scale.
        x_title: Title of the x axis.
        y_title: Title of the y axis.
        chart_title: Title of the whole chart.
        isMark: When true, the value of a bar is written above it for the
            first five and the last four bars.

    Returns:
        The plotly figure that was shown.
    """
    fig = px.bar(x=x_data, y=y_data, color=y_data)
    fig.update_layout(title=chart_title)
    fig.update_xaxes(title=x_title)
    fig.update_yaxes(title=y_title)

    if isMark:
        # Annotate from the function's own arguments, not from notebook-level
        # globals, so the labels always belong to the chart being drawn.
        bar_count = len(y_data)
        for i, (label, count) in enumerate(zip(x_data, y_data)):
            if i < 5 or i > bar_count - 5:
                fig.add_annotation(
                    x=label,
                    y=count + 80,  # fixed offset that lifts the text above the bar
                    text=str(count),
                    font=dict(color='black', size=12),
                    showarrow=False
                )

    fig.show()

    return fig

    


In [None]:
def sorted_count(data):
    """Split a mapping of counts into keys and counts, largest count first.

    Args:
        data: Mapping from a key to its count (e.g. a ``Counter``).

    Returns:
        A ``(keys, counts)`` tuple of two lists in matching order, sorted by
        count in descending order.
    """
    ranked = sorted(data.items(), key=lambda pair: pair[1], reverse=True)
    keys = [key for key, _ in ranked]
    counts = [count for _, count in ranked]
    return keys, counts


In [None]:
sorted_neighbourhoods, sorted_counts = sorted_count(neighbourhood_counts)

fig = create_bar_chart(
    x_data=sorted_neighbourhoods,
    y_data=sorted_counts,
    x_title='neighbourhoods',
    y_title='Counts',
    chart_title='neighbourhoods Counts'
)


In [None]:
py.plot(fig, filename = 'airbnb_neigborhoof', auto_open=True)


In [None]:
import chart_studio.tools as tls
tls.get_embed('https://plotly.com/~Jyundev/1/')

In [None]:
sorted_room_type, sorted_counts = sorted_count(room_type_counts)

create_bar_chart(
    x_data=sorted_room_type,
    y_data=sorted_counts,
    x_title='room_type',
    y_title='Counts',
    chart_title='room_type Counts',
    isMark = False
)


In [None]:
sorted_bedrooms, sorted_counts = sorted_count(bedrooms_counts)

create_bar_chart(
    x_data=sorted_bedrooms,
    y_data=sorted_counts,
    x_title='bedrooms',
    y_title='Counts',
    chart_title='bedrooms Counts',
    isMark = False
)


### 가격 

In [None]:
# Strip the '$' and ',' characters from the price strings and convert to float.
# regex=False makes both patterns literal ('$' is the end-of-string anchor when
# interpreted as a regular expression). The dtype guard keeps the cell
# re-runnable: once the column is numeric the `.str` accessor would fail.
if not pd.api.types.is_numeric_dtype(airbnb_data['price']):
    airbnb_data['price'] = (
        airbnb_data['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
airbnb_data['price']

In [None]:
airbnb_price = 	airbnb_data[['neighbourhood_cleansed', 'price']]	
sorted_df = airbnb_price.sort_values(by='price')
sorted_df

In [None]:
import numpy as np

def identify_outliers_iqr(data):
    """Compute IQR-based bounds for ``data`` and collect values outside them.

    The lower bound is ``Q1 - 1.5 * IQR`` but never below the smallest value
    in ``data``; the upper bound is ``Q3 + 1.5 * IQR``.

    Args:
        data: Iterable of numeric values.

    Returns:
        A ``(lower_bound, upper_bound, outliers)`` tuple, where ``outliers``
        is the list of values below the lower or above the upper bound.
    """
    first_quartile, third_quartile = np.percentile(data, [25, 75])
    spread = third_quartile - first_quartile  # interquartile range

    lower_bound = max(first_quartile - 1.5 * spread, np.min(data))
    upper_bound = third_quartile + 1.5 * spread

    outliers = []
    for value in data:
        if value < lower_bound or value > upper_bound:
            outliers.append(value)

    return lower_bound, upper_bound, outliers


In [None]:
lower_bound, upper_bound, outliers = identify_outliers_iqr(sorted_df['price'])
lower_bound

In [None]:
import plotly.express as px

def create_colored_histogram(data, x_column, color_column, x_title, y_title, chart_title):
    """Show a plotly express histogram of ``x_column`` coloured by ``color_column``.

    Args:
        data: Data passed to ``px.histogram``.
        x_column: Column plotted on the x axis.
        color_column: Column that determines the bar colour.
        x_title: Title of the x axis.
        y_title: Title of the y axis.
        chart_title: Title of the whole chart.
    """
    histogram = (
        px.histogram(data, x=x_column, color=color_column)
        .update_layout(title=chart_title)
        .update_xaxes(title=x_title)
        .update_yaxes(title=y_title)
    )
    histogram.show()


def create_scatter_plot(data, x_col, y_col, title, color, outliers=False):
    """Show a plotly express scatter plot, optionally with IQR bound lines.

    Args:
        data: Data passed to ``px.scatter``.
        x_col: Column plotted on the x axis.
        y_col: Column plotted on the y axis.
        title: Chart title.
        color: Values (or column name) that drive the marker colour.
        outliers: When true, horizontal lines are drawn at the upper and lower
            bounds returned by ``identify_outliers_iqr`` for ``data[y_col]``.
    """
    fig = px.scatter(data,
                     x=x_col,
                     y=y_col,
                     title=title,
                     color=color,
                     color_continuous_scale='Viridis')

    if outliers:
        lower_bound, upper_bound, _ = identify_outliers_iqr(data[y_col])
        # A horizontal rule per bound; no need to build a constant-valued line
        # trace with one point per data row.
        for bound in (upper_bound, lower_bound):
            fig.add_hline(y=bound, line_dash='dash')

    fig.show()


In [None]:
create_scatter_plot(sorted_df,
                    x_col='neighbourhood_cleansed', 
                    y_col='price', 
                    title='Price Scatter Plot',
                    color=sorted_df['price'],
                    outliers=True)


In [None]:
def find_price_outliers(data, threshold=3):
    """Return the values whose absolute z-score exceeds ``threshold``.

    Args:
        data: Iterable of numeric values (list, array, pandas Series, ...).
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        List of the outlying values, in their original order. Empty when
        ``data`` is empty or has zero standard deviation.
    """
    # Materialise the values once so they are paired with their z-scores by
    # position; indexing a pandas Series with an integer is a label lookup and
    # picks the wrong row when the index is not 0..n-1.
    values = list(data)
    if not values:
        return []

    mean = np.mean(values)
    std = np.std(values)
    if std == 0:
        # All values are identical: nothing can be an outlier.
        return []

    return [x for x in values if np.abs((x - mean) / std) > threshold]


### 리뷰 데이터 확인 

In [None]:
review_cols = ['id',  'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', \
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value',]

listing_gz[review_cols ]

### 에어비앤비 리뷰 리스트는 에어비앤비 별 사용자의 리뷰 데이터가 담겨있음

In [None]:
review_gz.head()

In [None]:
review_gz['comments']

In [None]:
len(review_gz)

#### GoogleTranslator 를 이용하여 모든 리뷰 데이터를 한국어로 번역  
#### 용량 문제로 코랩에서 시도 중 

In [None]:
# from KitContainer.LinguaUtil import translate_text as translate
# #from Utils.KitContainer.LinguaUtil import translate_text as translate
# review_gz['kr_comments'] = review_gz['comments'].apply(lambda x : translate('auto', 'ko', x))
# review_gz

In [None]:
review_gz.head(40)

In [None]:
# reviews = []

# # 데이터를 50000개씩 나눔
# for i in range(0, len(review_gz), 50000):
#     review = review_gz[i:i+50000]
#     reviews.append(review)
# review1, review2, review3, review4, review5, review6, review7, review8, review9 = reviews


In [None]:
from deep_translator import GoogleTranslator
from tqdm import tqdm

# Translation helper
def translate_text(source, target, text):
    """Translate ``text`` with ``GoogleTranslator`` and return the result.

    Args:
        source: Source language passed to ``GoogleTranslator``.
        target: Target language passed to ``GoogleTranslator``.
        text: Text to translate.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for ``text``.
    """
    return GoogleTranslator(source=source, target=target).translate(text)

# Translate every entry of the 'comments' column and store it in 'kr_comments'
review_gz['kr_comments'] = ""

# Show progress with tqdm; the total is the real row count so the bar stays
# accurate for any size of review_gz.
with tqdm(total=len(review_gz)) as pbar:
    for index, row in review_gz.iterrows():
        comment = row['comments']
        # Only strings are sent to the translator; a missing comment (NaN) is
        # not text, so its 'kr_comments' entry is left empty.
        if isinstance(comment, str):
            review_gz.at[index, 'kr_comments'] = translate_text('auto', 'ko', comment)
        pbar.update(1)  # advance the progress bar

In [None]:
#review_gz.to_csv('translated_reviews.csv', index=False) #* 용량 과다

In [None]:
review_gz.head(100)

In [17]:
df = pd.read_csv('C:\\Users\\lucky\\Documents\\COLLABORATION\\AirbnbWise\\Tokyo_Airbnb\\yunyoung\\translated_reviews.csv')

  df = pd.read_csv('C:\\Users\\lucky\\Documents\\COLLABORATION\\AirbnbWise\\Tokyo_Airbnb\\yunyoung\\translated_reviews.csv')


In [18]:
df = df.loc[df['kr_comments'].notnull()]
df

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,kr_comments
0,197677,554285,2011-09-21,1002142,Pablo,Couldn’t get any better!\r<br/>\r<br/>The apar...,더 이상 나아질 수 없습니다!\r<br/>\r<br/>아파트 자체는 훌륭합니다. 여...
1,197677,627651,2011-10-14,1031940,Ana & Ricardo,The apartment is bigger than it looks in the p...,아파트는 사진에서 보는 것보다 더 큽니다. 커플에게 딱 맞습니다. 깨끗하고 잘 관리...
2,197677,733040,2011-11-21,1097040,Samuel,The appartement is perfect for a couple! It is...,아파트는 커플에게 딱 맞습니다! 조금 작지만 정말 완벽한 가구가 그것을 보완하고 일...
3,197677,755841,2011-11-30,1183674,Lisa,We had a terrific stay at Yoshimi and Marek's ...,우리는 요시미와 마렉의 사랑스러운 아파트에서 즐거운 시간을 보냈습니다. 모든 것이 ...
4,197677,852938,2012-01-09,1538004,Langan,We loved our stay at the Oshiage Holiday Apart...,우리는 Oshiage Holiday Apartment에서의 숙박을 좋아했습니다. 아...
...,...,...,...,...,...,...,...
2912,1249571,67415653,2016-03-29,23523928,Min,Yume is always taking care of everything . Her...,Yume는 항상 모든 것을 돌보고 있습니다. 그녀의 집은 항상 도쿄에서 나의 첫 번...
2913,1249571,68426185,2016-04-04,29841156,Daniel,Yume is a generous and attentive host who welc...,"Yume는 우리를 따뜻하게 환영하고, 우리를 아파트에 소개하고, 우리를 대신하여 문..."
2914,3846001,449904684,2019-05-08,230135014,Luke,Had an amazing stay at Manami’s place. <br/>S...,마나미의 숙소에서 즐거운 시간을 보냈습니다. <br/>도착 시 매우 깨끗하고 깔끔합...
2915,3846001,461625962,2019-06-01,30085093,Katie,Great little apartment with everything you nee...,필요한 모든 것을 갖춘 훌륭한 작은 아파트입니다. 고엔지역과 가까워서 위치가 좋습니다.


In [19]:
df.to_csv('translated_ko_data.csv', index=False)

In [None]:
#* https://www.kaggle.com/datasets/zinnie1025/translated-review-ko-airbnb(데이터 링크)