In [1]:
# Adjusting current working directory to parent directory
from pathlib import Path
from os import chdir
from platform import system

try:
    current_directory
except: # First  run - initialize current_directory
    current_directory = Path.cwd()
    if system() == "Linux": # Colab
        from google.colab import drive
        drive.mount('/content/drive')
        current_directory = f"{current_directory}/drive/MyDrive/Colab Notebooks/RecTour2024Challenge"
    else:
        current_directory = current_directory.parent
finally:
    chdir(current_directory)



# External imports
import pandas as pd
import numpy as np

from random import randint

# import torch
# import torch.nn as nn

# from sentence_transformers import SentenceTransformer

# import tensorflow as tf
# from tensorflow.keras.layers import Input, Dense, Lambda
# from tensorflow.keras.models import Model
# from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# from tensorflow.keras.models import load_model



# Internal imports
from src.data.csv_tools import csv_to_dataframe, dataframe_to_csv, save_submission
from src.data.pickle_tools import save_to_pickle, load_pickle
from src.data.keras_tools import save_keras_model, load_keras_model
from src.utils.preprocessing_tools import *

In [2]:
csv_to_dataframe("train").head()

Unnamed: 0,user_id,accommodation_id,guest_type,guest_country,room_nights,month,accommodation_type,accommodation_country,accommodation_score,accommodation_star_rating,location_is_ski,location_is_beach,location_is_city_center,review_id,review_title,review_positive,review_negative,review_score,review_helpful_votes
0,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,ca87eb9f-3b2a-4e3b-9f86-6a8d14722a9e,Amazing hotel &amp; staff,I loved the hotel the rooms were amazing and t...,Nothing,10.0,0
1,776accc3-9f1b-4a2e-8616-aafcac7eeb1d,-1189343073,Solo traveller,Gobuf,2,3,Hotel,Italy,8.0,4.0,0,0,0,b555f9d4-176a-4813-b88d-54e0b676be9c,Spacious and Clean Room,Room was very spacious and clean. Staff were f...,Area of the hotel wasn&#39;t great but it was ...,8.0,0
2,e32f90a1-1580-4a87-86a1-7602d5543a15,-1454980525,Couple,Qehoj,3,6,Hotel,Germany,8.5,4.0,0,0,1,baf7a395-049f-4ea7-a380-58e9836b5da2,A wonderful hotel where you feel comfortable a...,"The room was beautifully decorated, the bed wa...",,10.0,0
3,9f246c70-def0-4b93-8c8a-296f5014f6fc,-773005129,Family with children,Mejok,1,8,Apartment,United Kingdom,9.3,0.0,0,0,0,a29ebd72-c49e-4a4c-a68e-093d88c87cf3,,What an amazing apartment!!!!! Easy check in. ...,Nothing,10.0,0
4,2958080f-b80d-4bcb-84e9-2e053c314e38,462909752,Family with children,Zuc,6,6,Hotel,Australia,8.7,4.0,0,1,0,da5f55ce-43c3-4ce5-8f7c-3ef02bedcf68,The Palmere Collective has everything you need...,"We were up for a Cowboys game, and the locatio...","The Hotel could benefit from a bigger, better ...",10.0,0


In [7]:
train_df = csv_to_dataframe("train")
df = train_df.groupby(['guest_type', 'guest_country', 'month', 'room_nights']).size()
# print(train_df.groupby("guest_type").agg({"review_score": ["mean", "std"]}))
# # summarize average review_score by accommodation_id and guest_type
# df = train_df.groupby(['accommodation_id', 'guest_type']).agg({'review_score': ['mean', 'std']}).reset_index()
# df.columns = ['accommodation_id', 'guest_type', 'review_score_mean', 'review_score_std']
# # print top 100
# for row, data in df.head(100).iterrows():
#     print(data.guest_type, data.review_score_mean, data.review_score_std)

In [16]:
len(df)
df.mean()

31.201306288187862

In [5]:
train_df.head()

Unnamed: 0,user_id,accommodation_id,guest_type,guest_country,room_nights,month,accommodation_type,accommodation_country,accommodation_score,accommodation_star_rating,location_is_ski,location_is_beach,location_is_city_center,review_id,review_title,review_positive,review_negative,review_score,review_helpful_votes
0,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,ca87eb9f-3b2a-4e3b-9f86-6a8d14722a9e,Amazing hotel &amp; staff,I loved the hotel the rooms were amazing and t...,Nothing,10.0,0
1,776accc3-9f1b-4a2e-8616-aafcac7eeb1d,-1189343073,Solo traveller,Gobuf,2,3,Hotel,Italy,8.0,4.0,0,0,0,b555f9d4-176a-4813-b88d-54e0b676be9c,Spacious and Clean Room,Room was very spacious and clean. Staff were f...,Area of the hotel wasn&#39;t great but it was ...,8.0,0
2,e32f90a1-1580-4a87-86a1-7602d5543a15,-1454980525,Couple,Qehoj,3,6,Hotel,Germany,8.5,4.0,0,0,1,baf7a395-049f-4ea7-a380-58e9836b5da2,A wonderful hotel where you feel comfortable a...,"The room was beautifully decorated, the bed wa...",,10.0,0
3,9f246c70-def0-4b93-8c8a-296f5014f6fc,-773005129,Family with children,Mejok,1,8,Apartment,United Kingdom,9.3,0.0,0,0,0,a29ebd72-c49e-4a4c-a68e-093d88c87cf3,,What an amazing apartment!!!!! Easy check in. ...,Nothing,10.0,0
4,2958080f-b80d-4bcb-84e9-2e053c314e38,462909752,Family with children,Zuc,6,6,Hotel,Australia,8.7,4.0,0,1,0,da5f55ce-43c3-4ce5-8f7c-3ef02bedcf68,The Palmere Collective has everything you need...,"We were up for a Cowboys game, and the locatio...","The Hotel could benefit from a bigger, better ...",10.0,0


In [2]:
# Import modules
# External modules

# Internal modules
from src.data.csv_tools import *

In [3]:
# Get processed train
train_df = csv_to_dataframe("train")
print(train_df.shape)
train_df.head()

(1628989, 19)


Unnamed: 0,user_id,accommodation_id,guest_type,guest_country,room_nights,month,accommodation_type,accommodation_country,accommodation_score,accommodation_star_rating,location_is_ski,location_is_beach,location_is_city_center,review_id,review_title,review_positive,review_negative,review_score,review_helpful_votes
0,8e2ee00e-08bf-4229-aefb-74224a5adfa6,-1109473678,Family with children,Vey,2,2,Hotel,Turkey,8.3,5.0,0,0,0,ca87eb9f-3b2a-4e3b-9f86-6a8d14722a9e,Amazing hotel &amp; staff,I loved the hotel the rooms were amazing and t...,Nothing,10.0,0
1,776accc3-9f1b-4a2e-8616-aafcac7eeb1d,-1189343073,Solo traveller,Gobuf,2,3,Hotel,Italy,8.0,4.0,0,0,0,b555f9d4-176a-4813-b88d-54e0b676be9c,Spacious and Clean Room,Room was very spacious and clean. Staff were f...,Area of the hotel wasn&#39;t great but it was ...,8.0,0
2,e32f90a1-1580-4a87-86a1-7602d5543a15,-1454980525,Couple,Qehoj,3,6,Hotel,Germany,8.5,4.0,0,0,1,baf7a395-049f-4ea7-a380-58e9836b5da2,A wonderful hotel where you feel comfortable a...,"The room was beautifully decorated, the bed wa...",,10.0,0
3,9f246c70-def0-4b93-8c8a-296f5014f6fc,-773005129,Family with children,Mejok,1,8,Apartment,United Kingdom,9.3,0.0,0,0,0,a29ebd72-c49e-4a4c-a68e-093d88c87cf3,,What an amazing apartment!!!!! Easy check in. ...,Nothing,10.0,0
4,2958080f-b80d-4bcb-84e9-2e053c314e38,462909752,Family with children,Zuc,6,6,Hotel,Australia,8.7,4.0,0,1,0,da5f55ce-43c3-4ce5-8f7c-3ef02bedcf68,The Palmere Collective has everything you need...,"We were up for a Cowboys game, and the locatio...","The Hotel could benefit from a bigger, better ...",10.0,0


In [4]:
# Check for missing values
missing_values = train_df.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values

guest_country        2198
review_title       531861
review_positive         8
review_negative    469549
dtype: int64

In [5]:
# Get unique values for each column
unique_values = train_df.nunique()
unique_values

user_id                      1628989
accommodation_id               40000
guest_type                         4
guest_country                    237
room_nights                       72
month                             12
accommodation_type                27
accommodation_country            180
accommodation_score               71
accommodation_star_rating         10
location_is_ski                    2
location_is_beach                  2
location_is_city_center            2
review_id                    1628989
review_title                  844148
review_positive              1582370
review_negative               914773
review_score                      22
review_helpful_votes              50
dtype: int64