In [2]:
# 1. Loading the Dataset
import pandas as pd

# Load the hotels dataset
hotels = pd.read_csv('hotels.csv')

# 2. Data Preprocessing
from sklearn.preprocessing import LabelEncoder

# Encode categorical features
hotel_encoder = LabelEncoder()
hotels['hotel_id'] = hotel_encoder.fit_transform(hotels['name'])

user_encoder = LabelEncoder()
hotels['user_id'] = user_encoder.fit_transform(hotels['userCode'])

# 3. Feature Engineering
data = hotels[['user_id', 'hotel_id', 'total']]

# 4. Model Training
# Using matrix factorization for collaborative filtering
from sklearn.decomposition import TruncatedSVD

# Create a matrix of users x hotels, filling missing values with zeros
matrix = data.pivot_table(index='user_id', columns='hotel_id', values='total').fillna(0)

# Determine the number of components
num_components = min(matrix.shape) - 1  # One less than the smaller dimension of the matrix

# Apply Truncated SVD for dimensionality reduction
svd = TruncatedSVD(n_components=num_components, random_state=42)
latent_matrix = svd.fit_transform(matrix)

# 5. Model Evaluation
reconstructed_matrix = svd.inverse_transform(latent_matrix)
reconstruction_error = ((matrix - reconstructed_matrix) ** 2).mean()

print(f'Reconstruction Error: {reconstruction_error}')

# 6. Saving the Trained Model
import joblib

joblib.dump(svd, 'hotel_recommendation_model.pkl')
joblib.dump(hotel_encoder, 'hotel_encoder.pkl')
joblib.dump(user_encoder, 'user_encoder.pkl')


Reconstruction Error: hotel_id
0       2.935113
1      23.993150
2       0.414198
3      13.824629
4       5.400451
5    4870.784689
6      19.158635
7       4.164845
8       1.743386
dtype: float64


['user_encoder.pkl']