In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Load the datasets
ama_fact_data = pd.read_csv('ama_fact.csv')
product_dimension_data = pd.read_csv('product_dimension.csv')
category_dimension_data = pd.read_csv('category_dimension.csv')

# Merge the datasets based on their relationships
# Merge product details into the fact table
merged_data = pd.merge(ama_fact_data, product_dimension_data, on="Asin", how='left')
# Merge category details into the combined data
final_merged_data = pd.merge(merged_data, category_dimension_data, on="Category_Id", how='left')

# One-Hot Encoding for categorical data
encoder = OneHotEncoder(sparse=False)
categorical_features = final_merged_data[["Category_name"]]  # Assuming 'category_name' needs to be encoded
categorical_encoded = encoder.fit_transform(categorical_features)
categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out())

# Concatenate encoded categorical data with the original DataFrame
final_merged_data = pd.concat([final_merged_data.drop("Category_name", axis=1), categorical_encoded_df], axis=1)

# Assume 'boughtinlastmonth' is the target variable and we drop non-numeric or irrelevant columns
X = final_merged_data.drop(["BoughtInLastMonth", "Asin", "Product_URL", "Title"], axis=1)
y = final_merged_data["BoughtInLastMonth"]

# Handling possible NaN values
X.fillna(X.mean(), inplace=True)

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the Decision Tree model
dt_regressor = DecisionTreeRegressor(max_depth=5, random_state=42)

# Training the model
dt_regressor.fit(X_train, y_train)

# Making predictions
predictions = dt_regressor.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print(f"Decision Tree RMSE: {rmse}")




Decision Tree RMSE: 653.1602929325763
