# Importing libraries and loading data

In [1]:
# Install Python packages using pip.

# The "!pip" command allows you to run shell commands in Jupyter Notebook or Colab cells.
# It is used here to install Python packages.
# The "-q" flag stands for "quiet," which means it will suppress output during installation.
# "feature_engine," "autoviz," and "dataprep" are the packages being installed.
# The "2>/dev/null" part redirects any error messages (stderr) to the null device, effectively silencing them.
# This is often used when you want to hide installation messages.
!pip install -q feature_engine autoviz dataprep 2>/dev/null

In [2]:
# Import necessary libraries
import numpy as np  # Import NumPy for handling numerical operations
import pandas as pd  # Import Pandas for data manipulation and analysis
import warnings  # Import Warnings to suppress unnecessary warnings

# Suppress warning messages
warnings.filterwarnings("ignore")

# Import AutoViz from the autoviz library for automated visualization of data
from autoviz import AutoViz_Class

# Import load_dataset and create_report from the dataprep library for data loading and EDA
from dataprep.datasets import load_dataset
from dataprep.eda import create_report

# Import SHAP for interpreting model predictions
import shap

# Import matplotlib for data visualization
import matplotlib.pyplot as plt

# Import CatBoostRegressor for building a regression model
from catboost import Pool, CatBoostRegressor

# Import mean_squared_error for evaluating model performance
from sklearn.metrics import mean_squared_error

# Import train_test_split for splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

# Import RareLabelEncoder from feature_engine.encoding for encoding categorical features
from feature_engine.encoding import RareLabelEncoder

# Import CountVectorizer from sklearn.feature_extraction.text for text feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# Import ast and re for working with text and regular expressions
import ast
import re

# Import gc for garbage collection
import gc

# Set Pandas options to display a maximum of 1000 rows
pd.set_option('display.max_rows', 1000)

ModuleNotFoundError: No module named 'numpy'

In [None]:
%%time

# Load the raw data
# taken from https://www.kaggle.com/code/lorentzyeung/starter-notebook-for-uk-property-price-paid-data

colnames=['Transaction_unique_identifier', 'price', 'Date_of_Transfer', 
          'postcode', 'Property_Type', 'Old/New', 
          'Duration', 'PAON', 'SAON', 
          'Street', 'Locality', 'Town/City', 
          'District', 'County', 'PPDCategory_Type',
          'Record_Status - monthly_file_only'
          ] 

df = pd.read_csv('/kaggle/input/price-paid-data-202304/202304.csv',
                  header=None,
                 names=colnames,
                 infer_datetime_format=True,
                 parse_dates=["Date_of_Transfer"],
                 dayfirst=False
                 ).drop(['Transaction_unique_identifier'], axis=1)

# Reads the CSV file into a Pandas DataFrame
item0 = df.shape[0]  # Stores the initial number of rows in the DataFrame
df = df.drop_duplicates()  # Removes duplicate rows from the DataFrame
item1 = df.shape[0]  # Stores the number of rows after removing duplicates
print(f"There are {item0-item1} duplicates found in the dataset")  # Prints the number of duplicates that were removed

df['log10_price'] = df['price'].apply(lambda x: np.log10(x))

df['Year'] = df['Date_of_Transfer'].dt.year
df['Location'] = df['Street'].fillna('None') + ', ' + df['Locality'].fillna('None') + ', ' + df['Town/City'].fillna('None') + ', ' + df['District'].fillna('None') + ', ' + df['County'].fillna('None')
df['Location'] = df['Location'].str.lower()

selected_cols = ['log10_price', 'Year', 'Property_Type', 'Old/New', 'Duration', 'Location', 'PPDCategory_Type']

In [None]:
# Keep only the columns listed in `selected_cols` (the target and the features).
df = df[selected_cols]
print(df.shape)
# Show five random rows, transposed so that each sampled row is displayed as a column.
df.sample(5).T

In [None]:
# Use the smaller random subsample of data (25% of the rows) to accelerate the process.
# A fixed random_state makes the same rows be drawn on every run of this cell.
df = df.sample(frac=0.25, random_state=0)
gc.collect()

In [None]:
# List the column names of the working DataFrame.
df.columns

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Summary of the DataFrame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# The 30 most frequent 'Location' strings together with their row counts.
df['Location'].value_counts().head(30)

# Data visualisation

In [None]:
# An update taken from the nice work https://www.kaggle.com/code/anshtanwar/auto-eda-missing-migrants-interactive-charts 
# made by @anshtanwar

# Settings for the automated EDA run.
filename = ""  # Left empty: the data is passed in directly through `dfte`
target_variable = 'log10_price'  # Column handed to AutoViz as the dependent variable
custom_plot_dir = "custom_plot_directory"  # Directory name passed as `save_plot_dir`

# AutoViz_Class drives the automated exploratory analysis and chart generation.
AV = AutoViz_Class()

# Run AutoViz on `df`:
# - depVar is the target column ('log10_price')
# - charts are requested in HTML format and saved under `custom_plot_dir`
# - at most 10,000 rows and 50 columns are analysed (fewer if `df` is smaller)
dft = AV.AutoViz(
    filename,
    sep=",",
    depVar=target_variable,
    dfte=df,
    header=0,
    verbose=1,
    lowess=False,
    chart_format="html",
    max_rows_analyzed=min(df.shape[0], 10**4),
    max_cols_analyzed=min(df.shape[1], 50),
    save_plot_dir=custom_plot_dir,
)

In [None]:
# Display every HTML chart found in the plot directory for the target variable.

# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# pathlib is used to locate and read the chart files
from pathlib import Path

# Directory that is searched for the '*.html' chart files
plot_dir = Path(f'/kaggle/working/{custom_plot_dir}/{target_variable}/')

# Sort the paths so the charts are shown in the same order on every run
# (the order in which glob yields files is not guaranteed).
html_files = sorted(plot_dir.glob('*.html'))

# Bare file names of the charts. The global `filename` is deliberately not reused here,
# because the AutoViz cell passes that variable as its input-file argument.
file_names = [html_file.name for html_file in html_files]

# Read each chart and render it inline
for html_file in html_files:
    display(HTML(html_file.read_text()))

In [None]:
# Build the dataprep EDA report on a random sample of at most 10,000 rows.
# min(...) keeps the call from raising when the DataFrame has fewer than 10,000 rows,
# and the fixed random_state makes the report cover the same rows on every run.
create_report(df.sample(n=min(len(df), 10**4), random_state=0))

# Data transformation

In [None]:
# List the column names of 'df' before the transformation step,
# as a reminder of which columns are available to work with.

df.columns

In [None]:
# Show five random rows, transposed so that each sampled row is displayed as a column.
df.sample(5).T

In [None]:
# Display information about the DataFrame 'df' before the transformation step:
# data types, non-null counts, and memory usage.
# Useful for getting a quick overview of the dataset's structure.
df.info()

In [None]:
%%time

# Select the main label.
main_label = 'log10_price'

# Set up a rare label encoder for selected columns.
for col in ['Property_Type', 'Old/New', 'Duration', 'Location', 'PPDCategory_Type']:
    df[col] = df[col].fillna('None')
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=150, replace_with='Other', tol=50.0 / df.shape[0])
    df[col] = encoder.fit_transform(df[[col]])
    print(f"LabelEncoded column {col}")

print(df.shape)  # Print the shape of the resulting DataFrame.
df.sample(5).T  # Display a sample of 5 rows, transposed for easier readability.

In [None]:
# Run a garbage-collection pass before the modelling cells.
gc.collect()

# Machine learning

In [None]:
%%time
# Initialize data
# Extract the values of the 'main_label' column and reshape it into a 1D array as 'y'
y = df[main_label].values.reshape(-1,)

# Create the feature matrix 'X' by dropping the 'main_label' column from the DataFrame 'df'
X = df.drop([main_label], axis=1)

# Identify categorical columns in the DataFrame 'df'
# These columns contain non-numeric data
cat_cols = df.select_dtypes(include=['object']).columns

del df
gc.collect()

In [None]:
%%time

# Create a list of indices for categorical columns in the feature matrix 'X'
cat_cols_idx = [list(X.columns).index(c) for c in cat_cols]

# Split the data into training and testing sets
# - 'X_train' and 'y_train' will contain the training features and labels, respectively
# - 'X_test' and 'y_test' will contain the testing features and labels, respectively
# The split is done with a 20% test size, a random seed of 0, and stratification based on the selected column(s)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0, stratify=X[['Location']])

del X, y
gc.collect()

# Print the dimensions of the training and testing sets
# This provides insight into the sizes of the datasets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
%%time

# Initialize the training and testing data pools using CatBoost's Pool class
train_pool = Pool(X_train, 
                  y_train, 
                  cat_features=cat_cols_idx)  # Create a training data pool with categorical features
test_pool = Pool(X_test,
                 y_test,
                 cat_features=cat_cols_idx)  # Create a testing data pool with categorical features

# Specify the training parameters for the CatBoostRegressor model
model = CatBoostRegressor(iterations=100,    # Number of boosting iterations
                          depth=5,           # Maximum depth of trees in the ensemble
                          verbose=1,         # Set verbosity level to 0 (no output during training)
                          learning_rate=0.2,  # Learning rate for gradient boosting
                          loss_function='RMSE')  # Loss function to optimize (Root Mean Squared Error)

# Train the CatBoostRegressor model on the training data
model.fit(train_pool)

# Make predictions using the trained model on both the training and testing data
y_train_pred = model.predict(train_pool)  # Predictions on the training data
y_test_pred = model.predict(test_pool)    # Predictions on the testing data

# Calculate and print the Root Mean Squared Error (RMSE) scores for training and testing data
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)  # RMSE for training data
rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)     # RMSE for testing data

# Print the RMSE scores rounded to three decimal places
print(f"RMSE score for train {round(rmse_train, 3)} dex, and for test {round(rmse_test, 3)} dex")

In [None]:
# Calculate the baseline RMSE (Root Mean Squared Error) scores for the training and test datasets.
# The baseline "model" predicts the mean of the training targets for every sample.
baseline_prediction = np.mean(y_train)

# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
# np.full builds the constant prediction vector without creating a long Python list.

# For the training dataset: compare y_train with the constant training mean.
rmse_bs_train = np.sqrt(mean_squared_error(y_train, np.full(len(y_train), baseline_prediction)))

# For the test dataset: compare y_test with the same training mean
# (the test targets are not used to build the baseline).
rmse_bs_test = np.sqrt(mean_squared_error(y_test, np.full(len(y_test), baseline_prediction)))

# Print the baseline RMSE scores for both the training and test datasets, rounded to 3 decimal places.
print(f"RMSE baseline score for train {round(rmse_bs_train, 3)} dex, and for test {round(rmse_bs_test, 3)} dex")

# Explanations with SHAP values

In [None]:
%matplotlib inline

In [None]:
%%time
# Initialize the SHAP library for visualization
shap.initjs()

# Create a TreeExplainer object for 'model' (the CatBoostRegressor trained earlier in this notebook)
ex = shap.TreeExplainer(model)

# Calculate SHAP values for the 'X_test' data using the TreeExplainer
shap_values = ex.shap_values(X_test)

# Generate a summary plot to visualize the impact of features on model predictions
shap.summary_plot(shap_values, X_test)

In [None]:
# Take the explainer's expected value. The model was trained on log10_price,
# so this value is on the log10 scale.
expected_values = ex.expected_value

# Convert back to GBP with 10**x and print it.
# NOTE: 10 raised to an average of log10 prices is a geometric mean, not an arithmetic mean.
print(f"Average predicted price is {round(10**expected_values):,} GBP")

# Same conversion for the mean of the actual log10 prices in 'y_test'.
actual_rating_avg = round(10**np.mean(y_test))

# Print the average actual label.
print(f"Average actual price is {actual_rating_avg:,} GBP")

In [None]:
def show_shap(col, shap_values=shap_values, label=main_label, X_test=X_test, ylabel='dex'):
    """Plot and print the SHAP values of one feature, aggregated per feature value.

    For every distinct value of `col` the mean, standard deviation and number of SHAP
    values are computed. The means are plotted with the standard deviations as error
    bars, and the summary table is printed.

    Parameters:
      - col: Name of the feature (a column of `X_test`) to summarise.
      - shap_values: 2D array of SHAP values; its columns are indexed by the position
        of the feature in `X_test.columns`.
      - label: Text shown as the label name in the plot title.
      - X_test: DataFrame with the test features.
      - ylabel: Label for the y-axis of the plot (default is 'dex').

    The defaults for `shap_values`, `label` and `X_test` are captured when this cell is
    executed; re-run the cell (or pass the arguments explicitly) after recomputing them.

    Returns None.
    """
    # Only the feature itself and its SHAP values are needed, so build a small
    # two-column frame instead of copying the whole test set on every call.
    col_idx = X_test.columns.tolist().index(col)
    df_infl = pd.DataFrame({col: X_test[col].to_numpy(), 'shap_': shap_values[:, col_idx]})

    # A single groupby pass yields mean, standard deviation and count together.
    df_res = df_infl.groupby(col)['shap_'].agg(gain='mean', gain_std='std', count='count')
    df_res[['gain', 'gain_std']] = df_res[['gain', 'gain_std']].round(4)

    # Largest mean SHAP value first; the index is named 'col' in the printed table.
    df_res = df_res.rename_axis('col').sort_values('gain', ascending=False)

    # Mean SHAP value per feature value, with the standard deviation as error bar.
    plt.figure(figsize=(24, 16))
    plt.errorbar(df_res.index, df_res['gain'], yerr=df_res['gain_std'], fmt="o", color="r")

    # Set plot title and axis labels.
    plt.title(f'SHAP values for column {col}, label {label}')
    plt.ylabel(ylabel)
    plt.tick_params(axis="x", rotation=90)

    # Display the plot and the table with the results.
    plt.show()
    print(df_res)

    return

# Produce the SHAP breakdown for every feature of the test data.
for col in X_test.columns:
    # Column name framed by one blank line above and below, as a visual separator.
    print(f"\n{col}\n")

    # Plot and print the SHAP summary for the current column.
    show_shap(col, shap_values, label=main_label, X_test=X_test)