### Setup ###

In [None]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from pathlib import Path

# Matplotlib defaults: automatic figure layout plus bold, enlarged axis
# labels and titles for every plot drawn afterwards.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    **{
        "labelweight": "bold",
        "labelsize": "large",
        "titleweight": "bold",
        "titlesize": 14,
        "titlepad": 10,
    },
)


def make_mi_scores(X: pd.DataFrame, y: pd.Series) -> pd.Series:
    """Return mutual-information scores of every column of X against y.

    Object and category columns are replaced by the integer codes produced
    by ``factorize``; every integer-typed column is then flagged as discrete
    for ``mutual_info_regression``. The work is done on a copy, so the
    caller's frame is left untouched.

    Returns a Series named "MI Scores", indexed by column name and sorted
    from highest to lowest score.
    """
    encoded = X.copy()
    categorical_cols = encoded.select_dtypes(["object", "category"]).columns
    for col in categorical_cols:
        # factorize() returns (codes, uniques); only the codes are kept
        encoded[col] = encoded[col].factorize()[0]
    # One boolean per column: True where the dtype is an integer type
    is_discrete = list(map(pd.api.types.is_integer_dtype, encoded.dtypes))
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    return pd.Series(
        raw_scores, name="MI Scores", index=encoded.columns
    ).sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the given scores with pyplot.

    Scores are sorted ascending, so the smallest score sits at position 0,
    and each bar is labelled with the corresponding index entry of `scores`.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


### Load data ###

In [None]:
# Directory holding the raw competition files
data_dir = Path("../input/house-prices-advanced-regression-techniques/")

# Research feature table and target table, both indexed by "Id"
X = pd.read_csv("data/X_research.csv", index_col="Id")
y = pd.read_csv("data/y_research.csv", index_col="Id")
# Raw training table, also indexed by "Id"
df_train = pd.read_csv(data_dir / "train.csv", index_col="Id")

# Column names grouped as numerical data
# NOTE(review): presumably these are columns of X -- confirm against the CSV
numerical_data: list[str] = [
    "BedroomAbvGr", "BsmtFinSF1", "BsmtFinSF2", "BsmtFullBath",
    "BsmtHalfBath", "BsmtUnfSF", "EnclosedPorch", "Fireplaces",
    "FirstFlrSF", "FullBath", "GarageArea", "GarageCars",
    "GarageYrBlt", "GrLivArea", "HalfBath", "KitchenAbvGr",
    "LotArea", "LotFrontage", "LowQualFinSF", "MasVnrArea",
    "MiscVal", "MoSold", "OpenPorchSF", "PoolArea",
    "ScreenPorch", "SecondFlrSF", "Threeseasonporch", "TotRmsAbvGrd",
    "TotalBsmtSF", "WoodDeckSF", "YearBuilt", "YearRemodAdd",
    "YrSold",
]


## Feature research ##

### OverallQual ###

In [None]:
# Summary statistics of the OverallQual column
X["OverallQual"].describe()

In [None]:
# Temporarily attach the target to X so seaborn can plot both from one frame
X["SalePrice"] = y["SalePrice"]
sns.scatterplot(data=X, x="OverallQual", y="SalePrice")

In [None]:
# Kernel density estimate of OverallQual
sns.displot(data=X, x="OverallQual", kind="kde")

In [None]:
# Remove the target column from X in place before scoring. Unlike pop(),
# drop(..., errors="ignore") does not raise KeyError when the column is
# already gone, so this cell can be re-run safely.
X.drop(columns="SalePrice", errors="ignore", inplace=True)
mi = make_mi_scores(X, y['SalePrice'])
plot_mi_scores(mi)
# Features whose mutual-information score is exactly zero
mi[mi == 0]