In [None]:
# Required for debug only. If you have data_fast_insights installed, you can delete this cell.
# Adds the parent and grandparent of the current working directory to sys.path.
import sys
from pathlib import Path
import os

working_dir = Path(os.getcwd())
# Grandparent is added before parent, preserving the original search order.
for candidate_dir in (working_dir.parent.parent, working_dir.parent):
    candidate = str(candidate_dir)
    # Skip entries that are already present so re-running this cell
    # does not keep growing sys.path with duplicates.
    if candidate not in sys.path:
        sys.path.append(candidate)

In [None]:
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt

from data_fast_insights import BinaryDependenceModelData
import data_fast_insights.calculations as calc
from data_fast_insights.plotting import plot_segments_basic_info

In [None]:
# Set the inline matplotlib backend's figure format to SVG.
%config InlineBackend.figure_format = 'svg'

Getting data

In [None]:
# Fetch the California housing dataset and print the description shipped with it.
raw_data = datasets.fetch_california_housing()
dataset_description = raw_data['DESCR']
print(dataset_description)

In [None]:
# Assemble the feature matrix into a labelled frame, then attach the target
# under the column name used as y_name later in the notebook.
feature_values = raw_data['data']
feature_names = raw_data['feature_names']
df = pd.DataFrame(data=feature_values, columns=feature_names)
df['MedianHouseValue'] = raw_data['target']
df.head()

## Using Data Fast Insights

Initializing model data

In [None]:
# Columns handed to the model as numeric predictors; no categorical columns
# are passed (cat_cols=None).
numeric_features = {
    'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
    'Population', 'AveOccup', 'Latitude', 'Longitude',
}
# NOTE(review): y_quantile=0.5 presumably splits the target at its median —
# confirm against the data_fast_insights documentation.
dmd = BinaryDependenceModelData(
    base_data=df,
    cat_cols=None,
    num_cols=numeric_features,
    y_name='MedianHouseValue',
    y_quantile=0.5,
)

Getting bins for numeric variables, optimizing for Information Value. 

In [None]:
# Compute bins from dmd; the result is passed to dmd.convert_to_binary in the next cell.
# Per the accompanying text these are bins for the numeric variables, optimized for Information Value.
num_bins = calc.make_bins(model_data=dmd)

Converting variables

In [None]:
# Apply the computed bins to dmd. The return value is not captured, so this
# presumably updates dmd in place — TODO confirm against the library.
dmd.convert_to_binary(bins=num_bins)

Calculating group importance and other metrics

In [None]:
# Run the dependence calculation on dmd; `res` is filtered by column and
# passed to the plotting helper in the cells that follow.
res = calc.calculate_dependence(model_data=dmd)

Getting data about segments (dataframe is sorted by importance)

In [None]:
# Show only the segments whose 'perc_of_total' is above 5
# (presumably a percentage of all rows — confirm against the library).
exceeds_threshold = res['perc_of_total'] > 5
res.loc[exceeds_threshold]

#### Plotting basic info about feature segments

House value drops as the average occupancy increases.

In [None]:
# Plot segment info for the 'AveOccup' feature; base_feature_rename is
# presumably the display label used on the figure — confirm against the library.
f = plot_segments_basic_info(
    model_data=dmd,
    res_low_df=res,
    base_feature_name='AveOccup',
    base_feature_rename='Average Occupancy',
)

There is a significant drop in house value for blocks located at longitudes
from -121 up to (but not including) -119, which requires further research.

In [None]:
# Plot segment info for the 'Longitude' feature.
f = plot_segments_basic_info(
    model_data=dmd,
    res_low_df=res,
    base_feature_name='Longitude',
)
# Resize whichever figure pyplot currently treats as active.
active_figure = plt.gcf()
active_figure.set_size_inches(9.0, 5.5)

Blocks whose residents have the highest incomes contain the most highly valued houses.

In [None]:
# Plot segment info for the 'MedInc' feature; base_feature_rename is
# presumably the display label used on the figure — confirm against the library.
f = plot_segments_basic_info(
    model_data=dmd,
    res_low_df=res,
    base_feature_name='MedInc',
    base_feature_rename='MedianIncome',
)