In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import PowerTransformer

from spektral.datasets import qm9

In [None]:
# Only the fourth value returned by the loader is kept; the other three are discarded.
_, _, _, y = qm9.load_data(amount=None, return_type='numpy')

# Keep every column of the table except the first one.
y = y.loc[:, y.columns[1:]]

In [None]:
# Show the target table as this cell's output for a quick visual inspection.
y

In [None]:
def percentiles_and_hist(data, prop, bins=20, outlier_scale=5):
    """Print a percentile table and an IQR-based outlier count, then plot a histogram.

    Parameters
    ----------
    data : array-like
        Numeric values to summarise. Converted with ``np.asarray``; all
        elements are pooled when computing percentiles and counting outliers.
    prop : str
        Label printed in the header and used as the plot title.
    bins : int, default 20
        Number of equal percentile steps to print (``bins + 1`` rows) and
        the number of histogram bins.
    outlier_scale : float, default 5
        A value counts as an outlier when it lies more than
        ``outlier_scale * IQR`` below the first quartile or above the
        third quartile.

    Returns
    -------
    None
        Results are printed and plotted; nothing is returned.

    Raises
    ------
    ValueError
        If ``data`` contains no elements.
    """
    data = np.asarray(data)
    if data.size == 0:
        raise ValueError("percentiles_and_hist requires non-empty data")

    print(f"Analyzing {prop}")
    print()

    # Evenly spaced percentile levels from 0% to 100% inclusive.
    levels = [100/bins*i for i in range(bins+1)]
    for level, value in zip(levels, np.percentile(data, levels)):
        print(f"{level}%\t: {value:.2f}")

    first_quartile, third_quartile = np.percentile(data, [25, 75])
    iqr = third_quartile - first_quartile
    print(f"iqr: {iqr}")

    # Honour the caller-supplied scale instead of a hard-coded constant.
    min_threshold = first_quartile - outlier_scale*iqr
    max_threshold = third_quartile + outlier_scale*iqr

    # Vectorised comparison; works for 1-D data and for (n, 1) column arrays.
    outlier_mask = (data < min_threshold) | (data > max_threshold)
    print(f"Scale: {outlier_scale}")
    print(f"Num outliers: {np.count_nonzero(outlier_mask)}")

    # Clip the plotted range to the 0.1-99.9 percentile band so that extreme
    # values do not squash the bulk of the distribution into a single bar.
    plt.hist(data, bins=bins, range=(np.percentile(data, 0.1), np.percentile(data, 99.9)))
    plt.title(f"{prop}")
    plt.show()

# Summarise every property by magnitude, first on the raw scale and then
# on a log(1 + x) scale.
for prop in y:
    # 'lumo' takes both positive and negative values, so the
    # absolute-value treatment below is not applied to it.
    if prop != 'lumo':
        data = np.abs(y[[prop]].values)
        percentiles_and_hist(data, prop)

        log_data = np.log(1+data)
        percentiles_and_hist(log_data, f"log {prop}")

In [None]:
# Side-by-side column skewness: raw values versus log(1 + |value|).
skew_df = pd.concat(
    {
        'y_skew': y.skew(axis=0),
        'log_y_skew': y.abs().add(1).pipe(np.log).skew(axis=0),
    },
    axis=1,
)
skew_df

In [None]:
# Fit a separate PowerTransformer to each property column, report the fitted
# lambda, summarise the transformed values, and invert the transform again.
for prop in y:
    pt = PowerTransformer()
    # Double brackets keep the selection two-dimensional (one column).
    data = y[[prop]].values
    # The transformed array is used exactly as returned. No reshape is applied:
    # ndarray.reshape returns a new array rather than modifying in place, so a
    # bare reshape call whose result is not assigned would do nothing.
    y_trans = pt.fit_transform(data)

    # One lambda is fitted per input column; a single column was passed in.
    print(f"lambda: {pt.lambdas_[0]}")

    percentiles_and_hist(y_trans, f"Power Transformed {prop}")

    # to recover the original data
    y_orig = pt.inverse_transform(y_trans)
    print(y_orig)