# Feature engineering

In [None]:
import logging
import warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math
import requests
from collections import Counter
from adjustText import adjust_text
from tqdm import tqdm
from bisect import bisect_left
from scipy.cluster.hierarchy import ward, leaves_list
from scipy.spatial.distance import squareform, pdist

import datetime 
import os
import re
import sklearn
import colorsys  # Not used; too limited...
from PIL import Image, ImageEnhance
from umap import UMAP  # pip install umap-learn
import ot              # pip install POT

import textstat
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk import word_tokenize, pos_tag
import gensim.downloader
import matplotlib.patheffects as path_effects

from sklearn.pipeline import Pipeline
import sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, FunctionTransformer, StandardScaler, RobustScaler, Normalizer, PowerTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, TargetEncoder, LabelEncoder
from sklearn.preprocessing import SplineTransformer, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA

import statsmodels
import statsmodels.nonparametric.smoothers_lowess
from palmerpenguins import load_penguins
from pydataset import data  # Many datasets from R

In [None]:
# General-purpose qualitative palette: 18 hex colours.
colours_db = [
    "#193296", "#0092D0", "#FFA000", "#D70032", "#2D962D", "#0055AA",
    "#B4D2F0", "#8296AA", "#0018A8", "#8C9FEC", "#82C3FF", "#FFD999",
    "#FF8DA7", "#9EE29E", "#3399FF", "#113557", "#E6EAEE", "#B6C0C7"
]

# Five colours going from red to blue, followed by black
# (presumably one colour per quintile, black for "other/missing" -- TODO confirm).
colours_quintile = ["#D7191C", "#FDAE61", "#CCCC66", "#ABD9E9", "#2C7BB6", "black"]

# Polychrome: Creating and Assessing Qualitative Palettes with Many Colors
# K.R. Coombes et al. (Journal of Statistical Software, 2019)
# https://www.jstatsoft.org/article/view/v090c01

# Kelly palette: 22 colours.
colours_kelly = [
    "#f2f3f4", "#222222", "#f3c300", "#875692", "#f38400", "#a1caf1",
    "#be0032", "#c2b280", "#848482", "#008856", "#e68fac", "#0067a5",
    "#f99379", "#604e97", "#f6a600", "#b3446c", "#dcd300", "#882d17",
    "#8db600", "#654522", "#e25822", "#2b3d26"
]
# Kelly palette, in a different order (22 colours, upper-case hex codes).
colours_kelly_reordered = [
    "#F2F3F4", "#BE0032", "#F3C300", "#0067A5", "#2B3D26", "#8DB600",
    "#E68FAC", "#F38400", "#875692", "#882D17", "#008856", "#C2B280",
    "#A1CAF1", "#F99379", "#848482", "#654522", "#E25822", "#B3446C",
    "#DCD300", "#F6A600", "#604E97", "#222222"
]
# Glasbey palette: 32 colours; the first one is white.
colours_glasbey = [
    "#FFFFFF", "#0000FF", "#FF0000", "#00FF00", "#000033", "#FF00B6",
    "#005300", "#FFD300", "#009FFF", "#9A4D42", "#00FFBE", "#783FC1",
    "#1F9698", "#FFACFD", "#B1CC71", "#F1085C", "#FE8F42", "#DD00FF",
    "#201A01", "#720055", "#766C95", "#02AD24", "#C8FF00", "#886C00",
    "#FFB79F", "#858567", "#A10300", "#14F9FF", "#00479E", "#DC5E93",
    "#93D4FF", "#004CFF"
]
# Glasbey palette, in a different order (32 colours).
colours_glasbey_reordered = [
    "#FFFFFF", "#FF0000", "#0000FF", "#00FF00", "#FF00B6", "#FFD300",
    "#000033", "#9A4D42", "#DD00FF", "#00FFBE", "#FFACFD", "#1F9698",
    "#B1CC71", "#00479E", "#FE8F42", "#005300", "#C8FF00", "#A10300",
    "#720055", "#02AD24", "#766C95", "#886C00", "#009FFF", "#DC5E93",
    "#14F9FF", "#93D4FF", "#F1085C", "#FFB79F", "#858567", "#783FC1",
    "#201A01", "#004CFF"
]
# Green-Armytage palette: 26 colours.
# NOTE: this is the only palette whose name uses the American spelling ("colors_");
# the name is kept because existing code refers to it.
colors_green_armytage = [
    "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31",
    "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00",
    "#C20088", "#003380", "#19A405", "#FFA8BB", "#426600", "#FF0010",
    "#5EF1F2", "#00998F", "#E0FF66", "#100AFF", "#990000", "#FFFF80",
    "#FFE100", "#FF5000"
]
# Alias with the "colours_" spelling used by every other palette in this file.
colours_green_armytage = colors_green_armytage
# Green-Armytage palette, in a different order (26 colours).
colours_green_armytage_reordered = [
    "#F0A3FF", "#FF0010", "#2BCE48", "#FFCC99", "#191919", "#100AFF",
    "#5EF1F2", "#990000", "#C20088", "#003380", "#426600", "#E0FF66",
    "#FFA8BB", "#0075DC", "#808080", "#FFE100", "#8F7C00", "#94FFB5",
    "#4C005C", "#00998F", "#FF5000", "#993F00", "#005C31", "#9DCC00",
    "#FFFF80", "#19A405"
]
# Polychrome palette: 36 colours.
colours_polychrome_36 = [
    "#5A5156", "#E4E1E3", "#F6222E", "#FE00FA", "#16FF32", "#3283FE",
    "#FEAF16", "#B00068", "#1CFFCE", "#90AD1C", "#2ED9FF", "#DEA0FD",
    "#AA0DFE", "#F8A19F", "#325A9B", "#C4451C", "#1C8356", "#85660D",
    "#B10DA1", "#FBE426", "#1CBE4F", "#FA0087", "#FC1CBF", "#F7E1A0",
    "#C075A6", "#782AB6", "#AAF400", "#BDCDFF", "#822E1C", "#B5EFB5",
    "#7ED7D1", "#1C7F93", "#D85FF7", "#683B79", "#66B0FF", "#3B00FB"
]
# "Alphabet" palette: 26 colours.
colours_alphabet = [
    "#AA0DFE", "#3283FE", "#85660D", "#782AB6", "#565656", "#1C8356",
    "#16FF32", "#F7E1A0", "#E2E2E2", "#1CBE4F", "#C4451C", "#DEA0FD",
    "#FE00FA", "#325A9B", "#FEAF16", "#F8A19F", "#90AD1C", "#F6222E",
    "#1CFFCE", "#2ED9FF", "#B10DA1", "#C075A6", "#FC1CBF", "#B00068",
    "#FBE426", "#FA0087"
]
# "Dark24" palette: 24 colours.
colours_dark24 = [
    "#3292E6", "#E05D99", "#16A300", "#FE0D26", "#DE00FF", "#262E26",
    "#BB8700", "#760D88", "#5C16FF", "#E96638", "#00A19F", "#FE00D4",
    "#712A22", "#FC0081", "#87965A", "#998AAE", "#AD73F3", "#0D0DB3",
    "#16225F", "#DC5DD7", "#944577", "#00925D", "#AD8580", "#9F0D35"
]
# "Light24" palette: 24 colours.
colours_light24 = [  # For use on a dark background
    "#FD3222", "#0DFE00", "#557AFE", "#FED8C8", "#FC00D3", "#0DF9FF",
    "#F6F70D", "#FE990D", "#3D9B4B", "#AF70BD", "#DD557D", "#CE38FF",
    "#698E96", "#38AAE6", "#9B8E1C", "#00FDC5", "#AAF866", "#CAFBD7",
    "#C27555", "#FE0D84", "#FE7FD2", "#F1D8FF", "#FBC72E", "#F36247"
]
# "Sky" palette: 24 colours.
colours_sky = [   # For use on a dark background; close to the standard karyotyping palette
    "#FDFD16", "#D04D60", "#B1BEFA", "#1CFDF1", "#F98500", "#A955D0",
    "#9B6C6A", "#FBC87A", "#EDFFED", "#9FE600", "#22C7E9", "#FC0DF8",
    "#F60000", "#FE76C8", "#3D8484", "#C00DFE", "#00AAFD", "#FF0085",
    "#6DFCB2", "#4969FF", "#E5FFA3", "#FFC3EF", "#4F8A35", "#00FE5D"
]


# https://yeun.github.io/open-color/
# https://github.com/rougier/scientific-visualization-book/blob/master/code/colors/open-colors.py
# "Open Color" ramps: one list per hue, 10 shades each, ordered from lightest to darkest.
colours_open_gray   = [ "#f8f9fa", "#f1f3f5", "#e9ecef", "#dee2e6", "#ced4da", "#adb5bd", "#868e96", "#495057", "#343a40", "#212529", ]
colours_open_red    = [ "#fff5f5", "#ffe3e3", "#ffc9c9", "#ffa8a8", "#ff8787", "#ff6b6b", "#fa5252", "#f03e3e", "#e03131", "#c92a2a", ]
colours_open_pink   = [ "#fff0f6", "#ffdeeb", "#fcc2d7", "#faa2c1", "#f783ac", "#f06595", "#e64980", "#d6336c", "#c2255c", "#a61e4d", ]
colours_open_grape  = [ "#f8f0fc", "#f3d9fa", "#eebefa", "#e599f7", "#da77f2", "#cc5de8", "#be4bdb", "#ae3ec9", "#9c36b5", "#862e9c", ]
colours_open_violet = [ "#f3f0ff", "#e5dbff", "#d0bfff", "#b197fc", "#9775fa", "#845ef7", "#7950f2", "#7048e8", "#6741d9", "#5f3dc4", ]
colours_open_indigo = [ "#edf2ff", "#dbe4ff", "#bac8ff", "#91a7ff", "#748ffc", "#5c7cfa", "#4c6ef5", "#4263eb", "#3b5bdb", "#364fc7", ]
colours_open_blue   = [ "#e7f5ff", "#d0ebff", "#a5d8ff", "#74c0fc", "#4dabf7", "#339af0", "#228be6", "#1c7ed6", "#1971c2", "#1864ab", ]
colours_open_cyan   = [ "#e3fafc", "#c5f6fa", "#99e9f2", "#66d9e8", "#3bc9db", "#22b8cf", "#15aabf", "#1098ad", "#0c8599", "#0b7285", ]
colours_open_teal   = [ "#e6fcf5", "#c3fae8", "#96f2d7", "#63e6be", "#38d9a9", "#20c997", "#12b886", "#0ca678", "#099268", "#087f5b", ]
colours_open_green  = [ "#ebfbee", "#d3f9d8", "#b2f2bb", "#8ce99a", "#69db7c", "#51cf66", "#40c057", "#37b24d", "#2f9e44", "#2b8a3e", ]
colours_open_lime   = [ "#f4fce3", "#e9fac8", "#d8f5a2", "#c0eb75", "#a9e34b", "#94d82d", "#82c91e", "#74b816", "#66a80f", "#5c940d", ]
colours_open_yellow = [ "#fff9db", "#fff3bf", "#ffec99", "#ffe066", "#ffd43b", "#fcc419", "#fab005", "#f59f00", "#f08c00", "#e67700", ]
colours_open_orange = [ "#fff4e6", "#ffe8cc", "#ffd8a8", "#ffc078", "#ffa94d", "#ff922b", "#fd7e14", "#f76707", "#e8590c", "#d9480f", ]

# https://github.com/rougier/scientific-visualization-book/blob/master/code/colors/material-colors.py
# "Material" ramps: one list per hue, 10 shades each, ordered from lightest to darkest.
# In the names, "d_" and "l_" presumably stand for "deep" and "light" -- TODO confirm.
colours_material_red       = [ "#ffebee", "#ffcdd2", "#ef9a9a", "#e57373", "#ef5350", "#f44336", "#e53935", "#d32f2f", "#c62828", "#b71c1c", ]
colours_material_pink      = [ "#fce4ec", "#f8bbd0", "#f48fb1", "#f06292", "#ec407a", "#e91e63", "#d81b60", "#c2185b", "#ad1457", "#880e4f", ]
colours_material_purple    = [ "#f3e5f5", "#e1bee7", "#ce93d8", "#ba68c8", "#ab47bc", "#9c27b0", "#8e24aa", "#7b1fa2", "#6a1b9a", "#4a148c", ]
colours_material_d_purple  = [ "#ede7f6", "#d1c4e9", "#b39ddb", "#9575cd", "#7e57c2", "#673ab7", "#5e35b1", "#512da8", "#4527a0", "#311b92", ]
colours_material_indigo    = [ "#e8eaf6", "#c5cae9", "#9fa8da", "#7986cb", "#5c6bc0", "#3f51b5", "#3949ab", "#303f9f", "#283593", "#1a237e", ]
colours_material_blue      = [ "#e3f2fd", "#bbdefb", "#90caf9", "#64b5f6", "#42a5f5", "#2196f3", "#1e88e5", "#1976d2", "#1565c0", "#0d47a1", ]
colours_material_l_blue    = [ "#e1f5fe", "#b3e5fc", "#81d4fa", "#4fc3f7", "#29b6f6", "#03a9f4", "#039be5", "#0288d1", "#0277bd", "#01579b", ]
colours_material_cyan      = [ "#e0f7fa", "#b2ebf2", "#80deea", "#4dd0e1", "#26c6da", "#00bcd4", "#00acc1", "#0097a7", "#00838f", "#006064", ]
colours_material_teal      = [ "#e0f2f1", "#b2dfdb", "#80cbc4", "#4db6ac", "#26a69a", "#009688", "#00897b", "#00796b", "#00695c", "#004d40", ]
colours_material_green     = [ "#e8f5e9", "#c8e6c9", "#a5d6a7", "#81c784", "#66bb6a", "#4caf50", "#43a047", "#388e3c", "#2e7d32", "#1b5e20", ]
colours_material_l_green   = [ "#f1f8e9", "#dcedc8", "#c5e1a5", "#aed581", "#9ccc65", "#8bc34a", "#7cb342", "#689f38", "#558b2f", "#33691e", ]
colours_material_lime      = [ "#f9fbe7", "#f0f4c3", "#e6ee9c", "#dce775", "#d4e157", "#cddc39", "#c0ca33", "#afb42b", "#9e9d24", "#827717", ]
colours_material_yellow    = [ "#fffde7", "#fff9c4", "#fff59d", "#fff176", "#ffee58", "#ffeb3b", "#fdd835", "#fbc02d", "#f9a825", "#f57f17", ]
colours_material_amber     = [ "#fff8e1", "#ffecb3", "#ffe082", "#ffd54f", "#ffca28", "#ffc107", "#ffb300", "#ffa000", "#ff8f00", "#ff6f00", ]
colours_material_orange    = [ "#fff3e0", "#ffe0b2", "#ffcc80", "#ffb74d", "#ffa726", "#ff9800", "#fb8c00", "#f57c00", "#ef6c00", "#e65100", ]
colours_material_d_orange  = [ "#fbe9e7", "#ffccbc", "#ffab91", "#ff8a65", "#ff7043", "#ff5722", "#f4511e", "#e64a19", "#d84315", "#bf360c", ]
colours_material_brown     = [ "#efebe9", "#d7ccc8", "#bcaaa4", "#a1887f", "#8d6e63", "#795548", "#6d4c41", "#5d4037", "#4e342e", "#3e2723", ]
colours_material_grey      = [ "#fafafa", "#f5f5f5", "#eeeeee", "#e0e0e0", "#bdbdbd", "#9e9e9e", "#757575", "#616161", "#424242", "#212121", ]
colours_material_blue_grey = [ "#eceff1", "#cfd8dc", "#b0bec5", "#90a4ae", "#78909c", "#607d8b", "#546e7a", "#455a64", "#37474f", "#263238", ]


def _show_palettes(all_colours, figsize, ax = None):
    """
    Draw one row of numbered colour swatches per palette.

    Inputs: all_colours: dict, palette name -> sequence of colours
            figsize: figure size, only used when a new figure is created
            ax: where to plot the palettes; use None to create (and show) a new figure
    Output: None
    """
    ax_was_None = ax is None
    if ax_was_None:
        fig, ax = plt.subplots( figsize = figsize )
    n = len(all_colours)
    for i, (label, col) in enumerate(all_colours.items()):
        y = n - i   # First palette at the top
        ax.scatter( list(range(len(col))), [y] * len(col), color = col )
        for j, colour in enumerate(col):
            ax.add_patch( plt.Rectangle( (j-.4, y-.4), .8, .8, fill=True, color=colour ) )
            ax.text( j, y, str(j), color='white', va='center', ha='center', weight='bold' )
        ax.text( -.5, y, label, ha='right' )
    ax.axis('off')
    if ax_was_None:
        fig.tight_layout()
        plt.show()


def test_colours(ax = None):
    """
    Show the qualitative colour palettes: those defined in this file
    and the qualitative Matplotlib colourmaps.

    Also check: test_colours_2

    Inputs: ax: where to plot the palettes; use None to create a new figure
    Output: None

    Example:

        test_colours()

    """
    all_colours = {
        "colours_db":            colours_db,
        "colours_quintile":      colours_quintile,
        'Pastel1':               plt.get_cmap('Pastel1').colors,
        'Pastel2':               plt.get_cmap('Pastel2').colors,
        'Paired':                plt.get_cmap('Paired').colors,
        'Accent':                plt.get_cmap('Accent').colors,
        'Dark2':                 plt.get_cmap('Dark2').colors,
        'Set1':                  plt.get_cmap('Set1').colors,
        'Set2':                  plt.get_cmap('Set2').colors,
        'Set3':                  plt.get_cmap('Set3').colors,
        'tab10':                 plt.get_cmap('tab10').colors,
        'tab20':                 plt.get_cmap('tab20').colors,
        'tab20b':                plt.get_cmap('tab20b').colors,
        'tab20c':                plt.get_cmap('tab20c').colors,
        "colours_kelly":         colours_kelly,
        "colours_glasbey":       colours_glasbey,
        "colors_green_armytage": colors_green_armytage,
        "colours_polychrome_36": colours_polychrome_36,
        "colours_alphabet":      colours_alphabet,
        "colours_dark24":        colours_dark24,
        "colours_light24":       colours_light24,
        "colours_sky":           colours_sky,
    }
    _show_palettes( all_colours, figsize = (20, len(all_colours)/2), ax = ax )


def test_colours_2(ax = None):
    """
    Show the "Open" and "Material" colour palettes

    Also check: test_colours

    References:
    - https://yeun.github.io/open-color/
    - https://github.com/rougier/scientific-visualization-book/blob/master/code/colors/open-colors.py
    - https://github.com/rougier/scientific-visualization-book/blob/master/code/colors/material-colors.py


    Inputs: ax: where to plot the palettes; use None to create a new figure
    Output: None

    Example:

        test_colours_2()

    """
    all_colours = {
        "open gray":   colours_open_gray,
        "open red":    colours_open_red,
        "open pink":   colours_open_pink,
        "open grape":  colours_open_grape,
        "open violet": colours_open_violet,
        "open indigo": colours_open_indigo,
        "open blue":   colours_open_blue,
        "open cyan":   colours_open_cyan,
        "open teal":   colours_open_teal,
        "open green":  colours_open_green,
        "open lime":   colours_open_lime,
        "open yellow": colours_open_yellow,
        "open orange": colours_open_orange,

        "material red":       colours_material_red,
        "material pink":      colours_material_pink,
        "material purple":    colours_material_purple,
        "material d_purple":  colours_material_d_purple,
        "material indigo":    colours_material_indigo,
        "material blue":      colours_material_blue,
        "material l_blue":    colours_material_l_blue,
        "material cyan":      colours_material_cyan,
        "material teal":      colours_material_teal,
        "material green":     colours_material_green,
        "material l_green":   colours_material_l_green,
        "material lime":      colours_material_lime,
        "material yellow":    colours_material_yellow,
        "material amber":     colours_material_amber,
        "material orange":    colours_material_orange,
        "material d_orange":  colours_material_d_orange,
        "material brown":     colours_material_brown,
        "material grey":      colours_material_grey,
        "material blue_grey": colours_material_blue_grey,
    }
    _show_palettes( all_colours, figsize = (6, len(all_colours)*.4), ax = ax )

In [None]:
def remove_scientific_notation_from_vertical_axis(ax, deprecated_argument=None):
    """
    Remove the scientific notation from the vertical axis tick labels.

    This is useful if the scale is logarithmic but spans less than
    one or two orders of magnitude.

    Labels of the form '$\\mathdefault{a\\times10^{b}}$' or '$\\mathdefault{10^{b}}$'
    are replaced with the plain number (4 significant digits);
    any other label (empty, already a plain number, etc.) is left unchanged.
    The figure is drawn first, so that the tick labels exist; the (major and minor)
    tick labels are then set explicitly.

    Inputs: ax: the Axes to modify
            deprecated_argument: do not use; the old version of this function was called
                as f(fig, ax): if a second argument is provided, it is used as the Axes
                and a DeprecationWarning is issued
    Output: None
    """

    if deprecated_argument is None:
        fig = ax.get_figure()
    else:
        # The old version of this function was taking fig, ax as argument...
        warnings.warn(
            "remove_scientific_notation_from_vertical_axis(fig, ax) is deprecated: only pass ax",
            DeprecationWarning,
            stacklevel = 2,
        )
        fig, ax = ax, deprecated_argument

    # The tick labels are only computed when the figure is drawn
    fig.canvas.draw()

    pattern = re.compile( r'\$\\mathdefault\{(?:(.*)\\times)?10\^\{(.*)\}\}\$' )

    def remove_scientific_notation(text = '$\\mathdefault{2\\times10^{-2}}$'):
        match = pattern.fullmatch(text)
        if match is None:
            # Not in scientific notation (empty label, plain number, etc.): keep it as is
            return text
        mantissa, exponent = match.groups()
        try:
            mantissa = float(mantissa) if mantissa else 1.0
            exponent = float( exponent.replace('\u2212', '-') )  # Unicode minus sign
            result = mantissa * 10 ** exponent
        except (ValueError, OverflowError):
            return text
        return f'{float(f"{result:.4g}"):g}'

    for minor, get_labels in (
        (False, ax.yaxis.get_ticklabels),
        (True,  ax.yaxis.get_minorticklabels),
    ):
        labels = get_labels()
        for label in labels:
            label.set_text( remove_scientific_notation( label.get_text() ) )
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', message = "FixedFormatter should only be used together with FixedLocator" )
            ax.yaxis.set_ticklabels(labels, minor=minor)

def corrplot(
        C,
        ax      = None,
        vmin    = -1,
        vmax    = +1,
        cmap    = 'RdBu',
        title   = None,
        figsize = (12, 12),
        order   = False,
        aspect  = None,
        labels  = False,
        ticks   = True,
        interpolation = 'nearest',
):
    """
    Plot a correlation matrix.

    Positive correlations are in blue, negative correlations in red.
    The correlations can be written in white in the cells -- if they are not legible, they are not significant.
    The rows and columns can be reordered, using a hierarchical clustering.

    Inputs: C:  DataFrame (or 2-dimensional array), part of a correlation matrix
            ax: where to put the plot; use None to create (and show) a new plot
            vmin, vmax: minimum and maximum, for the colour gradient; you probably want 0 to be in the middle of this interval
            cmap: colourmap; use a divergent colourmap, with white (or grey) in the middle
            title: str
            figsize: size of the figure, only used if ax is None
            order: boolean; whether to re-order the rows and the columns using hierarchical clustering;
                   when True, the matrix is also cleaned first (values clipped to [-1,+1],
                   non-finite values replaced with 0, diagonal set to 1 for a square matrix);
                   the input is not modified
            aspect: None for square cells, 'auto' for rectangular ones
            labels: boolean; whether to write the correlations in the cells (in white: if you cannot read them, they were not significant)
            ticks: boolean; whether to show the row and column names as tick labels
            interpolation: passed to imshow()
    Output: None

    Example:
        from vz import corrplot, LETTERS
        import numpy as np
        import pandas as pd
        x = np.random.normal(size=(100,11))
        x = pd.DataFrame( x, columns = LETTERS[:x.shape[1]] )
        C = x.corr()
        C = np.sign(C) * np.abs(C) ** (1/2)
        corrplot(C, figsize=(6,6), order=True, labels = True)
    """
    assert len(C.shape) == 2
    if not isinstance(C, pd.DataFrame):
        C = pd.DataFrame(C)
    if order:
        ## First, check and fix the correlation matrix
        C = C.copy()
        if np.any( ( C < -1 ) | ( C > +1 ) ):
            LOG( "Warning: the correlation matrix has values outside [-1,+1]" )
            C = np.clip(C, -1, 1)
        if np.any( ~ np.isfinite(C) ):
            LOG( "Warning: the correlation matrix contains infinite and/or missing values" )
            C[ ~ np.isfinite(C) ] = 0

        ## It can be a correlation matrix or just part of it
        if ( C.shape[0] == C.shape[1] ) and np.all( C.index == C.columns ):

            if np.any( np.diag(C) != 1 ):
                LOG( "Warning: the correlation matrix has values != 1 on the diagonal" )
                # Do not write into C.values: with pandas Copy-on-Write, it is a read-only view
                values = C.to_numpy(copy=True)
                np.fill_diagonal(values, 1)
                C = pd.DataFrame(values, index = C.index, columns = C.columns)

            #i = leaves_list( ward(pdist(C)) )            # Distances between the columns of the correlation matrix
            #i = leaves_list( ward(squareform(1-C) ) )    # (Squared) distance matrix from the correlation matrix
            i = leaves_list( ward(squareform( np.sqrt(1-C), checks=False )))
            C = C.iloc[i,i]

        else:
            i = leaves_list( ward(pdist(C)) )            # Distances between the columns of the correlation matrix
            j = leaves_list( ward(pdist(C.T)) )
            C = C.iloc[i,j]

    ax_was_None = ax is None
    if ax_was_None:
        fig, ax = plt.subplots( figsize = figsize )
    ax.imshow(C, vmin = vmin, vmax = vmax, cmap = cmap, aspect = aspect, interpolation = interpolation )

    if labels:
        for x in range(C.shape[1]):
            for y in range(C.shape[0]):
                ax.text( x, y, f"{C.iloc[y,x]:.2f}", ha='center', va='center', color='white' )

    if ticks:
        ax.set_xticks( range(C.shape[1]) )
        ax.set_xticklabels( C.columns, rotation = 90 )
        ax.set_yticks( range(C.shape[0]) )
        ax.set_yticklabels( C.index )
    else:
        ax.set_xticks([])
        ax.set_yticks([])

    if title is not None:
        ax.set_title(title)
    if ax_was_None:
        plt.show()

def mfrow(
        n:      int,
        aspect: float = 29.7/21,
        width:  float = 29.7,
        height: float = 21,
        pages:  int   = 1
):
    """
    Choose a grid (rows, columns) to lay out n subplots.

    Every possible number of columns, from 1 to n, is tried; the one whose
    cells have the aspect ratio closest to the desired one is kept
    (the smallest number of columns wins in case of a tie).
    The number of rows is the smallest multiple of `pages` sufficient to hold the n plots.

    Also see: remove_empty_axes

    Inputs:  n: number of subplots
             aspect: desired aspect ratio (width/height) of the subplots
             width: width of the (super)plot
             height: height of the (super)plot
             pages: number of (super)plots (untested -- I think it does not do what I want)
    Outputs: nr: Number of rows
             nc: Number of columns
             (1,1) if n is zero
    """
    layout = (1, 1)
    smallest_gap = math.inf
    for columns in range(1, n + 1):
        rows = math.ceil( n / columns / pages ) * pages
        cell_aspect = ( width / columns ) / ( height / rows )
        gap = abs( cell_aspect - aspect )
        if gap < smallest_gap:
            layout, smallest_gap = (rows, columns), gap
    return layout

def uniformize1(x: np.ndarray) -> np.ndarray:
    """
    Uniformize a 1-dimensional NumPy array

    The values are replaced with (rank - 1/2) / (largest rank), which is in (0,1);
    ties receive their average rank; missing values remain missing.

    Input:  x: 1-dimensional numeric array
    Output: 1-dimensional array of floats, with the same length
    """
    assert len( x.shape ) == 1
    if x.size == 0:
        # np.nanmax would fail on an empty array
        return np.empty( 0, dtype = float )
    a = scipy.stats.rankdata( x, nan_policy = 'omit' )
    a = np.where( np.isnan(x), np.nan, a )   # np.NaN no longer exists in NumPy 2
    y = (a - .5) / np.nanmax(a)  # Between 0 and 1, excluding 0 and 1
    return y

def uniformize2(x: np.ndarray) -> np.ndarray:
    """
    Uniformize the columns of a 2-dimensional NumPy array

    Input:  x: 2-dimensional numeric array
    Output: array of floats, with the same shape; each column is uniformized separately
    """
    assert len(x.shape) == 2
    # Always floats: copying x would truncate the result to 0 for an integer input
    y = np.empty( x.shape, dtype = float )
    for i in range(x.shape[1]):
        y[:,i] = uniformize1( x[:,i] )
    return y

def uniformize(x):
    """
    Uniformize 1-dimensional (or the columns of a 2-dimensional) NumPy array or Pandas Series or DataFrame

    Input:  x: NumPy array, Pandas Series or DataFrame, with 1 or 2 dimensions
    Output: object of the same kind, with the same shape (and index/columns), containing floats in (0,1)
    """
    if len(x.shape) == 1:
        if isinstance(x, pd.Series):
            y = uniformize1(x.values)
            y = pd.Series(y, index = x.index)
        else:
            y = uniformize1(x)
    else:
        assert len(x.shape) == 2, f"Expecting a 1- or 2-dimensional array, got a {len(x.shape)}-dimensional one..."
        if isinstance(x, pd.DataFrame):
            y = uniformize2(x.values)
            y = pd.DataFrame(y, index = x.index, columns = x.columns)
        else:
            y = uniformize2(x)
    return y

# Configure the root logger: timestamped messages, at level INFO and above.
logging.basicConfig(
    format  = '%(asctime)-15s %(message)s',
    datefmt = '%Y-%m-%d %H:%M:%S',
    level   = logging.INFO,
)

def LOG(*args) -> None:
    """
    Log a message, at the INFO level, with the root logger
    (with the configuration above, it is timestamped and goes to stderr).
    You can also use logging.info() directly.

    Inputs: args: passed unchanged to logging.info(): the message,
                  optionally followed by %-style formatting arguments
    Output: None
    """
    logging.info(*args)

def get_cmap(cmap):
    """
    Return a Matplotlib colourmap, from its name, a list of colours, or a colourmap.

    This is similar to matplotlib.cm.get_cmap, but a list (or NumPy array) of colours
    is also accepted: it is converted to a ListedColormap.

    Input: cmap: string
                 or list of colours
                 or matplotlib colormap
    Output: matplotlib colormap

    Examples:
        from adia.vz import colours_polychrome_36
        get_cmap("tab10")
        get_cmap( get_cmap("tab10") )
        get_cmap( colours_polychrome_36 )
    """
    if isinstance( cmap, (list, np.ndarray) ):
        colours = matplotlib.colors.ListedColormap(cmap)
        return matplotlib.colormaps.get_cmap(colours)
    return matplotlib.colormaps.get_cmap(cmap)

def remove_empty_axes(axs: np.ndarray) -> None:
    """
    Hide the subplots in which nothing was plotted.

    A subplot is considered empty if it has no lines, no collections
    and has_data() is false; its axes are then switched off.

    Inputs: axs: numpy array, returned by plt.subplots()
    Output: None

    Examples:
        import matplotlib.pyplot as plt
        import numpy as np
        fig, axs = plt.subplots(2,2)
        x = np.random.normal(size=20)
        y = np.random.normal(size=20)
        axs[0,0].scatter(x, y)
        axs[1,0].scatter(x, y)
        axs[0,1].scatter(x, y)
        remove_empty_axes(axs)
        fig.tight_layout()
        plt.show()
    """
    for subplot in axs.ravel():
        used = subplot.lines or subplot.collections or subplot.has_data()
        if not used:
            subplot.axis('off')

def fractiles(x, n=5):
    """
    Compute the fractiles of each column of x.

    Inputs: x: list, 1- or 2-dimensional NumPy array, Pandas Series or DataFrame
            n: number of fractiles (5 for quintiles, 10 for deciles, etc.)
    Output: object of the same kind and shape as x, with values 1,2,...,n (and np.nan
            for missing values); for a 2-dimensional input, each column is processed
            separately and the result contains floats
    """
    if isinstance( x, list ) or len(x.shape) == 1:
        ## 1-dimensional object
        return fractiles_1(x,n)
    assert len(x.shape) == 2
    if isinstance( x, pd.DataFrame ):
        values = fractiles( x.to_numpy(dtype = float, na_value = np.nan), n )
        return pd.DataFrame( values, index = x.index, columns = x.columns )
    # Floats, to be able to store missing values, even if the input contains integers
    is_float = isinstance(x.dtype, np.dtype) and np.issubdtype(x.dtype, np.floating)
    y = x.astype( x.dtype if is_float else float )
    for i in range( x.shape[1] ):
        y[:,i] = fractiles_1( x[:,i], n )
    return y

def fractiles_1(x, n=5):
    """
    Fractiles of a 1-dimensional object (list, numpy array, pandas Series).
    The output is a vector (or list, etc.) with elements 1,2,...,n (and np.nan):
    the numbering does NOT start at 0.

    A value equal to one of the inner quantiles is put in the lower fractile.
    For a list, the output is a list of integers (and np.nan);
    for an array or a Series, the output contains floats (the index and name of a Series are preserved).
    """
    is_list = isinstance(x, list)
    assert is_list or len(x.shape) == 1
    p = np.linspace(0,100,n+1)           # 0, 1/5, 2/5, 3/5, 4/5, 1
    q = np.nanpercentile(x, p)           # Corresponding quantiles
    q[0], q[n] = q[0] - 1, q[n] + 1      # Make sure that the first (last) value is lower (larger) than the minimum (maximum), to avoid problems with strict inequalities
    # Fractiles (what we want); missing values end up at 0, because all comparisons with NaN are false
    z = [ bisect_left(q,u) for u in x ]
    if is_list:
        return [ u if u != 0 else np.nan for u in z ]
    # Floats, to be able to put back the missing values, even if the input contains integers
    is_float = isinstance(x.dtype, np.dtype) and np.issubdtype(x.dtype, np.floating)
    y = np.array( z, dtype = x.dtype if is_float else float )
    y[ y == 0 ] = np.nan
    if isinstance(x, pd.Series):
        # Preserve the additional structure (index, name)
        y = pd.Series( y, index = x.index, name = x.name )
    return y

# Numeric data 

## Simulated data

In [None]:
x = np.random.normal( size = 10_000 )
figsize = (3,3)
dpi = 100

# Two histograms of the same simulated feature: first on its raw (log-normal) scale,
# then after taking the logarithm (Gaussian).
for values, xlabel in [ ( np.exp(x), "raw feature" ), ( x, "log( raw feature )" ) ]:
    fig, ax = plt.subplots( figsize = figsize, layout = 'constrained', dpi = dpi )
    ax.hist( values, bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True )
    # Only keep the horizontal axis
    for side in ('left', 'top', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_xlabel(xlabel)
    plt.show()

## (Real) financial data: see other notebook (requires an older version of Pandas to download the data)

# Images

## Average / dominant colour of an image

In [None]:
def plot_colours(img, k=50, ax=None, **kwargs):
    """
    Plot a sample of the colours of an image, on a k×k grid, with similar colours close to each other.

    k² pixels are sampled (with replacement only if the image has fewer than k² pixels),
    projected on a plane with a PCA, and moved to the points of a regular grid
    with optimal transport.

    Inputs: img: PIL image; converted to RGB unless it is already RGB or RGBA
            k: size of the grid
            ax: where to put the plot; use None to create (and show) a new plot
            kwargs: ignored (they used to be passed to UMAP, which has been replaced with a PCA)
    Output: None
    """
    ax_was_None = ax is None

    # Grayscale or palette images do not have one column per colour channel
    if img.mode not in ("RGB", "RGBA"):
        img = img.convert("RGB")
    x = np.asarray( img )
    x = x.reshape( -1, x.shape[-1] )   # One row per pixel
    n = k ** 2
    i = np.random.choice( x.shape[0], n, replace = x.shape[0] < n )
    x = x[i,:]
    # Jitter, so that identical colours are not exactly on top of one another
    x = pd.DataFrame( x + np.random.uniform(size = x.shape) )

    #xy = UMAP(**kwargs).fit_transform( x )
    xy = sklearn.decomposition.PCA(2).fit_transform( x )  # Here, a linear transformation works much better

    # Regular k×k grid in the unit square
    target = np.linspace(0,1,k)
    target = np.meshgrid( target, target )
    target = [ u.flatten() for u in target ]
    target = np.array(target).T

    # Assign each point to a grid point, using optimal transport between two uniform distributions
    cost = ot.dist(xy,target)
    cost /= cost.max()
    uniform = np.ones((n,)) / n
    G0 = ot.emd(uniform, uniform, cost)
    i = G0.argmax( axis = 1 )
    xy = target[i,:]

    if ax_was_None:
        fig, ax = plt.subplots(figsize = (3,3), layout = 'constrained' )
    ax.scatter( xy[:,0], xy[:,1], s = 50, color = x.values / 256 )
    ax.axis('off')
    if ax_was_None:
        plt.show()

# First row: the images; second row: their colours
fig, axs = plt.subplots( 2, 3, figsize = (6,4), layout = 'constrained', dpi = 100 )
for j, (ax_image, ax_palette) in enumerate( zip(axs[0], axs[1]) ):
    img = Image.open( f"tmp{j+1}.jpg" )
    ax_image.imshow(img)
    ax_image.axis('off')
    plot_colours(img, ax=ax_palette, min_dist=.5)
plt.show()

In [None]:
# First figure: the images
fig, axs = plt.subplots( 1, 3, figsize = (6,2), layout = 'constrained', dpi = 100 )
for j in range(3):
    img = Image.open( f"tmp{j+1}.jpg" )
    axs[j].imshow(img)
    axs[j].axis('off')
plt.show()
# Second figure: their colours
fig, axs = plt.subplots( 1, 3, figsize = (6,2), layout = 'constrained', dpi = 100 )
for j in range(3):
    img = Image.open( f"tmp{j+1}.jpg" )
    plot_colours(img, ax=axs[j], min_dist=.5)
    # Set the limits of the subplot just drawn (not those of a stale, global `ax`)
    axs[j].set_xlim(0,1)
    axs[j].set_ylim(0,1)
plt.show()

In [None]:
# Try to change the UMAP parameters...
# Sample k² pixels from the image and plot their 2-dimensional UMAP embedding,
# changing one UMAP parameter at a time (one figure per parameter, one subplot per value).
k = 20
img = Image.open( f"tmp1.jpg" )
for field, values in { 
    'n_neighbors': [5, 15, 50],
    'learning_rate': [ .3, 1.0, 2.0 ],
    'min_dist': [ .03, .1, .5 ],
}.items(): 
    fig, axs = plt.subplots( 1, 3, figsize = (9,3), layout = 'constrained' )
    for j, value in enumerate(values): 
        ax = axs[j]
        x = np.array( img.getdata() )
        # A new random sample of pixels is drawn for each subplot
        i = np.random.choice( x.shape[0], k**2, replace = False )
        x = x[i,:]
        # Jitter the colours with uniform noise in [0,1)
        x = pd.DataFrame(x + np.random.uniform(size = x.shape) ).head(k ** 2)
        xy = UMAP( **{field: value} ).fit_transform( x )
        # Each point is drawn in the (jittered) colour of its pixel
        ax.scatter( xy[:,0], xy[:,1], alpha=.5, color = x.values / 256  )
        ax.axis('off')
        ax.set_title( f"{field} = {value}" )
    plt.show()

# Linear dimension reduction (PCA) works better...
# Same plot, with the first two principal components instead of UMAP.
k = 20
img = Image.open( f"tmp1.jpg" )
fig, ax = plt.subplots( figsize = (3,3), layout = 'constrained' )
x = np.array( img.getdata() )
i = np.random.choice( x.shape[0], k**2, replace = False )
x = x[i,:]
x = pd.DataFrame(x + np.random.uniform(size = x.shape) ).head(k ** 2)
xy = sklearn.decomposition.PCA(2).fit_transform( x )
ax.scatter( xy[:,0], xy[:,1], alpha=.5, color = x.values / 256  )
ax.axis('off')
ax.set_title( "PCA" )
plt.show()

## Image embeddings

In [None]:
from urllib.request import urlopen
from PIL import Image
import timm

# Pretrained VGG19, without its classification layer, in evaluation mode:
# the output of the model is the embedding of the image.
model = timm.create_model(
    'vgg19.tv_in1k',
    pretrained=True,
    num_classes=0,  # remove classifier nn.Linear
).eval()
# Preprocessing (inference-time transforms) matching the model's data configuration
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(is_training=False, **data_config)
#output = model(transforms(img).unsqueeze(0))  # output is (batch_size, num_features) shaped tensor

In [None]:
# Images to skip
duplicates = {
    'tmp3.jpg', 'tmp2.jpg', 'tmp1.jpg', 'i07.jpg',
    'text1.jpg',   # White border
}
# Paths of the remaining JPEG images, in a random order
files = [
    f"images/{name}"
    for name in os.listdir("images")
    if name not in duplicates and name.endswith(".jpg")
]
np.random.shuffle( files )
# Will be filled with: file path -> embedding
embeddings = {}

def read_image(file):
    """
    Open an image file and return its central square crop.

    Copilot (DALL-E) does not let me choose the aspect ratio: it used to
    produce square images and now only produces 7×4 ones, so landscape
    images are cropped to a centred square whose side is the image height.
    Portrait images are rejected with an AssertionError.
    """
    picture = Image.open(file)
    width, height = picture.size
    assert width >= height, f"I expect the images to have a square or landscape aspect ratio: {file} has a portrait aspect ratio, {width}×{height}"
    # Horizontal offset that centres a height×height window
    left = ( width - height ) // 2
    return picture.crop( (left, 0, left + height, height) )
    
import torch  # local import: the name `torch` is not imported in the visible part of the file

# Embed every image with the pretrained network.
# Inference only: disabling autograd avoids building a computation graph
# for each forward pass.
with torch.no_grad():
    for file in tqdm(files):
        img = read_image(file)
        # unsqueeze(0) adds the batch dimension the model expects
        output = model(transforms(img).unsqueeze(0))
        embeddings[ file ] = output.numpy().flatten()
# One row per file, one column per feature
embeddings = pd.DataFrame( embeddings ).T

In [None]:
# 2-D UMAP coordinates of the image embeddings, standardised per axis
projection = UMAP().fit_transform( embeddings )
xy = pd.DataFrame( projection, index = embeddings.index )
xy = ( xy - xy.mean() ) / xy.std()
# Show the coordinates of a few example images
examples = [
    'images/cat.jpg',
    'images/fox1.jpg',
    'images/lemon tart.jpg',
]
xy.loc[ examples, : ].round(2)

In [None]:
# Disabled cell: shows one of the excluded images (flip the condition to run it)
if False: 
    file = "images/text1.jpg" 
    img = Image.open(file)
    plt.imshow(img)

In [None]:
# Draw each image at its UMAP position (thumbnails of side d, in data units)
xy = UMAP().fit_transform( embeddings )
xmin, ymin = xy.min(axis=0)
xmax, ymax = xy.max(axis=0)
d = .4

fig, ax = plt.subplots( figsize = (9,9), dpi = 100)
# The rows of xy follow the order of `files` (the embeddings were built in that order)
for file, (x, y) in zip( files, xy ):
    thumbnail = read_image( file )
    ax.imshow( thumbnail, extent = [x, x+d, y, y+d] )
# Leave room for the thumbnails anchored at the largest coordinates
ax.set_xlim( xmin, xmax+d )
ax.set_ylim( ymin, ymax+d )
ax.axis('off')
plt.show()

In [None]:
def griddify(xy, nr = None, nc = None):
    """
    Move a 2-dimensional point cloud onto a regular grid in the unit square.

    The points are matched to grid nodes by solving an optimal transport
    problem (uniform weights on both sides, cost given by `ot.dist`) and
    each point is sent to the node receiving most of its mass.

    Parameters
    ----------
    xy : numpy array of shape (n, 2).
    nr : number of grid positions along the first coordinate (x).
    nc : number of grid positions along the second coordinate (y).
        Despite the names, `nr` therefore counts the columns of the drawn
        grid and `nc` its rows: `griddify(xy, 16, 9)` is 16 wide and 9 high.
        If both are None, the largest square grid with at most n nodes is
        used. If only one is given, the other is n // given (at least 1).

    Returns
    -------
    Array of shape (m, 2) with m = min(n, nr*nc): the grid position of each
    of the first m points (the remaining points are dropped).
    """
    assert isinstance( xy, np.ndarray )
    assert len(xy.shape) == 2
    assert xy.shape[1] == 2
    n = xy.shape[0]
    if nr is None and nc is None:
        # Square grid
        nr = nc = math.floor( math.sqrt( n ) )
    elif nr is None:
        nr = max( 1, n // nc )
    elif nc is None:
        nc = max( 1, n // nr )
    # Only keep nr*nc points
    xy = xy[:nr*nc,:]
    n = xy.shape[0]
    if n == 0:
        return np.empty( (0, 2) )

    # Grid nodes, enumerated along x first; if there are fewer points than
    # nodes, only the first n nodes are used
    gx, gy = np.meshgrid(
        np.linspace(0,1, nr),
        np.linspace(0,1, nc),
    )
    target = np.column_stack( [ gx.flatten(), gy.flatten() ] )[:n,:]

    cost = ot.dist(xy,target)
    # Normalise the cost; skip it when every cost is zero (avoids 0/0)
    largest = cost.max()
    if largest > 0:
        cost /= largest
    uniform = np.ones((n,)) / n
    G0 = ot.emd(uniform, uniform, cost)
    # Node receiving the largest share of each point's mass
    i = G0.argmax(axis=1)
    return target[i,:]

# Grid size: kx positions along x, ky along y
kx, ky = 16, 9
#kx, ky = 9,9
# Grid position of each point of the current `xy` array
XY = griddify(xy, kx, ky)

In [None]:
# Draw the images on the grid computed by griddify.
# The limits below are fixed to the unit square, so these extrema are not
# used for this figure; they are kept because they are notebook globals.
xmin, ymin = XY.min(axis=0)
xmax, ymax = XY.max(axis=0)
# Cell size: grid step, shrunk by 4% to leave a thin gap between images
dx = 1/(kx-1) * .96
dy = 1/(ky-1) * .96

fig, ax = plt.subplots( figsize = (kx/2,ky/2), dpi = 300 )
# zip stops at the shorter sequence: files without a grid position are skipped
for file, (x, y) in zip( files, XY ):
    img = read_image( file )
    ax.imshow( img, extent = [x,x+dx,y,y+dy] )
# The grid spans [0,1]; add one cell for the images anchored at 1
ax.set_xlim( 0, 1+dx )
ax.set_ylim( 0, 1+dy )
ax.axis('off')
# imshow forces an equal aspect: let the axes fill the figure instead
ax.set_aspect('auto')
plt.show()

In [None]:
# Mean embedding of each labelled group (file names such as Cat1.jpg, day_3.jpg)
expr = r"^images/(Cat|Dog|night|day)_?[0-9]+[.]jpg$"
ids = {}
for u in embeddings.index:
    found = re.match( expr, u )
    if found:
        ids.setdefault( found.group(1), [] ).append( embeddings.loc[u,:] )
e = { group: np.vstack(vectors_).mean(axis=0) for group, vectors_ in ids.items() }
# Two directions in embedding space: cat-vs-dog and day-vs-night
e1 = e['Cat'] - e['Dog']
e2 = e['day'] - e['night']

# Project every image whose name starts with one of the labels
# (including combined ones) on these two directions
expr = r"^images/(Cat|Dog|night|day).+[.]jpg$"
i = [ u for u in embeddings.index if re.match( expr, u ) ]
xy = {
    u: ( np.dot( e1, embeddings.loc[u,:] ), np.dot( e2, embeddings.loc[u,:] ) )
    for u in i
}
xy = pd.DataFrame( xy ).T
xy.columns = ['x', 'y']

def lighten_image(img, a=.7):
    """
    Return a washed-out copy of an image, blended with white.

    Every channel value v becomes int( (1-a) * v + a * 255 ): a = 0 leaves
    the colours unchanged, a = 1 gives a white image.

    The image is first converted to RGB, so greyscale, palette, CMYK or
    RGBA inputs are accepted as well (merging 3 bands under the input's own
    mode failed for them); any alpha channel is dropped. The result is
    always an RGB image and the input is not modified.
    """
    # https://stackoverflow.com/questions/31360526/low-the-brightness-of-an-image-using-pillow
    def blend(v):
        return int( (1-a) * v + a * 255 )
    # point() applies the same lookup function to every band
    return img.convert('RGB').point(blend)
    
# Rescale both projections to [0, 1]
xy = ( xy - xy.min() ) / ( xy.max() - xy.min() )
d =.15
aspect = 16/9

# Read (and lighten) every image once, not once per figure
plain = { file: read_image( file ) for file in xy.index }
faded = { file: lighten_image( picture ) for file, picture in plain.items() }

# One figure per pattern: matching images in full colour, the others lightened
for expr in [ 
    r'^.*',
    r'^images/Cat_?[0-9]+[.]jpg$',
    r'^images/Dog_?[0-9]+[.]jpg$',
    r'^images/night_?[0-9]+[.]jpg$',
    r'^images/day_?[0-9]+[.]jpg$',
    r'^images/Cat_Night_?[0-9]+[.]jpg$',
]:
    fig, ax = plt.subplots( figsize = (9*aspect,9), dpi = 100)
    highlighted = [ file for file in xy.index if re.match( expr, file ) ]
    background  = [ file for file in xy.index if not re.match( expr, file ) ]
    # Background first, so that the highlighted images end up on top
    for images, group in [ (faded, background), (plain, highlighted) ]:
        for file in group:
            x, y = xy.loc[file,:]
            # Width d/aspect: with set_aspect(1/aspect) the images are drawn square
            ax.imshow( images[file], extent = [x,x+d/aspect,y,y+d] )
    ax.axis('off')
    # Coordinates are in [0,1]; leave room for the images anchored at 1
    ax.set_xlim( 0, 1+d )
    ax.set_ylim( 0, 1+d )
    ax.set_aspect(1/aspect)
    plt.show()

In [None]:
# Scratch cell: check the file-name pattern on one example, then list the plotted files
file = 'images/Cat1.jpg'
re.match( r'^images/Cat_?[0-9]+[.]jpg$', file )
xy.index

In [None]:
# An image is a table of numbers: overlay the grey levels of a k×k thumbnail on the picture
img = Image.open("images/cat.jpg")

k = 12

# Washed-out k×k thumbnail (blended with white, weight la), drawn over the picture
la = .8
a = img
a = a.resize((k,k))
a = ( 1 - la ) * np.asarray( a ) + la * 255
a = np.floor(a)
a = a.astype(np.uint8)
img2 = Image.fromarray(a)

# Grey levels of the k×k thumbnail: the numbers to print
a = img.resize((k,k))
a = a.convert('L')
a = np.asarray(a)

n_rows, n_cols = a.shape
fig, ax = plt.subplots(figsize = (5,5), dpi = 100)
ax.imshow( img,  extent = [0, n_cols, 0, n_rows] )
ax.imshow( img2, extent = [0, n_cols, 0, n_rows], alpha = .8 )
for i in range(n_rows):
    for j in range(n_cols):
        # imshow draws array row 0 at the TOP of the extent, while y grows
        # upwards: flip the row index so each number sits on its own pixel
        ax.text( j+.5 , n_rows - i - .5, str(a[i,j]), va = 'center', ha = 'center' )
for i in range(n_rows):
    ax.axhline( i, color = 'white' )
for j in range(n_cols):
    ax.axvline( j, color = 'white' )
ax.axis('off')
plt.show()

## MNIST

In [None]:
# Download MNIST from OpenML; as_frame=False returns arrays rather than DataFrames
from sklearn.datasets import fetch_openml
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)

In [None]:
# First sample, reshaped from a flat row to a 28×28 image
I = X[0,:].reshape((28,28))
#plt.imshow(I, cmap='Blues')

In [None]:
# 2-D UMAP projection of all the digits (PCA alternative kept below, commented out)
xy = UMAP().fit_transform(X) 
#xy = PCA(n_components=2).fit_transform(X)

In [None]:
# The same MNIST projection, coloured by digit with each candidate qualitative palette.
# Values are either colour lists defined earlier or matplotlib colormap names.
all_colours = {
        "colours_db":            colours_db,
        "colours_quintile":      colours_quintile,
        'Pastel1':               'Pastel1',
        'Pastel2':               'Pastel2',
        'Paired':                'Paired',
        'Accent':                'Accent',
        'Dark2':                 'Dark2',
        'Set1':                  'Set1',
        'Set2':                  'Set2',
        'Set3':                  'Set3',
        'tab10':                 'tab10',
        'tab20':                 'tab20',
        'tab20b':                'tab20b',
        'tab20c':                'tab20c',
        "colours_kelly":         colours_kelly,
        "colours_glasbey":       colours_glasbey,
        "colors_green_armytage": colors_green_armytage,
        "colours_polychrome_36": colours_polychrome_36,
        "colours_alphabet":      colours_alphabet,
        "colours_dark24":        colours_dark24,
        "colours_light24":       colours_light24,
        "colours_sky":           colours_sky,
    }
nr, nc = mfrow( len( all_colours ) )
fig, axs = plt.subplots( nr, nc, figsize = (16,16), layout = 'constrained' )
for i, title in enumerate( all_colours ):
    colours = all_colours[title]
    ax = axs.flatten()[i]
    ax.scatter(
        xy[:,0], xy[:,1],
        alpha = .01, s = 5,
        c = y.astype(int),
        cmap = get_cmap( colours ),
    )
    ax.set_title( title )
    ax.axis('off')
# Hide the panels left unused when the grid has more cells than palettes
remove_empty_axes(axs)
plt.show()

In [None]:
# Density-based clustering of the 2-D projection; `c` holds one cluster label per point
from sklearn.cluster import DBSCAN
c = DBSCAN(min_samples=100).fit(xy).labels_

In [None]:
# Rescale the coordinates to the desired aspect ratio (a 16×9 box): if we do not
# want the images to be deformed, we need to keep set_aspect(1) when plotting
lowest = xy.min(axis=0)
xy = xy - lowest
widest = xy.max(axis=0)
xy = xy / widest
xy *= np.array([16,9])

In [None]:
# Mirror the projection horizontally (in place: running this cell twice undoes it)
#SAVE = xy.copy()
xy[:,0] = - xy[:,0]

In [None]:
# DBSCAN uses a fixed radius: scaling the data by s is a way of trying different radii
scales = [.1,.2,.3,.5,.7,.8,.9,1,1.5,2,2.5,3,3.5,4,5,6,7,8,9,10]
for s in scales:
    clustering = DBSCAN(min_samples=1000)
    c = clustering.fit_predict( xy*s )
    fig, ax = plt.subplots( figsize = (3,3), layout = 'constrained' )
    ax.scatter(
        xy[:,0], xy[:,1],
        alpha = .02, s = 20,
        c = c, cmap = get_cmap('tab10'),
    )
    ax.set_title( s )
    plt.show()

In [None]:
# Five versions of the MNIST projection:
#   0: plain points, 1: coloured by cluster (`c`), 2: coloured by digit,
#   3: coloured by digit with the first 1000 digit images on top, 4: images only
for which in [0,1,2,3,4]: 
        
    fig, ax = plt.subplots( figsize = (16,9), layout = 'constrained', dpi = 100 )
    if which == 0:
        ax.scatter( xy[:,0], xy[:,1], alpha = .02, s = 20 )
    if which == 1:
        ax.scatter( xy[:,0], xy[:,1], alpha = .02, s = 20, c = c, cmap = get_cmap('tab10') )
    if 2 <= which <= 3:
        ax.scatter( xy[:,0], xy[:,1], alpha = .02, s = 20, c = y.astype(int), cmap = get_cmap('tab10') )
    if which >= 3:
        for i in range(1000): 
            img = X[i,:].reshape((28,28))
            x_, y_ = xy[i,:]
            # Half-size of a digit image, in data units
            dx = dy = .3 / 2
            ax.imshow( 
                img, 
                extent = [ 
                    x_ - dx,
                    x_ + dx,
                    y_ - dy,
                    y_ + dy,
                ], 
                cmap = 'Greys', 
                #vmin=0, vmax=256,
            )
    ax.axis('off')
    xmin, ymin = xy.min(axis=0)
    xmax, ymax = xy.max(axis=0)
    # dx, dy are reused here with another meaning: a 4% margin around the data
    dx = .04 * (xmax - xmin )
    dy = .04 * (ymax - ymin )
    ax.set_xlim( xmin - dx, xmax + dx)
    ax.set_ylim( ymin - dy, ymax + dy )
    ax.set_aspect(1)  # Otherwise, the images are deformed...
    plt.show()

# Text

## Simple features

In [None]:
kantlipsum = """As any dedicated reader can clearly see, the Ideal of practical reason is a representation
of, as far as I know, the things in themselves; as I have shown elsewhere, the phenomena
should only be used as a canon for our understanding. The paralogisms of practical
reason are what first give rise to the architectonic of practical reason. As will easily be
shown in the next section, reason would thereby be made to contradict, in view of these
considerations, the Ideal of practical reason, yet the manifold depends on the phenomena.
Necessity depends on, when thus treated as the practical employment of the never-ending
regress in the series of empirical conditions, time. Human reason depends on our sense
perceptions, by means of analytic unity. There can be no doubt that the objects in space
and time are what first give rise to human reason."""

trump_tweet = """Time Magazine called to say that I was PROBABLY going to be named “Man (Person) of the Year,” like last year, but I would have to agree to an interview and a major photo shoot. I said probably is no good and took a pass. Thanks anyway!"""
trump2 = """So interesting to see “Progressive” Democrat Congresswomen, who originally came from countries whose governments are a complete and total catastrophe, the worst, most corrupt and inept anywhere in the world (if they even have a functioning government at all), now loudly......"""

financial_news = """Unprecedented demand for Nvidia's chips and data center services has fueled a new wave of growth for the company. With shares up over 220% in the last year, many investors probably think they've missed the boat."""

seuss = """
At the far end of town where the Grickle-grass grows and the wind smells slow-and-sour when it blows and no
birds ever sing excepting old crows... is the Street of the Lifted Lorax.
And deep in the Grickle-grass, some people say, if you look deep enough you can still see, today, where the
Lorax once stood just as long as it could before somebody lifted the Lorax away.
What was the Lorax? Any why was it there? And why was it lifted and taken somewhere from the far end of
town where the Grickle-grass grows? The old Once-ler still lives here.
Ask him, he knows.
"""

In [None]:

def get_tense(text):
    """
    Crude tense score of a text: (future - past) / (present + future + past).

    The text is tokenized and part-of-speech tagged with NLTK, then
    - past    counts the tags VBD and VBN,
    - present counts the tags VB, VBG, VBP and VBZ,
    - future  counts the modals "will", "shall", "'ll" and "wo" (from "won't").
    The tag set has no future tag, so the future is detected through these
    modals, matched case-insensitively and only when tagged MD.

    Returns a number between -1 (only past forms) and +1 (only future
    markers), or 0 if the text contains none of these forms.
    """
    future_markers = {"will", "shall", "'ll", "wo"}
    past_tags      = {"VBN", "VBD"}
    present_tags   = {"VB", "VBG", "VBP", "VBZ"}

    tagged = pos_tag( word_tokenize(text) )
    future  = sum( 1 for token, tag in tagged if tag == "MD" and token.lower() in future_markers )
    past    = sum( 1 for _, tag in tagged if tag in past_tags )
    present = sum( 1 for _, tag in tagged if tag in present_tags )
    n = present + future + past
    if n == 0:
        return 0
    return ( future - past ) / n

In [None]:
# A few hand-crafted features for each sample text.
# Build the sentiment analyzer once instead of once per text.
sia = SIA()
for text in [ kantlipsum, trump_tweet, trump2, financial_news, seuss ]:
    print( {
        'numbers':     re.search( r'[0-9]', text ) is not None,
        'exclamation': '!' in text,
        # At least 5 consecutive capital letters
        'all_caps':    re.search( r'[A-Z]{5}', text ) is not None,
        'readability': textstat.gunning_fog(text),
        'sentiment':   sia.polarity_scores(text)['compound'],
        'tense':       np.round( get_tense(text), 3 ),
    } )

In [None]:
# First tokens of each document: lower case, letters only, split on everything else
samples = {
    'kant': kantlipsum, 
    'seuss': seuss, 
    'news': financial_news, 
    'trump': trump_tweet 
}
for label, text in samples.items():
    text = re.sub( r'[^A-Za-z]+', ' ', text.lower() ).split()
    preview = pd.DataFrame( { label: text } )
    display( preview.head(13) )

In [None]:
# Document-term matrix: one row per word, one column per document, word counts as values.
# The words are counted directly, instead of building a table with one row
# per token and one column per distinct word and summing it afterwards.
counts = {}
for label, text in {
    'kant': kantlipsum, 
    'seuss': seuss, 
    'news': financial_news, 
    'trump': trump_tweet 
}.items():
    tokens = re.sub( r'[^A-Za-z]+', ' ', text.lower() ).split()
    counts[label] = Counter( tokens )
# Words absent from a document get a count of 0
d = pd.DataFrame( counts ).fillna(0).astype(int)
# Words and documents in alphabetical order
d = d.sort_index().sort_index( axis = 1 )
d.index.name = 'word'
d.columns.name = 'document'
d

In [None]:
# Drop the rows of the document-term matrix that are English stop words
from nltk.corpus import stopwords
remove = stopwords.words('english')
keep = list( filter( lambda word: word not in remove, d.index ) )
d = d.loc[ keep, : ]
d.head(15)

In [None]:
# Order the words by decreasing total count over all documents
i = d.sum(axis=1).sort_values(ascending=False).index
d = d.loc[i,:]
d.head(15)

In [None]:
# Same document-term matrix, with the stop words removed and bigrams added.
# The n-grams are counted directly rather than through a table with one row
# per occurrence and one column per distinct n-gram.
counts = {}
for label, text in {
    'kant': kantlipsum, 
    'seuss': seuss, 
    'news': financial_news, 
    'trump': trump_tweet 
}.items():
    tokens = re.sub( r'[^A-Za-z]+', ' ', text.lower() ).split()
    tokens = [ u for u in tokens if u not in remove ]
    # Bigrams are built after stop-word removal: they may span a removed word
    bigrams = [ f"{w1} {w2}" for w1, w2 in zip( tokens, tokens[1:] ) ]
    counts[label] = Counter( bigrams + tokens )
d = pd.DataFrame( counts ).fillna(0).astype(int)
d = d.sort_index().sort_index( axis = 1 )
d.index.name = 'word'
d.columns.name = 'document'

# Most frequent n-grams first
i = d.sum(axis=1).sort_values(ascending=False).index
d = d.loc[i,:]
d.head(15)

In [None]:
# Most common words and bigrams of one text, counted with a Counter
from nltk.corpus import stopwords

remove = stopwords.words('english')

# Lower case, letters only, stop words removed
text = re.sub( r'[^A-Za-z]+', ' ', kantlipsum.lower() ).split()
text = [ w for w in text if w not in remove ]

bigrams = [ " ".join( pair ) for pair in zip( text, text[1:] ) ]

c = Counter( text )
c.update( bigrams )
c.most_common(10)

## Word embeddings

In [None]:
words = """
Aardvark
Pig
Penguin
Butterfly
Snake
Hound
Bullfrog
Elephant
Civet
Frog
Fish
Eagle
Cat
Parrot
Toad
Dog
Bee
Lizard
Agouti
Husky
Tuna
Albatross
Tortoise
Alpaca
Goat
Shrimp
Dolphin
Boa
Beetle
Alligator
Cockroach
Tick
Eel
Robin
Leopard
Shark
Caterpillar
Ferret
Hummingbird
Ant
Worm
Anteater
Antelope
Sheep
Ape
Wolf
Chihuahua
Moth
Cobra
Turtle
Fox
Hare
Scorpion
Rattlesnake
Tarantula
Catfish
Carp
Hornet
Bear
Caterpillar
Bug
Snail
Cod
Salmon
Sturgeon
Perch
Chicken
Python
Axolotl

Baboon
Camel
Badger
Eagle
Whale
Lynx
Python
Rat
Shark
Eel
Spider

Catfish
Cuckoo
Bumblebee
Owl
Swallow
Barnacle
Barracuda
Barramundi
Fish
Bass
Hound
Bat
Beagle
Bear
Collie
Dragon
Vulture
Beaver
Bee
wasp
Canary
Kingfisher
Sturgeon
Beluga
Sturgeon
Tiger
Squid 
Sheep
Bird
Bison
Warbler
Aphids
Lizard
Mamba
Rhinoceros
Caterpillar
Tarantula
Moth
Duck
Ferret
Rattlesnake
Tuna
Slug
Iguana
Goat
Bonobo
Orangutan
Elephant
Dolphin
Jellyfish
Turtle
Brontosaurus
Trout
Hyena
Griffon
Buffalo
Bulldog
Viper
Mouse
Wren
Condor
Camel
Lynx
Horse
Rat
Lion
Capuchin
Capybara
Caracal
Caribou
Parakeet
Carp
Ant
Cat
Caterpillar
Catfish
Centipede
Chameleon
Chamois
Cheetah
Chicken
Chihuahua
Chimpanzee
Chinchilla
Alligator
Cobra
Goose
Chipmunk
Crab
Cicada
Vulture
Skate
Cockroach
Coelacanth
Jellyfish
Raven
Eel
Hawk
Cow
"""

In [None]:
# One lower-case entry per animal; np.unique drops the duplicates and sorts
words = [ u.lower() for u in words.split() ]
words = np.unique( words )

In [None]:
# Pretrained word vectors, fetched through gensim's downloader
#vectors = gensim.downloader.load('glove-twitter-25')
vectors = gensim.downloader.load('glove-wiki-gigaword-300')

In [None]:
# One row per word (words absent from the vocabulary are skipped) ...
X = pd.DataFrame( { u: vectors[u] for u in words if u in vectors } ).T
# ... projected to 2 dimensions and standardised per axis
xy = UMAP().fit_transform(X)
xy = ( xy - xy.mean(axis=0) ) / xy.std(axis=0)

In [None]:
X.head(10)

In [None]:
# Map of the words: labels at their 2-D coordinates, with hand-drawn axes through the origin
xmin, ymin = xy.min(axis=0)
xmax, ymax = xy.max(axis=0)

fig, ax = plt.subplots( figsize = (8,4.5), layout = 'constrained', dpi = 100 )
# Invisible (white) points: they only set the extent of the plot
ax.scatter( xy[:,0], xy[:,1], color = 'white' )
texts = []
for i, label in enumerate( X.index ): 
    t = ax.text( xy[i,0], xy[i,1], label, va = 'center', ha = 'center', font = 'Bitter', weight = 'semibold' )
    texts.append( t )
ax.axis(False)
ax.axhline(0, linewidth = 1, color = 'grey')
ax.axvline(0, linewidth = 1, color = 'grey')

# Tick marks ('+' markers) every dx (resp. dy) along both axes, on each side of the origin
dx = dy = .25
x = 0
while x < xmax: 
    ax.scatter( [x], [0], marker = '+', color = 'grey', linewidth = 1 )
    x += dx
x = 0
while x > xmin: 
    ax.scatter( [x], [0], marker = '+', color = 'grey', linewidth = 1 )
    x -= dx
y = 0
while y < ymax: 
    ax.scatter( [0], [y], marker = '+', color = 'grey', linewidth = 1 )
    y += dy
y = 0
while y > ymin: 
    ax.scatter( [0], [y], marker = '+', color = 'grey', linewidth = 1 )
    y -= dy

# Move the labels apart to reduce overlaps
adjust_text(texts)
plt.show()

In [None]:
vectors['cat'][:10]

## Document embeddings

In [None]:
text = """
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, “and what is the use of a book,” thought Alice “without pictures or conversations?”
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so _very_ remarkable in that; nor did Alice think it so _very_ much out of the way to hear the Rabbit say to itself, “Oh dear! Oh dear! I shall be late!” (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually _took a watch out of its waistcoat-pocket_, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled “ORANGE MARMALADE”, but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody underneath, so managed to put it into one of the cupboards as she fell past it.
"""
sentence = """
Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, “and what is the use of a book,” thought Alice “without pictures or conversations?”
"""

In [None]:
# Sentence-embedding model.
# NOTE(review): `model` is rebound here (first to the name, then to the loaded model);
# it no longer refers to the timm image network created in the image-embedding cell.
from sentence_transformers import SentenceTransformer, util
model = "all-mpnet-base-v2"
model = SentenceTransformer(model)
# Embedding of the whole multi-paragraph excerpt, passed as one string
Z = model.encode( text )

In [None]:
# First 11 coordinates of the embedding, multiplied by 10, with 3 decimals
for u in Z[:11]:
    print( f"{10*u:0.3f}" )

In [None]:
# Book title -> Project Gutenberg e-book number (used to build the download URLs)
ids = {
    'The Great Gatsby':                 64317,
    "Alice's Adventures in Wonderland":    11,
    'Romeo and Juliet':                  1513,
    'A tale of two cities':                98,
    'Winnie the Pooh':                  67098,
}

def remove_legalese(text):
    """
    Strip the header and footer surrounding the body of a downloaded book.

    The body is what lies strictly between the first line starting with
    '*** START' and the first later line starting with '*** END'.
    A missing marker is tolerated: without a START line the text is kept
    from its beginning, without an END line it is kept to its end.

    Lines are split on "\\n" only, so any "\\r" at the end of a line is
    preserved in the result.
    """
    lines = text.split("\n")
    start = next(
        ( k for k, line in enumerate(lines) if line.startswith('*** START') ),
        -1,
    )
    # Only look for the END marker after the START marker
    end = next(
        ( k for k in range(start + 1, len(lines)) if lines[k].startswith('*** END') ),
        len(lines),
    )
    return '\n'.join( lines[start+1:end] )

# Download the books, strip the licence text and split them into sentences
raw_texts = {}
for title, book_id in ids.items():   # not `id`: that would shadow the builtin
    url = f'https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt'
    print( f"{book_id} {title} {url}" )
    # Fail early on a stalled connection or an HTTP error, instead of
    # storing an error page as if it were the book
    r = requests.get(url, timeout = 60)
    r.raise_for_status()
    raw_texts[ title ] = r.text

texts = { title: remove_legalese( text ) for title, text in raw_texts.items() }
sentences = { 
    title: nltk.sent_tokenize( text )
    for title, text in texts.items() 
}

In [None]:
# Disabled variant: encodes the sentences one at a time (with a progress bar)
if False: 
    # TOO SLOW?
    embeddings = { 
        title: [ 
            model.encode( sentence ) 
            for sentence in tqdm( novel )
        ]
        for title, novel in sentences.items()
    }

# One call per novel, on its whole list of sentences.
# NOTE(review): this rebinds `embeddings`, also used for the image embeddings.
embeddings = { title: model.encode( novel ) for title, novel in sentences.items() }

In [None]:
# One table for all the novels: one row per sentence, the embedding
# coordinates as columns, followed by a final 'title' column
frames = [
    pd.DataFrame(e).assign( title = title )
    for title, e in embeddings.items()
]
d = pd.concat( frames ).reset_index( drop = True )

In [None]:
xy = UMAP().fit_transform( d.iloc[:,:-1].values )

In [None]:
# Candidate palettes (colour lists defined earlier in the notebook), by name.
# NOTE(review): the name `all` shadows the Python builtin all(); the two following cells rebind it.
all = {
        "open gray":   colours_open_gray,
        "open red":    colours_open_red,
        "open pink":   colours_open_pink,
        "open grape":  colours_open_grape,
        "open violet": colours_open_violet,
        "open indigo": colours_open_indigo,
        "open blue":   colours_open_blue,
        "open cyan":   colours_open_cyan,
        "open teal":   colours_open_teal,
        "open green":  colours_open_green,
        "open lime":   colours_open_lime,
        "open yellow": colours_open_yellow,
        "open orange": colours_open_orange,

        "material red":       colours_material_red,
        "material pink":      colours_material_pink,
        "material purple":    colours_material_purple,
        "material d_purple":  colours_material_d_purple,
        "material indigo":    colours_material_indigo,
        "material blue":      colours_material_blue,
        "material l_blue":    colours_material_l_blue,
        "material cyan":      colours_material_cyan,
        "material teal":      colours_material_teal,
        "material green":     colours_material_green,
        "material l_green":   colours_material_l_green,
        "material lime":      colours_material_lime,
        "material yellow":    colours_material_yellow,
        "material amber":     colours_material_amber,
        "material orange":    colours_material_orange,
        "material d_orange":  colours_material_d_orange,
        "material brown":     colours_material_brown,
        "material grey":      colours_material_grey,
        "material blue_grey": colours_material_blue_grey,
    }

In [None]:
# Alternative choice: qualitative palettes, all given as colour lists
# (matplotlib colormaps are expanded through their .colors attribute).
# NOTE(review): the name `all` shadows the Python builtin all().
all = {
        "colours_db":            colours_db,
        "colours_quintile":      colours_quintile,
        'Pastel1':               plt.get_cmap('Pastel1').colors,
        'Pastel2':               plt.get_cmap('Pastel2').colors,
        'Paired':                plt.get_cmap('Paired').colors,
        'Accent':                plt.get_cmap('Accent').colors,
        'Dark2':                 plt.get_cmap('Dark2').colors,
        'Set1':                  plt.get_cmap('Set1').colors,
        'Set2':                  plt.get_cmap('Set2').colors,
        'Set3':                  plt.get_cmap('Set3').colors,
        'tab10':                 plt.get_cmap('tab10').colors,
        'tab20':                 plt.get_cmap('tab20').colors,
        'tab20b':                plt.get_cmap('tab20b').colors,
        'tab20c':                plt.get_cmap('tab20c').colors,
        "colours_kelly":         colours_kelly,
        "colours_glasbey":       colours_glasbey,
        "colors_green_armytage": colors_green_armytage,
        "colours_polychrome_36": colours_polychrome_36,
        "colours_alphabet":      colours_alphabet,
        "colours_dark24":        colours_dark24,
        "colours_light24":       colours_light24,
        "colours_sky":           colours_sky,
}

In [None]:
# Short list of palettes actually plotted by the next cell.
# NOTE(review): the name `all` shadows the Python builtin all().
all = {
        "colours_quintile":      colours_quintile,
        'Set1':                  plt.get_cmap('Set1').colors,
        'tab10':                 plt.get_cmap('tab10').colors,
}

In [None]:
# Sentence embeddings of the novels, one figure per candidate palette
for name, colours in all.items():
    print( name )

    fig, ax = plt.subplots()
    titles = d['title'].unique()
    for j in range( len(titles) ):
        title = titles[j]
        i = d['title'] == title
        ax.scatter(
            xy[i,0], xy[i,1],
            label = title, s = 5, alpha = .5,
            color = colours[j],
        )
    # Zoom on the central part of the cloud: drop 4% of the range on each side
    alpha = .04
    m = np.quantile(xy, axis=0, q = [alpha,1-alpha])
    (x_low, y_low), (x_high, y_high) = m
    ax.set_xlim( x_low, x_high )
    ax.set_ylim( y_low, y_high )
    ax.axis('off')
    legend = ax.legend()

    # Legend markers: opaque and larger than the plotted points
    for a in legend.legend_handles:
        a.set_alpha(1)
        a.set_sizes([50])

    plt.show()

In [None]:
# Same plot, highlighting one novel at a time (the first figure highlights none)
colours = colours_quintile
for highlight in [None] + list( d['title'].unique() ):
    fig, ax = plt.subplots( figsize = (8,4.5), layout = 'constrained', dpi = 100 )
    for j, title in enumerate( d['title'].unique() ):
        i = d['title'] == title
        # The highlighted novel is drawn larger and above the others
        if title == highlight:
            size, layer = 50, 1
        else:
            size, layer = 2, -1
        ax.scatter( 
            xy[i,0], xy[i,1], 
            label = title, 
            s = size, 
            zorder = layer,
            alpha = .5, 
            color = colours[j],
        )
    # Zoom on the central part of the cloud
    alpha = .04
    m = np.quantile(xy, axis=0, q = [alpha,1-alpha])
    (x_low, y_low), (x_high, y_high) = m
    ax.set_xlim( x_low, x_high )
    ax.set_ylim( y_low, y_high )
    ax.axis('off')
    legend = ax.legend()

    # Legend markers: opaque, and the same size for every novel
    for a in legend.legend_handles:
        a.set_alpha(1)
        a.set_sizes([50])
    plt.show()

In [None]:
# Relative frequency of each distinct word, for each novel
words = {}
c = {}
for title, novel in texts.items():
    # Lower case, letters a-z only
    tokens = pd.Series( re.sub( r'[^a-z]', ' ', novel.lower() ).split() )
    occurrences = np.array( list( Counter(tokens).values() ) )
    words[title] = tokens
    c[title] = occurrences / occurrences.sum()

In [None]:
# Zipf's law: word frequency against frequency rank, on log-log axes
fig, ax = plt.subplots( figsize = (4,4), layout = 'constrained', dpi = 100)
for k, v in c.items(): 
    frequencies = sorted(v, reverse=True)
    # Ranks start at 1: a rank of 0 cannot be placed on a logarithmic axis,
    # so the most frequent word of each novel would be missing
    ranks = np.arange( 1, len(frequencies) + 1 )
    ax.scatter( ranks, frequencies, label = k )
ax.set_xscale('log')
ax.set_yscale('log')
ax.legend( fontsize = 8.15, loc ='lower left' )
ax.set_xlabel( "Words" )
ax.set_ylabel( "Frequency" )
plt.show()

# Feature engineering

## Data augmentation

In [None]:
# Pair plot of a Gaussian variable and its square and cube
np.random.seed(0)
d = pd.DataFrame( { 'x': np.random.normal( size = 513 ) } )
d = d.assign( x2 = d['x'] ** 2, x3 = d['x'] ** 3 )

k = d.shape[1]
fig, axs = plt.subplots(k, k, figsize=(5,5), layout='constrained', dpi = 100)
for i, id1 in enumerate( d.columns ):
    for j, id2 in enumerate( d.columns ):
        ax = axs[i,j]
        if i != j:
            # Off-diagonal: scatter plot of the row variable against the column variable
            ax.scatter( d[id2], d[id1], alpha = .3 )
            ax.axis('off')
            continue
        # Diagonal: histogram, with the variable name in the middle
        ax.hist( d[id1], density = True, bins=20, facecolor='lightblue', edgecolor='tab:blue' )
        t = ax.text(
            .5, .5, id1,
            horizontalalignment='center', verticalalignment='center',
            transform=ax.transAxes,
        )
        # White outline, so that the name stays readable over the bars
        t.set_path_effects( [path_effects.Stroke(linewidth=3, foreground="white"), path_effects.Normal()] )
        for side in ['left', 'right', 'top']:
            ax.spines[side].set_visible(False)
        ax.set_yticks([])
plt.show()

In [None]:
## Polynomials

# Polynomial features of x on a regular grid; the first 5 columns are plotted.
# NOTE(review): with the default bias column the panels are presumably
# 1, x, ..., x^4, and the degree-5 column is never shown -- TODO confirm.
xs = np.linspace( -2.5, 2.5, 100 )
basis = PolynomialFeatures( degree = 5 )
ys = basis.fit_transform( pd.DataFrame( { 'x': xs } ) )
fig, axs = plt.subplots( 1, 5, figsize = (10,2), layout = 'constrained', dpi = 100 )
for i, ax in enumerate( axs[:5] ):
    ax.plot( xs, ys[:,i], linewidth = 5 )
    ax.axis('off')
plt.show()

In [None]:
## Splines

# Spline basis with knots at -2..2: each panel highlights one of the first 5
# basis functions, over a faint drawing of those same 5 functions.
# NOTE(review): the transformer may return more than 5 columns; only 5 are shown.
xs = np.linspace( -2.5, 2.5, 100 )
knots = np.reshape( [-2,-1,0,1,2], (-1, 1) )
ys = SplineTransformer( knots = knots ).fit_transform( pd.DataFrame( { 'x': xs } ) )
fig, axs = plt.subplots( 1, 5, figsize = (10,2), layout = 'constrained', dpi = 100 )
for i, ax in enumerate( axs[:5] ):
    for j in range(5):
        ax.plot( xs, ys[:,j], linewidth = 1, linestyle = ':', color = 'grey' )
    ax.plot( xs, ys[:,i], linewidth = 5 )
    ax.axis('off')
plt.show()

In [None]:
## Splines

# Same spline basis; this time each panel shows the knot positions as dotted
# vertical lines behind the highlighted basis function
xs = np.linspace( -2.5, 2.5, 100 )
knot_positions = [-2,-1,0,1,2]
ys = SplineTransformer(
    knots = np.reshape( knot_positions, (-1, 1) )
).fit_transform( pd.DataFrame( { 'x': xs } ) )
fig, axs = plt.subplots( 1, 5, figsize = (10,2), layout = 'constrained' )
for i, ax in enumerate( axs[:5] ):
    for k in knot_positions:
        ax.axvline( k, linewidth = 1, linestyle = ':', color = 'grey' )
    ax.plot( xs, ys[:,i], linewidth = 5 )
    ax.axis('off')
plt.show()

In [None]:
## Radial basis functions (RBF)

# Gaussian bumps (standard deviation .8) centred at -2, -1, 0, 1, 2
xs = np.linspace( -2.5, 2.5, 100 )
centres = [-2,-1,0,1,2]
bumps = [ scipy.stats.distributions.norm(loc = i, scale = .8).pdf(xs) for i in centres ]
ys = np.vstack( bumps ).T

fig, axs = plt.subplots( 1, 5, figsize = (10,2), layout = 'constrained', dpi = 100 )
for i, ax in enumerate( axs[:5] ):
    # All the bumps, faintly, then the highlighted one
    for j in range(5):
        ax.plot( xs, ys[:,j], linewidth = 1, linestyle = ':', color = 'grey' )
    ax.plot( xs, ys[:,i], linewidth = 5 )
    ax.axis('off')
plt.show()


## Scale

In [None]:
# Reproducible sample: 513 Gaussian values with mean 200 and standard deviation 200
np.random.seed(0)
d = pd.DataFrame( { 'x': 200 + 200 * np.random.normal( size = 513 ) } )

In [None]:
# Scale 

# Histogram of the raw data, without a y axis
fig, ax = plt.subplots( figsize = (5,3), layout = 'constrained', dpi = 100 )
ax.hist( d['x'], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Raw data" )
for side in ['left', 'top', 'right']:
    ax.spines[side].set_visible(False)
ax.set_yticks([])
ax.set_ylabel(None)    
plt.show()

In [None]:
## Quantile transformation

d['y'] = QuantileTransformer().fit_transform( d[['x']] )

# Two histograms side by side, separated by an empty middle column
fig = plt.figure( figsize = (8, 2.5), layout = 'constrained', dpi = 100 )
gs = fig.add_gridspec( 1, 3, width_ratios = (2.5,1,2.5) )
axs = [ fig.add_subplot( gs[0,column] ) for column in (0, 2) ]

panels = [ ('x', "Raw data"), ('y', "Transformed data") ]
for ax, (variable, caption) in zip( axs, panels ):
    ax.hist( d[variable], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
    ax.set_xlabel( caption )
    # Only keep the bottom axis
    for side in ['left', 'top', 'right']:
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_ylabel(None)
plt.show()

In [None]:
## Quantile transformation

d['y'] = QuantileTransformer().fit_transform( d[['x']] )

# Same comparison, with the two histograms in adjacent panels
fig, axs = plt.subplots( 1, 2, figsize = (8,2.5), layout = 'constrained', dpi = 100 )
panels = [ ('x', "Raw data"), ('y', "Transformed data") ]
for ax, (variable, caption) in zip( axs, panels ):
    ax.hist( d[variable], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
    ax.set_xlabel( caption )
    # Only keep the bottom axis
    for side in ['left', 'top', 'right']:
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_ylabel(None)
plt.show()

In [None]:
## Quantile transformation

# Coarse version: only 5 quantiles are used to estimate the distribution
d['y'] = QuantileTransformer( n_quantiles = 5 ).fit_transform( d[['x']] )

fig = plt.figure( figsize = (8, 2.5), layout = 'constrained', dpi = 100 )
gs = fig.add_gridspec( 1, 3, width_ratios = (2.5,1,2.5) )
axs = [ fig.add_subplot( gs[0,column] ) for column in (0, 2) ]

# The transformed data use finer bins (100 instead of 20)
panels = [ ('x', 20, "Raw data"), ('y', 100, "Transformed data") ]
for ax, (variable, n_bins, caption) in zip( axs, panels ):
    ax.hist( d[variable], bins=n_bins, facecolor='lightblue', edgecolor='tab:blue', density=True)
    ax.set_xlabel( caption )
    # Only keep the bottom axis
    for side in ['left', 'top', 'right']:
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_ylabel(None)
plt.show()

In [None]:
k = 15
a = d['x'].max() / k
b = np.floor( d['x'].min() / a ) * a
bins = list( np.linspace( b, 0, 1+int(round(-b/a)) ) ) + list( np.linspace( 0, d['x'].max(), k+1 )[1:] )


In [None]:
# Binning: sign

# Work on the column (a Series), not on a one-column DataFrame
d['y'] = np.sign( d['x'] )

# Histogram bins of width `a`, aligned so that 0 is a bin edge: every bar then lies
# entirely on one side of zero. Assumes the data has both negative and positive values.
k = 15
a = d['x'].max() / k
b = np.floor( d['x'].min() / a ) * a
bins = list( np.linspace( b, 0, 1+int(round(-b/a)) ) ) + list( np.linspace( 0, d['x'].max(), k+1 )[1:] )

# Number of observations of each sign (identical for both figures)
c = Counter( d['y'] )

# Two versions of the same figure: plain (0), then with the negative side highlighted (1)
for which in [0,1]: 

    fig = plt.figure( figsize = (8, 2.5), layout = 'constrained', dpi = 100 )
    gs = fig.add_gridspec( 1, 3, width_ratios = (2.5,1,2.5) )
    axs = [ 
        fig.add_subplot( gs[0,0] ),
        fig.add_subplot( gs[0,2] ),
    ]

    ax = axs[0]
    _, _, patches = ax.hist( d['x'], bins=bins, facecolor='lightblue', edgecolor='tab:blue', density=True)
    if which == 1:
        # A bar is negative when its left edge is: 0 is a bin edge, so no bar straddles it
        for patch, left_edge in zip( patches, bins ):
            if left_edge < 0:
                patch.set_facecolor('pink')
                patch.set_edgecolor('tab:red')
    ax.set_xlabel( "Raw data" )

    ax = axs[1]
    bars = ax.bar( [-1,1], [ c[-1], c[1] ] ) 
    if which == 1:
        bars[0].set_color('tab:red')
    ax.set_xticks( [-1,1], ["-", "+"] )
    ax.set_xlim( -2,2)
    ax.set_xlabel( "Transformed data" )

    for ax in axs: 
        for side in ['left', 'top', 'right']:
            ax.spines[side].set_visible(False)
        ax.set_yticks([])
        ax.set_ylabel(None)    
    plt.show()

In [None]:
colours

In [None]:
# Binning: quintiles

d['y'] = fractiles( d['x'] )

k = 5
p = np.linspace(0,1,k+1)
q = np.nanquantile(d['x'], p)
bins = np.linspace( d['x'].min(), d['x'].max(), 31 )
colours0 = colours_quintile[:-1]
# Colour of each histogram bar, from the quintile containing its left edge.
# bisect_left returns 0 only for the very first edge (the minimum), which belongs to the
# first quintile; hence the repeated first colour.
# The result is stored under its own name: overwriting the global `colours` would make
# the cell impossible to re-run and would clobber the palette for the rest of the notebook.
bar_colours = np.array( [colours0[0]] + colours0 )[[ bisect_left(q,u) for u in bins ]]

# Number of observations in each quintile (identical for both figures)
counts = Counter( d['y'] )
quintiles = list( range(1, k+1) )

# Two versions of the same figure: plain (0), then coloured by quintile (1)
for which in [0,1]: 

    fig = plt.figure( figsize = (8, 2.5), layout = 'constrained', dpi = 100 )
    gs = fig.add_gridspec( 1, 3, width_ratios = (2.5,1,2.5) )
    axs = [ 
        fig.add_subplot( gs[0,0] ),
        fig.add_subplot( gs[0,2] ),
    ]

    ax = axs[0]
    _, _, patches = ax.hist( d['x'], bins=bins, facecolor='lightblue', edgecolor='tab:blue', density=True)
    if which == 1:
        for patch, colour in zip( patches, bar_colours ):
            patch.set_facecolor( colour )
            patch.set_alpha(.6)
            patch.set_edgecolor('black')
    ax.set_xlabel( "Raw data" )

    ax = axs[1]
    bars = ax.bar( quintiles, [ counts[i] for i in quintiles ] )
    if which == 1:
        for bar, colour in zip( bars, colours0 ):
            bar.set_color( colour )
            bar.set_alpha(.7)
    ax.set_xticks( quintiles )
    ax.set_xlim( 0, k+1 )
    ax.set_xlabel( "Transformed data" )

    for ax in axs: 
        for side in ['left', 'top', 'right']:
            ax.spines[side].set_visible(False)
        ax.set_yticks([])
        ax.set_ylabel(None)    
    plt.show()

In [None]:
# Min-max scaling

d['y'] = ( d['x'] - d['x'].min() ) / ( d['x'].max() - d['x'].min() )

#fig, axs = plt.subplots( 1, 2, figsize = (8,2.5), layout = 'constrained', dpi = 100 )
fig = plt.figure( figsize = (8, 2.5), layout = 'constrained', dpi = 100 )
gs = fig.add_gridspec( 1, 3, width_ratios = (2.5,1,2.5) )
axs = [ 
    fig.add_subplot( gs[0,0] ),
    fig.add_subplot( gs[0,2] ),
]

ax = axs[0]
ax.hist( d['x'], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Raw data" )

ax = axs[1]
ax.hist( d['y'], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Transformed data" )

for ax in axs: 
    for side in ['left', 'top', 'right']:
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_ylabel(None)    
plt.show()

In [None]:
# Standardization

d['y'] = ( d['x'] - d['x'].mean() ) / d['x'].std() 

#fig, axs = plt.subplots( 1, 2, figsize = (8,2.5), layout = 'constrained', dpi = 100 )
fig = plt.figure( figsize = (8, 2.5), layout = 'constrained', dpi = 100 )
gs = fig.add_gridspec( 1, 3, width_ratios = (2.5,1,2.5) )
axs = [ 
    fig.add_subplot( gs[0,0] ),
    fig.add_subplot( gs[0,2] ),
]

ax = axs[0]
ax.hist( d['x'], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Raw data" )

ax = axs[1]
ax.hist( d['y'], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Transformed data" )

for ax in axs: 
    for side in ['left', 'top', 'right']:
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_ylabel(None)    
plt.show()

## Skewed distributions

In [None]:
# Skewed distribution

np.random.seed(0)
d = pd.DataFrame( { 'x': np.exp( 1 * np.random.normal( size = 513 ) ) } )

In [None]:
# Skewed distribution

fig, ax = plt.subplots( figsize = (5,3), layout = 'constrained', dpi = 100 )
ax.hist( d['x'], bins=50, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Raw data" )
for side in ['left', 'top', 'right']:
    ax.spines[side].set_visible(False)
ax.set_yticks([])
ax.set_ylabel(None)    
plt.show()

In [None]:
d['y'] = np.log( d['x'] )

#fig, axs = plt.subplots( 1, 2, figsize = (8,2.5), layout = 'constrained', dpi = 100 )
fig = plt.figure( figsize = (8, 2.5), layout = 'constrained', dpi = 100 )
gs = fig.add_gridspec( 1, 3, width_ratios = (2.5,1,2.5) )
axs = [ 
    fig.add_subplot( gs[0,0] ),
    fig.add_subplot( gs[0,2] ),
]

ax = axs[0]
ax.hist( d['x'], bins=50, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Raw data" )

ax = axs[1]
ax.hist( d['y'], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Transformed data" )

for ax in axs: 
    for side in ['left', 'top', 'right']:
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_ylabel(None)    
plt.show()

In [None]:
d['y'] = PowerTransformer( method='yeo-johnson' ).fit_transform( d[['x']] )
d['y'] = PowerTransformer( method = 'box-cox' ).fit_transform( d[['x']] )

## Outliers

In [None]:
# Outliers: standard normal data whose first 10 observations are replaced
# with draws that are ten times more dispersed

np.random.seed(0)
d = pd.DataFrame( { 'x': np.random.normal( size = 513 ) } )
# Assign through the DataFrame itself: the chained form `d['x'].iloc[...] = ...` writes
# to a temporary object under pandas copy-on-write and leaves `d` unchanged.
d.loc[ d.index[:10], 'x' ] = 10 * np.random.normal( size = 10 )

In [None]:
# Outliers

fig, ax = plt.subplots( figsize = (5,3), layout = 'constrained', dpi = 100 )
ax.hist( d['x'], bins=50, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Raw data" )
for side in ['left', 'top', 'right']:
    ax.spines[side].set_visible(False)
ax.set_yticks([])
ax.set_ylabel(None)    
plt.show()

In [None]:
# Winsorization

lower, upper = d['x'].quantile([.02, .98])
d['y'] = np.clip( d['x'], lower, upper )

#fig, axs = plt.subplots( 1, 2, figsize = (8,2.5), layout = 'constrained', dpi = 100 )
fig = plt.figure( figsize = (8, 2.5), layout = 'constrained', dpi = 100 )
gs = fig.add_gridspec( 1, 3, width_ratios = (2.5,1,2.5) )
axs = [ 
    fig.add_subplot( gs[0,0] ),
    fig.add_subplot( gs[0,2] ),
]

ax = axs[0]
ax.hist( d['x'], bins=50, facecolor='lightblue', edgecolor='tab:blue', density=True)
ax.set_xlabel( "Raw data" )

ax = axs[1]
h = ax.hist( d['y'], bins=20, facecolor='lightblue', edgecolor='tab:blue', density=True)
for a in [ h[2][0], h[2][-1] ]:
    a.set_facecolor('tab:orange')
    a.set_edgecolor('tab:red')
ax.set_xlabel( "Transformed data" )

for ax in axs: 
    for side in ['left', 'top', 'right']:
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_ylabel(None)    
plt.show()

## Missing values

In [None]:
# Turn the first 10 observations into missing values.
# Assign through the DataFrame: the chained form `d['x'].iloc[...] = ...` is a no-op
# under pandas copy-on-write.
d.loc[ d.index[:10], 'x' ] = np.nan

# Several ways of dealing with them: nearest-neighbour imputation (using the other
# columns of `d`), a missingness indicator, and mean / median imputation.
# The imputed values are re-attached with d's own index so that rows stay aligned.
imputed = pd.DataFrame( KNNImputer().fit_transform(d), columns = d.columns, index = d.index )
d['x_knn'] = imputed['x']
d['x_missing'] = pd.isnull( d['x'] )
d['x_mean'   ] = d['x'].fillna( d['x'].mean() )
d['x_median' ] = d['x'].fillna( d['x'].median() )

## Box-Cox and Yeo-Johnson transformations

$$ f_\lambda(y) = \begin{cases}
\dfrac{ y^\lambda - 1 }{ \lambda } & \text{if } \lambda \neq 0 \\
\log y & \text{if } \lambda = 0
\end{cases} $$

$$ f_\lambda(y) = \begin{cases}
\dfrac{ (y+1)^\lambda - 1 }{ \lambda } & \text{if } \lambda \neq 0, \ y \geq 0 \\
\log (y+1) & \text{if } \lambda = 0, \ y \geq 0 \\
- \dfrac{ (1-y)^{2-\lambda} - 1 }{ 2 - \lambda } & \text{if } \lambda \neq 2, \ y < 0 \\
- \log(1-y) & \text{if } \lambda = 2 , \ y < 0
\end{cases} $$

In [None]:
def box_cox(y, la): 
    """Box-Cox transformation of `y` with exponent `la`.

    Returns (y**la - 1) / la, or log(y) in the limiting case la == 0.
    `y` may be a number or a NumPy array (the computation is elementwise).
    """
    if la != 0:
        return ( y ** la - 1 ) / la
    # Limit of the formula above as la tends to 0
    return np.log(y)

def yeo_johnson(y, la): 
    """Yeo-Johnson transformation of `y` with parameter `la`.

    For y >= 0: ((y+1)**la - 1) / la,                or log(y+1)  if la == 0.
    For y <  0: -((1-y)**(2-la) - 1) / (2 - la),     or -log(1-y) if la == 2.

    `y` may be a number or an array of any numeric type; it is converted to a float
    array, and an array of the same shape is returned.
    """
    # Floats throughout: integer arrays cannot be raised to negative integer powers,
    # and plain Python numbers would raise instead of producing inf/nan
    y = np.asarray( y, dtype = float )
    with warnings.catch_warnings():
        # Each formula is evaluated on all of `y` and np.where then keeps the relevant
        # side; the values computed on the "wrong" side may be invalid (log or fractional
        # power of a negative number) and are discarded, so silence those warnings
        warnings.simplefilter("ignore")
        if la == 0:
            positive = np.log( y + 1 )
        else:
            positive = ( (y+1) ** la - 1 ) / la
        if la == 2:
            negative = - np.log( 1 - y )
        else:
            negative = - ( (1-y) ** (2-la) - 1 ) / ( 2 - la )
    return np.where( y >= 0, positive, negative )

# Box-Cox (left) and Yeo-Johnson (right) transformations, for a few values of the parameter
fig, axs = plt.subplots(1, 2, figsize = (8,3), layout = 'constrained', dpi = 100)

ax = axs[0]
xs = np.linspace(0,3,100)
# The grid starts at 0, where the logarithm (parameter 0) is -inf: expected, not worth a warning
with np.errstate( divide = 'ignore' ):
    for i in [0,1,2,3]:
        ax.plot( xs, box_cox(xs, i), label=i )
ax.set_ylim(-5,5)
ax.set_title( "Box-Cox transformations" )

ax = axs[1]
xs = np.linspace(-3,3,100)
for i in [3,2,1,0,-1,-2]:
    ax.plot( xs, yeo_johnson(xs, i), label=i )
ax.set_ylim(-5,5)
ax.set_title( "Yeo-Johnson transformations" )

# Thick, square-ended legend keys, easier to read than the default thin lines
for ax in axs: 
    leg = ax.legend()
    for handle in leg.legend_handles:
        handle.set_linewidth(7)
        handle.set_solid_capstyle('butt')
plt.show()

## Qualitative data

In [None]:
# Ordinal encoding of a Likert scale: by hand (y, from 1 to 5) and with OrdinalEncoder (z, from 0 to 4)
# The levels are listed once, in increasing order, and reused everywhere
x = ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly agree']
np.random.seed(0)
X = pd.DataFrame( { 'x': np.random.choice( x, size = 20 ) } )

a = dict( zip( x, 1 + np.arange( len(x) ) ) )
X['y'] = X['x'].map(a)

# Passing the levels explicitly fixes their order (the default is alphabetical)
model = OrdinalEncoder( categories = [x] )
X['z'] = model.fit_transform( X[['x']] ).ravel()

In [None]:
# Encode a two-level (boolean-like) variable as 0/1
x = ['False', 'True']
np.random.seed(0)
X = pd.DataFrame( { 'x': np.random.choice( x, size = 20 ) } )
# One code per label, in the order of `x`
a = { label: code for code, label in enumerate(x) }
# Replace the whole column rather than writing integers into the string column in place
X['x'] = X['x'].map(a)
X

In [None]:
d = load_penguins().copy()

In [None]:
d = load_penguins().copy()
d['x'] = d['species']
LabelEncoder().fit_transform( d['x'] )

In [None]:
# One-hot encoding of d['x']: one indicator column per category
encoder = OneHotEncoder()
X = encoder.fit_transform( d[['x']] )
# categories_ has one array per encoded column: take the (only) one, so that the
# result has plain column labels rather than a one-level MultiIndex
X = pd.DataFrame( X.toarray(), columns = encoder.categories_[0] )

In [None]:
np.random.seed(1)
dd = d.sample(10).reset_index(drop=True)[['species']]
display(dd)
model = OneHotEncoder()
#model = LabelEncoder()
dd = model.fit_transform( dd )
dd = pd.DataFrame( dd.todense().astype(int), columns = model.categories_[0] )
display(dd)
#OrdinalEncoder
#TargetEncoder

In [None]:
# Label encoding of the species of 10 randomly chosen rows
np.random.seed(1)
# LabelEncoder expects a 1-dimensional input: select the column as a Series
dd = d.sample(10).reset_index(drop=True)['species']
model = LabelEncoder()
codes = model.fit_transform( dd )
# Name the column explicitly instead of relying on 'species' being the first column of d
dd = pd.DataFrame( { 'species': codes } )
display(dd)

## Time series

In [None]:
d = data('AirPassengers').copy()
d.tail()

In [None]:
# AirPassengers, followed by 12 empty months (the year 1961), indexed by month-end dates
observed = data('AirPassengers').copy()
future = pd.DataFrame( { 
    'time': 1961 + np.linspace(0,1,13)[:-1],
    'AirPassengers': np.nan
} )
d = pd.concat( [ observed, future ] ).reset_index(drop=True)

# 'time' is a fractional year: split it into year and month (1 to 12)
d['year'] = np.floor( d['time'] ).astype(int)
d['month'] = np.round( ( d['time'] - d['year'] ) * 12  ).astype(int)+ 1

# Last day of each month: start from the first day and roll forward to the month end
first_day = pd.to_datetime( pd.DataFrame( { 'year': d['year'], 'month': d['month'], 'day': 1 } ) )
d['date'] = first_day + pd.offsets.MonthEnd(0)
d = d[['date', 'AirPassengers']].copy()

In [None]:
# Rolling mean and standard deviation over 12 observations; they are the same for every
# figure, so they are computed once, outside the loop
m = d['AirPassengers'].rolling(12).mean()
s = d['AirPassengers'].rolling(12).std()
# Common vertical range: the mean ± 2 std band, widened by 4% of its log-range on each side
a, b = (m-2*s).min(), (m+2*s).max()
c = np.exp( .04 * np.log(b/a) )
a, b = a/c, b*c

# One figure per feature: raw series (0), lagged series (1), moving average (2),
# moving average with dotted ± 2 std lines (3) or with a shaded ± 2 std band (4)
for which in [0,1,2,3,4]:

    fig, ax = plt.subplots( figsize = (4,4), dpi = 100 )
    ax.plot( d['date'], d['AirPassengers'], label = "Current" )
    if which == 1:
        ax.plot( d['date'], d['AirPassengers'].shift(12), label = "Last year" )
    if which in [2,3,4]: 
        ax.plot( d['date'], m, linewidth = 5, label = "Moving average" if which == 2 else None )
    if which == 3:
        # The lines are drawn at two standard deviations: say so in the legend
        ax.plot( d['date'], m+2*s, color = 'black', linestyle = ':', linewidth = 3, label = "MA±2 std" )
        ax.plot( d['date'], m-2*s, color = 'black', linestyle = ':', linewidth = 3 )
    if which == 4:
        ax.fill_between( d['date'], m-2*s, m+2*s, color = 'lightblue' )
    ax.set_xlim( d['date'].iloc[0], d['date'].iloc[-1] )
    ax.set_yscale('log')
    # Helper defined elsewhere in the notebook; presumably switches the tick labels to plain numbers
    remove_scientific_notation_from_vertical_axis(ax)
    if which in [0,1,2,3]: 
        leg = ax.legend()
        for handle in leg.legend_handles:
            handle.set_linewidth( 7 if which in [0,1,2] else 3 )
            handle.set_solid_capstyle('butt')
    ax.set_ylabel( "Air Passengers (thousands)" )
    ax.set_ylim(a,b)    
    plt.show()

## Date and time

In [None]:
tmp = pd.DataFrame( { 
    'year':  d['date'].dt.year,
    'month': d['date'].dt.month,
    'day':   d['date'].dt.day,
    #d['date'].dt.hour
    #d['date'].dt.minute
    #d['date'].dt.second
    'day_name': d['date'].dt.day_name(),
    'weekend': np.isin( d['date'].dt.day_name(), ['Sunday', 'Saturday'] ),
} )
dir( d['date'].dt )
tmp


# Feature Selection

In [None]:
!wget -nc https://github.com/INRIA/scikit-learn-mooc/raw/main/datasets/ames_housing_no_missing.csv
d = pd.read_csv( "ames_housing_no_missing.csv" )

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
X_new = SelectKBest(f_classif, k=2).fit_transform(X, y)


In [None]:
fig, ax = plt.subplots()
ax.scatter( d['LotArea'], d['SalePrice'], alpha = .5 )
ax.set_xlim( 0, 20_000) 
ax.set_ylim( 0, 400_000 )
ax.set_xlabel( "Lot Area (sq.ft.)" )
ax.set_ylabel( "Sale price (USD)" )
plt.show()

In [None]:
from uuid import uuid4  # Random id

In [None]:
d = load_penguins()
d.shape

In [None]:
from gapminder import gapminder
gapminder.shape

In [None]:
d = load_penguins()
tmp = d.copy()
tmp['animal'] = 'penguin'
tmp[ ['animal'] + list( tmp.columns )[:-1] ]

d = load_penguins()
tmp = d.copy()
tmp['id'] = [ str( uuid4() )[:5] for _ in range(d.shape[0]) ]
tmp[ ['id'] + list( tmp.columns )[:-1] ]

d = load_penguins()
tmp = d.copy()
tmp['one'] = 1.
tmp[ ['one'] + list( tmp.columns )[:-1] ]

# A numeric column that is constant except for a single value
d = load_penguins()
tmp = d.copy()
tmp['n'] = 1.
# Assign through the DataFrame: the chained form `tmp['n'].iloc[-2] = ...` is a no-op
# under pandas copy-on-write
tmp.loc[ tmp.index[-2], 'n' ] = 2.
# Show the new column first
tmp[ ['n'] + list( tmp.columns )[:-1] ]

In [None]:
X, y = sklearn.datasets.fetch_california_housing( return_X_y = True, as_frame = True )             # Numeric data only
X = X.copy()
X = X.round(2)
X['HouseAge'] = X['HouseAge'].astype(int)
X['Population'] = X['Population'].astype(int)
X

In [None]:
X, y = sklearn.datasets.fetch_california_housing( return_X_y = True, as_frame = True )             # Numeric data only
X = X.copy()
for i in [
    'MedInc', 
    #'HouseAge', 
    'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
    #'Latitude', 'Longitude',
]:
    X[i] = np.log(X[i])
# This is not that good an example: the correlation is not that high. 
# Increase it, to have a more striking example.
X['AveBedrms'] += 2 * X['AveRooms']
sns.pairplot(X)

In [None]:
def my_pair(d): 
    """Scatterplot matrix of the (numeric) columns of the DataFrame `d`.

    Diagonal panels: histogram of the column, captioned with its name.
    Off-diagonal panels: scatterplot of the row variable against the column variable,
    captioned with their correlation, with a LOWESS curve on top.
    The figure is displayed; nothing is returned.
    """
    k = d.shape[1]
    # squeeze=False keeps `axs` two-dimensional, even when there is a single column
    fig, axs = plt.subplots(k, k, figsize=(9,9), layout='constrained', squeeze=False)

    def caption(ax, text):
        # Text at the top of the panel, with a white halo so it stays readable over the data
        t = ax.text(.5, .99, text, horizontalalignment='center', verticalalignment='top', transform=ax.transAxes)
        t.set_path_effects( [path_effects.Stroke(linewidth=3, foreground="white", alpha = .7), path_effects.Normal()] )

    for i, id1 in enumerate( d.columns ):
        for j, id2 in enumerate( d.columns ):
            ax = axs[i,j]
            if i == j: 
                ax.hist( d[id1], bins=20, density = True )
                caption( ax, id1 )
                for side in ['left', 'right', 'top']:
                    ax.spines[side].set_visible(False)
                ax.set_yticks([])
            else:
                ax.scatter( d[id2], d[id1], alpha = .01, s = 10 )
                caption( ax, f"{np.corrcoef( d[id1], d[id2] )[0,1]:.2f}" )
                # Smoothed d[id1] as a function of d[id2]; columns of the result: x, fitted value
                ys = statsmodels.nonparametric.smoothers_lowess.lowess(d[id1], d[id2], frac=.5)
                ax.plot( ys[:,0], ys[:,1], color = 'orange', linewidth = 3 )
                ax.axis('off')
    plt.show()    
my_pair(X)  # SLOW!

##  Cluster the features

In [None]:
!wget -nc https://github.com/INRIA/scikit-learn-mooc/raw/main/datasets/ames_housing_no_missing.csv
#ames = pd.read_csv( "https://github.com/INRIA/scikit-learn-mooc/raw/main/datasets/ames_housing_no_missing.csv" )
ames = pd.read_csv( "ames_housing_no_missing.csv" )

In [None]:
numeric = [ u for u in ames.columns if ames[u].dtype in [int, float] ]
X = uniformize( ames[ numeric ] )
#xy = UMAP().fit_transform(X.T)
xy = PCA().fit_transform(X.T)
fig, ax = plt.subplots( figsize = (5,5), layout = 'constrained' )
ax.scatter( xy[:,0], xy[:,1], color = 'white' )
for i, column in enumerate( X.columns ):
    ax.text( xy[i,0], xy[i,1], column, ha = 'center', va = 'center' )
ax.axis('off')
plt.show()

In [None]:
# Pairs of features, sorted by decreasing absolute correlation
C = X.corr()
# Long format: one row per (v1, v2) pair
C = C.melt(ignore_index = False).reset_index()
C.columns = [ 'v1', 'v2', 'correlation' ]
# Keep each unordered pair once, and drop the diagonal
C = C[ C['v1'] < C['v2'] ]
# Reorder the rows without adding a helper column to the filtered frame
# (assigning a column to such a slice raises SettingWithCopyWarning)
order = C['correlation'].abs().sort_values( ascending = False ).index
C = C.loc[ order ]
C.head(10)

In [None]:
corrplot( X.corr(), order = True, figsize = (6,6) )

In [None]:
plt.rcParams['figure.dpi'] = 100
sns.clustermap( X.corr(), cmap = 'RdBu', vmin = -1, vmax = +1, annot = False, fmt = ".2f", figsize = (10,10) )

In [None]:
ames = pd.read_csv( "ames_housing_no_missing.csv" )
numeric = [ u for u in ames.columns if ames[u].dtype in [int, float] ]
X = uniformize( ames[ numeric ] )
y = X[ X.columns[-1] ]
X = X[ X.columns[:-1] ]

# Sort the features along their correlations with the target variable
cs = pd.Series( { column: np.corrcoef( X[column], y )[0,1] for column in X.columns } )
cs = pd.DataFrame( { 
    'correlation': cs,
    'abs': np.abs(cs),
} )
cs = cs.sort_values('abs', ascending = False)
columns = cs.index
X = X[columns]

nr, nc = mfrow( X.shape[1] )
scale = .8
fig, axs = plt.subplots( nr, nc, figsize = (16*scale,9*scale), dpi = 300 )
for i, column in enumerate( X.columns ):
    ax = axs.flatten()[i]
    ax.scatter( X[column], y, alpha = .1 )
    t = ax.text( .5, .6, column, va = 'center', ha = 'center', transform=ax.transAxes )
    t.set_path_effects( [path_effects.Stroke(linewidth=2, foreground="white", alpha=.7), path_effects.Normal()] )
    c = np.corrcoef( X[column], y )[0,1]
    t = ax.text( .5, .4, f"cor={c:.2f}", va = 'center', ha = 'center', transform=ax.transAxes )
    t.set_path_effects( [path_effects.Stroke(linewidth=2, foreground="white", alpha=.7), path_effects.Normal()] )
    #ax.axis('off')
    ax.set_xticks([])
    ax.set_yticks([])
fig.subplots_adjust(wspace = 0, hspace=0)
plt.show()

In [None]:
X = pd.read_csv( "ames_housing_no_missing.csv" )
# The last column is used as the target. The predictors are copied, so that the
# assignment below modifies an independent DataFrame rather than a slice of the file contents.
y = X[ X.columns[-1] ]
X = X[ X.columns[:-1] ].copy()
numeric = [ u for u in X.columns if X[u].dtype in [int, float] ]
strings = [ u for u in X.columns if X[u].dtype not in [int, float] ]
# `uniformize` is defined elsewhere in the notebook
X[numeric] = uniformize( X[ numeric ] )

# Discard columns whose mode accounts for at least 90% of the observations
largest = { u: Counter( X[u] ).most_common(1)[0][1] / X.shape[0] for u in strings }
strings = [ u for u in strings if largest[u] < .90 ]

# Discard columns with too many distinct values (more than 20)
number = { u: X[u].astype(str).nunique() for u in strings }
strings = [ u for u in strings if number[u] <= 20 ]

X = X[ numeric + strings ]

In [None]:
def stacked_bars(c, ax = None, proportion = False, legend = True):
    """Stacked bar chart of a table of counts.

    c:          DataFrame of counts; one bar per row (index value), one stacked
                segment per column. Missing counts are treated as 0.
    ax:         axes to draw on; if None, a new figure is created and shown.
    proportion: if True, divide each row by its total, so that the segments of a bar
                add up to 1.
    legend:     whether to add a legend (one entry per column of `c`).
    """
    ax_was_None = ax is None
    if ax_was_None: 
        fig, ax = plt.subplots( layout = 'constrained' )
    # A missing count (e.g., a combination absent from a pivot table) is an empty segment;
    # left as NaN, it would make the cumulative height NaN and hide all the segments above it
    c = c.fillna(0)
    totals = c.sum( axis = 1 ) if proportion else 1.0
    # Height already used in each bar
    bottom = pd.Series( 0.0, index = c.index )
    for u in c.columns:
        heights = c[u] / totals
        ax.bar( c.index, heights, bottom = bottom, label = u )
        bottom = bottom + heights
    if legend: 
        # Reversed, so that the legend entries are in the same vertical order as the segments
        ax.legend( reverse = True, title = c.columns.name )
    ax.set_xlabel( c.index.name )
    ax.set_ylabel( "Proportion of observations" if proportion else "Number of observations" )
    if ax_was_None: 
        plt.show()

def my_pair(d, figsize = (9,9)): 
    """Pair plot of the columns of the DataFrame `d`, for numeric and qualitative variables.

    Integer and float columns are treated as numeric, all the others as qualitative.
    Diagonal panels: histogram (numeric) or bar chart of the counts (qualitative).
    Off-diagonal panels, depending on the types of the row and column variables:
    scatterplot with correlation and LOWESS curve (both numeric), boxplots of the numeric
    variable for each level of the qualitative one, or stacked bars (both qualitative).
    A panel that fails is reported with a printed message and left empty.
    The figure is displayed; nothing is returned.
    """
    def caption(ax, text):
        # Text at the top of the panel, with a white halo so it stays readable over the data
        t = ax.text(.5, .99, text, horizontalalignment='center', verticalalignment='top', transform=ax.transAxes)
        t.set_path_effects( [path_effects.Stroke(linewidth=3, foreground="white", alpha = .7), path_effects.Normal()] )

    def strip(ax):
        # Keep only the bottom spine; no vertical ticks
        for side in ['left', 'right', 'top']:
            ax.spines[side].set_visible(False)
        ax.set_yticks([])

    k = d.shape[1]
    # squeeze=False keeps `axs` two-dimensional, even when there is a single column
    fig, axs = plt.subplots(k, k, figsize=figsize, squeeze=False)
    # Integers and floats of any width (dtype kinds i, u, f), on any platform
    numeric = { u: getattr( d[u].dtype, 'kind', 'O' ) in 'iuf' for u in d.columns }
    for i, id1 in enumerate( d.columns ):
        for j, id2 in enumerate( d.columns ):
            ax = axs[i,j]
            try: 
                if i == j: 
                    if numeric[id1]:
                        ax.hist( d[id1], bins=20, density = True )
                        caption( ax, id1 )
                        strip(ax)
                    else: 
                        c = d[id1].astype(str).value_counts().sort_index()
                        ax.bar( c.index, c.values )
                        strip(ax)
                        ax.set_xticks([])
                elif numeric[id1] and numeric[id2]:
                    ax.scatter( d[id2], d[id1], alpha = .01, s = 10 )
                    caption( ax, f"{np.corrcoef( d[id1], d[id2] )[0,1]:.2f}" )
                    # Smoothed d[id1] as a function of d[id2]; columns of the result: x, fitted value
                    ys = statsmodels.nonparametric.smoothers_lowess.lowess(d[id1], d[id2], frac=.5)
                    ax.plot( ys[:,0], ys[:,1], color = 'orange', linewidth = 3 )
                    ax.axis('off')
                elif numeric[id1] or numeric[id2]:
                    # One box per level of the qualitative variable: vertical boxes if it is
                    # the column variable, horizontal ones if it is the row variable
                    value, group, vertical = (id1, id2, True) if numeric[id1] else (id2, id1, False)
                    # Group on the string form of the levels: comparing the raw values with
                    # their string form would leave every box empty for non-string levels
                    levels = d[group].astype(str)
                    # No tick labels: the axis is hidden anyway
                    ax.boxplot( 
                        [ d[value][ levels == v ] for v in sorted( levels.unique() ) ],
                        vert = vertical,
                    )
                    ax.axis('off')
                else: 
                    tmp = d[[id1,id2]].copy()
                    tmp['count'] = 1
                    c = tmp.groupby([id1,id2]).sum().reset_index().pivot( index = id2, columns = id1, values = 'count' )
                    # Combinations that never occur are empty segments, not missing values
                    c = c.fillna(0)
                    stacked_bars( c, ax, legend = False )
                    strip(ax)
                    ax.set_xticks([])
                    ax.set_xlabel(None)
                    ax.set_ylabel(None)
            except Exception as e:
                # Best effort: one failing panel should not prevent the others from being drawn
                print( f"PROBLEM {id1} {id2} {e}" )
                ax.axis('off')
    fig.subplots_adjust(wspace = 0, hspace=0)                                
    plt.show()  

In [None]:
s = 3
#my_pair(X, figsize = (16*s, 9*s))

## Select individual features

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression
select = SelectKBest(f_regression, k=2)
X_new = select.fit_transform(X[numeric], y)
X_new = pd.DataFrame( X_new, columns = select.get_feature_names_out() )
X_new

## Select sets of features

In [None]:
X = pd.read_csv( "ames_housing_no_missing.csv" )
# The last column is used as the target. The predictors are copied, so that the
# assignment below modifies an independent DataFrame rather than a slice of the file contents.
y = X[ X.columns[-1] ]
X = X[ X.columns[:-1] ].copy()
numeric = [ u for u in X.columns if X[u].dtype in [int, float] ]
strings = [ u for u in X.columns if X[u].dtype not in [int, float] ]
# `uniformize` is defined elsewhere in the notebook
X[numeric] = uniformize( X[ numeric ] )

# Numeric predictors only, and log-transformed target
X = X[numeric]
y = np.log(y)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

model = LinearRegression()
selector = RFE( estimator = model, n_features_to_select = 5, step = 1 )
selector.fit(X, y)
X_new = X[ selector.get_feature_names_out() ]

In [None]:
selector = RFE( estimator = model, n_features_to_select = 1 )
selector.fit(X, y)
features = pd.Series( dict( zip( X.columns, selector.ranking_ ) ) ).sort_values().index

scores = []
for k in range(1,30): 
    model.fit( X[features[:k]], y )
    scores.append( model.score(X[features[:k]],y) )
#plt.plot( scores )

scale = .5
fig, axs = plt.subplots( 3, 4, figsize = (scale*16,scale*9), layout = 'constrained', dpi = 100 )
for k in 1 + np.arange(len(axs.flatten())):
    ax = axs.flatten()[k-1]
    model.fit( X[features[:k]], y )
    y_hat = model.predict( X[features[:k]] )
    ax.scatter( y_hat, y, alpha = .01, s = 20 )
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel(r'$\hat y$')
    ax.set_ylabel(r'$y$')
    ax.set_title( f"$k={k}$" )
plt.show()

In [None]:
y

In [None]:
scale = .5
fig, axs = plt.subplots( 3, 4, figsize = (scale*16,scale*9), layout = 'constrained', dpi = 100 )
for k in 1 + np.arange(len(axs.flatten())):
    ax = axs.flatten()[k-1]
    model.fit( X[features[:k]], y )
    y_hat = model.predict( X[features[:k]] )

    a = pd.DataFrame( { 'y_hat': y_hat, 'y': y.values } ).sort_values('y_hat').reset_index(drop=True).set_index('y_hat').rolling(50, center=True)
    m = a.mean().dropna().squeeze()
    s = a.std().dropna().squeeze()
    ax.fill_between( m.index, m + 2*s, m - 2*s, color = 'lightblue', alpha = .8, zorder = 0 )

    ax.axline( (12,12), slope = 1, color = 'tab:blue' )
    
    ax.scatter( y_hat, y, alpha = .5, color = 'black', s = 1 )
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel(r'$\hat y$')
    ax.set_ylabel(r'$y$')
    ax.set_title( f"$k={k}$" )
plt.show()


plt.plot( a.mean() )
plt.plot( a.mean() + a.std() )

In [None]:
#plt.fill_between( m.index, m + 2*s, m - 2*s, color = 'lightblue' )
m.index
m.squeeze()

In [None]:
# Recursive feature elimination; the number of features is chosen by 5-fold cross-validation
from sklearn.feature_selection import RFECV
from sklearn.model_selection   import KFold
from sklearn.linear_model      import LinearRegression

model    = LinearRegression()
cv       = KFold(5)
selector = RFECV( estimator = model, cv = cv, scoring = 'r2' )
# `y` was already log-transformed when the data was prepared (y = np.log(y)): taking the
# logarithm again would fit log(log(y)), inconsistent with the other models in this section.
# NOTE(review): this assumes the data-preparation cell of this section was run just before.
selector.fit(X, y)
X_new = X[ selector.get_feature_names_out() ]
# Number of features kept, out of the number of features provided
selector.n_features_, selector.n_features_in_

In [None]:
fig, ax = plt.subplots( figsize = (5,4), layout = 'constrained', dpi = 100 )
x = np.arange( X.shape[1] ) + 1
m = selector.cv_results_['mean_test_score']
ax.plot( x, m )
ys = np.vstack( [ selector.cv_results_[f'split{i}_test_score'] for i in range(5) ] ).T
ax.fill_between( x, ys.min(axis=1), ys.max(axis=1), color = 'lightblue' )
ax.axvline( selector.n_features_, color = 'black', linestyle = ':' )
ax.set_xlabel( "Number of predictors" )
ax.set_ylabel( "Cross-validation score" )
plt.show()

In [None]:
scores = []
for k in range(1,30): 
    features = selector.get_feature_names_out()[:k]   # The features are NOT in the correct order...
    model.fit( X[features], y )
    scores.append( model.score(X[features],y) )
plt.plot( scores )

In [None]:
tmp = load_iris(as_frame=True)['data']

In [None]:
# Remove constant features
# The result has fewer columns as soon as one feature is constant, so it cannot be
# written back into X with `X[:] = ...`: select the retained columns instead.
# Assumes X is a DataFrame of numeric features.
variance_filter = VarianceThreshold().fit(X)
X = X.loc[ :, variance_filter.get_support() ]

In [None]:
# Select the "best" features, in a univariate way
# - Choose a metric to measure how useful the features are: 
#     regression:     r_regression, f_regression, mutual_info_regression
#     classification: chi2, f_classif, mutual_info_classif
#   F tests: linear dependence
#   Mutual information: non-parametric, requires more data
#   chi2: if the features are frequencies (counts) -- in particular, they should be non-negative
# - You can then pick the top k, or the top x%, or those for which the FPR, FDR or FWE exceeds some threshold

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
X, y = load_iris(return_X_y=True, as_frame = True)
select = SelectKBest(f_classif, k=2)
X_new = select.fit_transform(X, y)
X_new = pd.DataFrame( X_new, columns = select.get_feature_names_out() )

f_classif(X, y)  # Returns F-statistics and corresponding p-values

In [None]:
# Recursive feature elimination
# RFE: you specify the desired number of features
# RFECV: the number of features to keep is chosen by cross-validation
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

rfecv = RFECV(
    estimator = LogisticRegression(),
    step      = 1,
    cv        = StratifiedKFold(5),
    scoring   = "accuracy",
    min_features_to_select = 1,
)
rfecv.fit(X, y)


In [None]:
from sklearn.feature_selection import *
f_regression
f_oneway
f_classif
RFE, RFECV
SelectPercentile
VarianceThreshold

# TO DELETE

In [None]:
from pydataset import data
data("quakes").head(20)
data("diamonds")[['clarity','color']].head(20)
data("faithful")[['waiting']].head(20)

In [None]:
# Quintile of each observation (1 to 5), from the empirical quantiles
from bisect import bisect_left
np.random.seed(0)  # Reproducible, as in the other examples
x = np.random.normal( size = 100 )
d = pd.DataFrame( { 'x': x } )
k = 5
p = np.linspace(0,1,k+1)  
q = np.nanquantile(d['x'], p)
# Only the interior quantiles separate the groups. Searching among all of them (minimum
# and maximum included) would put the smallest observation alone in an extra group, 0.
cuts = list( q[1:-1] )
y = [ 1 + bisect_left(cuts, u) for u in d['x'] ]

In [None]:
from sklearn.datasets import fetch_openml
housing = fetch_openml(name="house_prices", as_frame=True)

fig, ax = plt.subplots( figsize = (4,3), layout = 'constrained' )
ax.scatter(
    housing['data']['GrLivArea'],
    housing['target'],
    alpha = .1,
)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel( "Area (sq.ft.)" )
ax.set_ylabel( "Price (USD)" )
ax.set_title( "House prices" )
plt.show()

# Unsorted

In [None]:
def my_hist(x, ax=None, facecolor='lightblue', edgecolor='tab:blue', density  = True, kde = False, **kwargs):
    """Histogram of `x`, with the colours used throughout the notebook.

    x:         the data.
    ax:        axes to draw on; if None, a new figure is created and shown.
    facecolor, edgecolor, density, **kwargs: passed to `Axes.hist`.
    kde:       if True, also draw a Gaussian kernel density estimate over the range of
               the data; it is on the same scale as the histogram only if `density` is True.
    """
    ax_was_None = ax is None
    if ax_was_None: 
        fig, ax = plt.subplots()
    ax.hist(x, facecolor=facecolor, edgecolor=edgecolor, density=density, **kwargs)
    if kde:
        from scipy.stats import gaussian_kde
        values = np.asarray( x, dtype = float )
        xs = np.linspace( values.min(), values.max(), 100 )
        ax.plot( xs, gaussian_kde(values)(xs), linewidth = 5 )
    if ax_was_None: 
        plt.show()

# TO DELETE

In [None]:
from sklearn.pipeline        import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base            import BaseEstimator, ClassifierMixin
from sklearn.linear_model    import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics         import roc_auc_score
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.compose         import ColumnTransformer
from sklearn.impute          import SimpleImputer

In [None]:
# Three-stage pipeline: standardise, drop features by variance threshold,
# then classify with k-nearest neighbours (all with default settings).
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('selector', VarianceThreshold()),
        ('classifier', KNeighborsClassifier()),
    ]
)
pipe

In [None]:
# Categorical columns: fill missing values with the constant "missing", then one-hot encode
# (categories unseen during fit are ignored at transform time).
categorical_preprocessor = Pipeline(
    steps=[
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric columns: replace NaN with the column mean, then standardise.
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Route each group of columns to its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["state", "gender"]),
        ("numerical", numeric_preprocessor, ["age", "weight"]),
    ]
)

# Full model: preprocessing followed by a logistic regression.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500)),
    ]
)
pipe

In [None]:
# Grid search over random-forest hyper-parameters.
# The grid below only makes sense for a random forest, whereas the "classifier"
# step of `pipe` is not one; search over a copy of `pipe` whose "classifier"
# step is replaced, leaving `pipe` itself untouched.
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    "classifier__n_estimators": [200, 500],
    # "auto" is no longer accepted by recent scikit-learn; for classifiers it
    # was an alias of "sqrt", so dropping it loses no configuration.
    "classifier__max_features": ["sqrt", "log2"],
    "classifier__max_depth":    [4, 5, 6, 7, 8],
    "classifier__criterion":    ["gini", "entropy"],
}
grid_search = GridSearchCV(
    clone(pipe).set_params(classifier=RandomForestClassifier()),
    param_grid=param_grid,
    n_jobs=1,
)
grid_search

In [None]:
# Concatenate word-level and character-level TF-IDF features (5 each),
# then feed them to a logistic regression.
steps = [
    (
        'wc_tfidfs',
        FeatureUnion(
            transformer_list=[
                ('tfidf_w', TfidfVectorizer(analyzer='word', max_features=5)),
                ('tfidf_c', TfidfVectorizer(analyzer='char', max_features=5)),
            ]
        ),
    ),
    ('lr', LogisticRegression()),
]
Pipeline(steps=steps)

In [None]:
# Print every attribute name of the sklearn.preprocessing module (as returned by dir())
print( dir( sklearn.preprocessing ) )

def f(dd, title=None):
    """Show horizontal box plots of the columns of `dd` with the raw points overlaid.

    One box is drawn per column (first column at the top); each observation is
    also plotted at its value, with a uniform vertical jitter of +/- 0.3
    around the row of its box.

    dd:    DataFrame (columns are plotted, column names label the y-axis)
    title: title of the plot
    """
    n_rows, n_cols = dd.shape[0], dd.shape[1]
    fig, ax = plt.subplots(figsize=(4, 4), layout='constrained', dpi=100)

    # Transparent boxes with black outlines and black medians
    artists = ax.boxplot(dd, vert=False, patch_artist=True)
    for box in artists['boxes']:
        box.set_facecolor('None')
        box.set_edgecolor('black')
    for median in artists['medians']:
        median.set_color('black')

    ax.set_yticks(1 + np.arange(n_cols))
    ax.set_yticklabels(dd.columns)

    # Jittered raw observations, column j on row j+1
    jitter = .3
    for j in range(n_cols):
        heights = (j + 1) + np.random.uniform(-jitter, jitter, size=n_rows)
        ax.scatter(dd.iloc[:, j], heights, alpha=.2)

    # Inverted limits: first column at the top
    ax.set_ylim(n_cols + .5, .5)
    ax.set_title(title)
    plt.show()
    
# Three columns with increasingly heavy tails: normal, log-normal, Cauchy.
n = 200
X = pd.DataFrame( {
    'x': np.random.normal(size=n),
    'y': np.random.lognormal(size=n),
    'z': np.random.standard_cauchy(size=n),
} )

# Raw data first, then the same data after each scaler; the plot title is the
# scaler's class name.
f(X)
for scaler in (
    MinMaxScaler(),
    StandardScaler(),
    RobustScaler(),
    # n_quantiles defaults to 1000, which exceeds the number of samples:
    # scikit-learn would warn and fall back to n_samples, so ask for it directly.
    QuantileTransformer(n_quantiles=n),
):
    scaled = Pipeline([('scaler', scaler)]).fit_transform(X)
    f(pd.DataFrame(scaled, columns=X.columns), type(scaler).__name__)

In [None]:
from sklearn.pipeline import make_pipeline  # Wrapper around Pipeline(); no names
from sklearn.impute import KNNImputer, MissingIndicator, SimpleImputer
import sklearn.compose
# Scratch list of scikit-learn classes worth remembering.
# Each line is a bare name: evaluating it has no effect, and when run as a
# notebook cell only the last one (VarianceThreshold) is displayed.
KNNImputer
MissingIndicator
SimpleImputer
sklearn.pipeline.FeatureUnion
sklearn.compose.ColumnTransformer  # Allows a different transformation for each column (or group of columns)
Normalizer         # Rescale each row so it has L^2 norm 1
PowerTransformer   # Box-Cox or (default) Yeo-Johnson (which also allows negative values)
VarianceThreshold  # Remove low-variance (or zero-variance) features


In [None]:
# One horizontal box plot per column of X, stacked vertically.
k = X.shape[1]
# squeeze=False always returns a 2-D array of Axes, even when k == 1 (where the
# default would return a bare Axes with no .flatten()); ravel() gives the
# 1-D array of Axes used below.
fig, axs = plt.subplots( k, 1, figsize = (8,2*k), layout = 'constrained', dpi=100, squeeze = False )
axs = axs.ravel()
for i, ax in enumerate(axs):
    x = X.iloc[:,i].values
    b = ax.boxplot(x, vert=False, patch_artist=True)
    for u in b['boxes']:
        u.set_facecolor('lightblue')
        u.set_edgecolor('black')
    for u in b['medians']:
        u.set_color('black')
    # Only keep the bottom axis
    ax.set_yticks([])
    for side in ['left', 'right', 'top']:
        ax.spines[side].set_visible(False)
    # White point at y=3, above the box (drawn at y=1) -- presumably there to
    # stretch the y-range and leave empty space above the box; TODO confirm
    ax.scatter( x[0], 3, color = 'white' )
    ax.set_xlabel( X.columns[i] )
plt.show()

In [None]:
# Simulate n steps of the recurrence x[i] = alpha * x[i-1] - c[i],
# starting from x[0] = 3.2e6, where the amount subtracted, c, starts at 1e5
# and is multiplied by beta at each step.
n = 100
alpha = 1.05
beta = 1.02
c = .1e6 * np.power(beta, np.arange(n))
x = np.full(n, 3.2e6)
for i in range(1, n):
    previous = x[i - 1]
    x[i] = alpha * previous - c[i]
# Plot the trajectory with a thin black line at zero; the y-axis starts just
# below zero and ends 2% above the maximum.
fig, ax = plt.subplots()
ax.plot(x)
ax.set_ylim(bottom=-1e4, top=x.max() * 1.02)
ax.axhline(y=0, color='black', linewidth=1)
plt.show()
# Index of the last step at which x is still positive
np.flatnonzero(x > 0)[-1]