In [15]:
%useLatestDescriptors
%use dataframe, kotlin-dl, kandy

In [4]:
// Read the semicolon-delimited wine-quality CSV from the local data folder.
val rawDf = DataFrame.readCSV(
    fileOrUrl = "./data/winequality-red.csv",
    delimiter = ';',
)
// Bare expression so the loaded frame is the cell's result.
rawDf

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [39]:
// Per-column summary of the raw frame (type, count, nulls, mean, std, min/median/max, ...).
rawDf.describe()

name,type,count,unique,nulls,top,freq,mean,std,min,median,max
fixed acidity,Double,1599,96,0,7.2,67,8.319637,1.741096,4.6,7.9,15.9
volatile acidity,Double,1599,143,0,0.6,47,0.527821,0.17906,0.12,0.52,1.58
citric acid,Double,1599,80,0,0.0,132,0.270976,0.194801,0.0,0.26,1.0
residual sugar,Double,1599,91,0,2.0,156,2.538806,1.409928,0.9,2.2,15.5
chlorides,Double,1599,153,0,0.08,66,0.087467,0.047065,0.012,0.079,0.611
free sulfur dioxide,Double,1599,60,0,6.0,138,15.874922,10.460157,1.0,14.0,72.0
total sulfur dioxide,Double,1599,144,0,28.0,43,46.467792,32.895324,6.0,38.0,289.0
density,Double,1599,436,0,0.9972,36,0.996747,0.001887,0.99007,0.99675,1.00369
pH,Double,1599,89,0,3.3,57,3.311113,0.154386,2.74,3.31,4.01
sulphates,Double,1599,96,0,0.6,69,0.658149,0.169507,0.33,0.62,2.0


In [11]:
// Long-format correlation table with the columns "column", "row" and "value".
val correlationTable = rawDf
    // Cast every value column to Double first.
    .convert { valueCols() }
    .toDouble()
    // Correlate the value columns with themselves.
    .corr { valueCols() }
    .withItself()
    // Fold every column after "column" into ("row", "value") pairs.
    .gather { allAfter("column") }
    .into("row", "value")

correlationTable

column,row,value
fixed acidity,fixed acidity,1.0
fixed acidity,volatile acidity,-0.256131
fixed acidity,citric acid,0.671703
fixed acidity,residual sugar,0.114777
fixed acidity,chlorides,0.093705
fixed acidity,free sulfur dioxide,-0.153794
fixed acidity,total sulfur dioxide,-0.113181
fixed acidity,density,0.668047
fixed acidity,pH,-0.682978
fixed acidity,sulphates,0.183006


In [37]:
/** JetBrains color palette, exposed as named hex colors. */
object JetBrainsColors {
    /** Orange, `#ff6632`. */
    val orange = Color.hex("#ff6632")

    /** Light orange, `#ffb59e`. */
    val lightOrange = Color.hex("#ffb59e")

    /** Dark grey, `#4c4c4c`. */
    val darkGrey = Color.hex("#4c4c4c")

    /** Light grey, `#a6a6a6`. */
    val lightGrey = Color.hex("#a6a6a6")
}

/**
 * Builds a continuous color gradient over the domain from -1.0 to 1.0,
 * with the gradient stops listed as red, white, green.
 */
fun scaleContinuousColorGradientN() = continuousColorGradientN(
    gradientColors = listOf(Color.RED, Color.WHITE, Color.GREEN),
    domainMin = -1.0,
    domainMax = 1.0,
)

In [38]:
// Tile plot of the correlation table: one tile per ("row", "column") pair.
correlationTable.plot {
    tiles {
        // Both axes get an empty name.
        x(row) { axis.name = "" }
        y(column) { axis.name = "" }
        // Tile fill is driven by the correlation value through the gradient scale defined above.
        fillColor(value) { scale = scaleContinuousColorGradientN() }
    }
    layout {
        // NOTE(review): presumably width to height in pixels — confirm against the kandy layout docs.
        size = 800 to 600
    }
}

In [40]:
// Working frame: the raw data without three of its feature columns.
val df = rawDf.remove {
    `free sulfur dioxide` and `residual sugar` and pH
}
// Bare expression so the reduced frame is the cell's result.
df

fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol,quality
7.4,0.7,0.0,0.076,34.0,0.9978,0.56,9.4,5
7.8,0.88,0.0,0.098,67.0,0.9968,0.68,9.8,5
7.8,0.76,0.04,0.092,54.0,0.997,0.65,9.8,5
11.2,0.28,0.56,0.075,60.0,0.998,0.58,9.8,6
7.4,0.7,0.0,0.076,34.0,0.9978,0.56,9.4,5
7.4,0.66,0.0,0.075,40.0,0.9978,0.56,9.4,5
7.9,0.6,0.06,0.069,59.0,0.9964,0.46,9.4,5
7.3,0.65,0.0,0.065,21.0,0.9946,0.47,10.0,7
7.8,0.58,0.02,0.073,18.0,0.9968,0.57,9.5,7
7.5,0.5,0.36,0.071,102.0,0.9978,0.8,10.5,5


In [42]:
/**
 * Simple converter between the DataFrame and KotlinDL data representations.
 *
 * @param labelColumnName name of the column handed to `OnHeapDataset.create` as `yColumn`.
 * @return the [OnHeapDataset] built from this frame.
 */
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)

/**
 * Builds an [OnHeapDataset] from [dataframe], using the column named [yColumn] as the
 * labels and every remaining column as the features.
 *
 * Each cell is converted explicitly through [Number.toFloat], so any numeric column type
 * is accepted (for example an `Int` label column next to `Float` feature columns) without
 * relying on an unchecked `List<Float>` cast.
 *
 * @param dataframe source table; every cell is expected to be a non-null number.
 * @param yColumn name of the label column.
 * @return the dataset produced by the generator-based `create` overload.
 * @throws IllegalArgumentException when a generator runs and meets a null or non-numeric cell.
 */
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    // Single conversion point so that bad cells fail with a readable message
    // instead of an opaque ClassCastException / NullPointerException.
    fun numericCell(value: Any?, origin: String): Float =
        (value as? Number)?.toFloat()
            ?: throw IllegalArgumentException(
                "Expected a numeric value in $origin but found: $value"
            )

    // One FloatArray per row, built from all columns except the label column.
    fun extractX(): Array<FloatArray> =
        dataframe.remove(yColumn).rows()
            .map { row ->
                row.values().map { numericCell(it, "a feature column") }.toFloatArray()
            }
            .toTypedArray()

    // Label column as a flat FloatArray, in row order.
    fun extractY(): FloatArray =
        dataframe.get { yColumn<Any?>() }.toList()
            .map { numericCell(it, "label column '$yColumn'") }
            .toFloatArray()

    return create(
        ::extractX,
        ::extractY
    )
}

In [43]:
val (train, test) = df
    // Double columns become Float columns before the conversion.
    .convert { colsOf<Double>() }
    .toFloat()
    // "quality" is the label column; the remaining columns are the features.
    .toOnHeapDataset(labelColumnName = "quality")
    // Split with ratio 0.8 into the two datasets destructured as (train, test).
    .split(0.8)

In [52]:
// Length of the first training sample's feature array, as a Long.
val inputNeurons = train.x[0].size.toLong()