### Predict High Risk Windows for Burglary

In [1]:
%use smile

In [95]:
import okhttp3.internal.format
import smile.classification.RandomForest
import smile.data.DataFrame
import smile.data.formula.Formula
import smile.data.type.DataTypes
import smile.data.vector.IntVector
import smile.data.vector.StringVector
import smile.validation.metric.Accuracy

// Load your dataset. Assuming CSV format for simplicity
val dataPath:String = "/Users/urs/development/github/ai/kotlin-ai-talk/langchain4j/src/main/resources/dataset/burglaries_enriched.csv"
val rawData: DataFrame = read.csv(dataPath, header=true)

// Transform 'hourOfDay' into categorical time windows: morning (0), afternoon (1), evening (2)
var data = rawData
    .merge(IntVector.of("dayOfWeekNumber", rawData.stringVector("DayOfWeek").toStringArray().map { dayOfWeek -> 
        when (dayOfWeek) {
        "MONDAY" -> 1
        "TUESDAY" -> 2
        "WEDNESDAY" -> 3    
        "THURSDAY" -> 4 
        "FRIDAY" -> 5   
        "SATURDAY" -> 6 
        "SUNDAY" -> 7
            else -> 0
    }
}.toIntArray()))
    .merge(StringVector.of("class", * rawData.intVector("hourOfDay").toIntArray().map { hour ->
        when (hour) {
            in 0..11 -> "Morning"   // Morning
            in 12..17 -> "Afternoon"  // Afternoon
            else -> "Evening"       // Evening
        }
    }.toTypedArray()))
    .drop("DayOfWeek").drop("CategoryOfItemsStolen").drop("TypeOfProperty").drop("AmountStolen")

// Assuming 'dayOfWeek' and 'AmountStolen' are already suitable for use and do not require transformation
// Setup formula: Predicting time windows as multi-labels
val formula = Formula.lhs("class")

// Convert categorical features to dummy variables if needed
data = formula.frame(data)
println("Feature Schema: \n${data}")
println(data.summary())

// Split data into training and test sets

//val splits = (data, 0.8)
//val trainData = splits.train
//val testData = splits.test
//
 //Train the model
val model = RandomForest.fit(formula, data)
println("OOB error = ${rf.metrics()}")
//// Predict on test data
val prediction = model.predict(data)
val accuracy = Accuracy.of(data.intVector("timeWindow").toIntArray(), prediction)
//println("Test Accuracy: $accuracy")

//// Print the predictions (for the first few for brevity)
//prediction.take(10).forEach { println("Predicted time window: $it") }

Feature Schema: 
[class: String, hourOfDay: int, dayOfWeekNumber: int]
+-------+---------+---------------+
|  class|hourOfDay|dayOfWeekNumber|
+-------+---------+---------------+
|Morning|        1|              3|
|Morning|        1|              3|
|Morning|        7|              6|
|Morning|        8|              4|
|Evening|       19|              7|
|Morning|        1|              3|
|Morning|        1|              3|
|Morning|        7|              6|
|Morning|        8|              4|
|Evening|       19|              7|
+-------+---------+---------------+
35 more rows...

[column: String, count: long, min: double, avg: double, max: double]
+---------------+-----+---+--------+---+
|         column|count|min|     avg|max|
+---------------+-----+---+--------+---+
|      hourOfDay|   45|  0|9,244444| 20|
|dayOfWeekNumber|   45|  1|4,355556|  7|
+---------------+-----+---+--------+---+



java.lang.UnsupportedOperationException: class:String

In [8]:
// Create some data. 
import smile.data.DataFrame
import smile.data.formula.Formula
import smile.regression.*
var data = Array<DoubleArray>(13){doubleArrayOf(0.0)}
data[0] = doubleArrayOf(57.3142861,45.0,14.0,164900.0,116910.0,48.7392861,133.930123 )
data[1] = doubleArrayOf(23.0073691,43.0,12.0,138633.333,116910.0,30.9357145,138.55321)
data[2] = doubleArrayOf(43.8676314,66.0,21.0,151266.667,120633.333,57.3142861,139.144051)
data[3] = doubleArrayOf(20.0705358,47.0,10.0,128500.0,120633.333,23.0073691,123.355951)
data[4] = doubleArrayOf(27.6794644,40.0,9.0,148766.667,122181.667,43.8676314,122.667478)
data[5] = doubleArrayOf(27.4578573,146.0,66.0,138566.667,126548.333,20.0705358,109.957522)
data[6] = doubleArrayOf(15.9874061,118.0,71.0,134733.333,128621.667,27.4578573,83.9544647)
data[7] = doubleArrayOf(14.8142858,138.0,63.0,127533.333,137826.333,15.9874061,43.041923)
data[8] = doubleArrayOf(15.6678573,150.0,86.0,109466.667,141869.667,14.8142858,35.7447188)
data[9] = doubleArrayOf(14.3601192,128.0,72.0,96800.0,141120.333,15.6678573,37.8162068)
data[10] = doubleArrayOf(14.2407408,145.0,81.0,97166.6667,144084.333,14.3601192,45.3637191)
data[11] = doubleArrayOf(13.6261905,134.0,70.0,98900.0,141082.333,14.2407408,39.4444231)
data[12] = doubleArrayOf(14.1465714,135.0,72.0,112233.333,127618.0,13.6261905,30.6519913)
val features = arrayOf("x1","x2","x3","x4","x5","x6")

// Convert the data into a SMILE DataFrame
val df = DataFrame.of(data, "y", *features) // The first column is the target value (y), and the other columns are the features.

// Make a formula
val frm = Formula.of("y", *features)

// Regression
val lm = ridge(frm, df, lambda=1.0)

// Print some results
println( "\nIntercept: ${"%.2e".format(lm.intercept())}" )
println( "Coefficients:" )
for ( ix in 0 until features.size ){
    println( "\t${features[ix]}: ${"%.2e".format(lm.coefficients()[ix])}")
}

// Make predictions
val predictions = lm.predict(df)
println("\nActual\tPredicted")
for ( ix in 0 until data.size ) {
    println("${"%.2f".format(data[ix][0])}\t${"%.2f".format(predictions[ix])}")
}


Intercept: -1,97e+01
Coefficients:
	x1: 3,51e-02
	x2: -8,79e-03
	x3: 1,99e-04
	x4: -2,60e-06
	x5: 5,06e-01
	x6: 2,05e-02

Actual	Predicted
57,31	41,61
23,01	27,43
43,87	44,01
20,07	21,24
27,68	35,57
27,46	24,45
15,99	25,86
14,81	18,54
15,67	14,42
14,36	11,73
14,24	11,80
13,63	11,68
14,15	13,89


In [96]:
val iris = read.arff("/Users/urs/development/github/ai/kotlin-ai-talk/langchain4j/src/main/resources/dataset/iris.arff")
println(iris)
iris.summary()

[sepallength: float, sepalwidth: float, petallength: float, petalwidth: float, class: byte nominal[Iris-setosa, Iris-versicolor, Iris-virginica]]
+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5,1|       3,5|        1,4|       0,2|Iris-setosa|
|        4,9|         3|        1,4|       0,2|Iris-setosa|
|        4,7|       3,2|        1,3|       0,2|Iris-setosa|
|        4,6|       3,1|        1,5|       0,2|Iris-setosa|
|          5|       3,6|        1,4|       0,2|Iris-setosa|
|        5,4|       3,9|        1,7|       0,4|Iris-setosa|
|        4,6|       3,4|        1,4|       0,3|Iris-setosa|
|          5|       3,4|        1,5|       0,2|Iris-setosa|
|        4,4|       2,9|        1,4|       0,2|Iris-setosa|
|        4,9|       3,1|        1,5|       0,1|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
140 more rows.

[column: String, count: long, min: double, avg: double, max: double]
+-----------+-----+---+--------+---+
|     column|count|min|     avg|max|
+-----------+-----+---+--------+---+
|sepallength|  150|4,3|5,843333|7,9|
| sepalwidth|  150|  2|   3,054|4,4|
|petallength|  150|  1|3,758667|6,9|
| petalwidth|  150|0,1|1,198667|2,5|
+-----------+-----+---+--------+---+


In [97]:
val rf = RandomForest.fit(Formula.lhs("class"), iris)
println("OOB error = ${rf.metrics()}")

OOB error = {
  fit time: 110,906 ms,
  score time: 4,512 ms,
  validation data size: 150,
  error: 8,
  accuracy: 94,67%
}


In [98]:
import smile.data.Tuple
import smile.data.vector.BaseVector

//rf.predict(Array.ofDim<Double>(iris, 4))
fun randomSample(data: DataFrame, sampleSize: Int): DataFrame {
    val indices: IntRange = data.shuffled().take(sampleSize).indices
    return data.select(*indices.toList().toIntArray())  
}

// Sample 10 random rows from the iris dataset
val sampleSize = 10
val sampledData = randomSample(iris, sampleSize)

// Print the sampled rows
//println(sampledData)

val predictions = sampledData.stream().map { row ->
    val tuple = Tuple.of(row.toArray(), iris.schema())
    rf.predict(tuple) to tuple.get("class")
}.toArray()

// Print the predictions
println("Predictions for sampled data:")
predictions.forEach { println(it) }
println()




Predictions for sampled data:
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(0, 0.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(2, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(2, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(2, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(1, 1.0)
(2, 2.0)
(2, 2.0)
(2, 2.0)
(2, 2.0)
(2, 2.0)
(2, 2.0)
(1, 2.0)
(2, 2.0

In [73]:
import kotlinx.coroutines.processNextEventInCurrentThread

// Function to split the dataset into training and test sets
fun trainTestSplit(data: DataFrame, trainSize: Double): Pair<DataFrame, DataFrame> {
    val trainCount = (data.nrow() * trainSize).toInt()
    println("Train count: $trainCount, ${data.nrow()}")
    val trainData = data.slice(0, trainCount)      
    //println("Train data: ${trainData.size}")
    val testData = data.slice(trainCount - 1, data.nrow() - 1)   
    return trainData to testData
}

// Split the dataset (e.g., 80% train, 20% test)
val (trainData, testData) = trainTestSplit(iris, 0.8)
val predictions: MutableList<Pair<Int, Any>> = testData.stream().map { row ->
    val tuple = Tuple.of(row.toArray(), iris.schema())
    rf.predict(tuple) to tuple.get("class")
}.toList()
predictions.fold(0 to 0){(ok, nok), (pred, real) ->  if(pred.toDouble() == real)ok + 1 to nok else ok to nok + 1}.let { (ok, nok) -> println("Accuracy: ${ok.toDouble() / (ok + nok)}") }


Train count: 120, 150
Accuracy: 0.9


In [131]:
import io.kotest.assertions.print.printWithType
import smile.regression.RandomForest

val bdata = Read.arff("/Users/urs/development/github/ai/kotlin-ai-talk/langchain4j/src/main/resources/dataset/burglaries.arff")
println(bdata)
println(bdata.summary())

val (trainData, testData) = trainTestSplit(bdata, 0.8)

val rf2 = RandomForest.fit(Formula.lhs("class"), trainData)
println("OOB error = ${rf2.metrics()}")

val predictionsRf: MutableList<Pair<Double, Any>> = testData.stream().map { row ->
    val tuple = Tuple.of(row.toArray(), bdata.schema())
    (rf2.predict(tuple) to tuple.get("class")).also { println(it) }
}.toList()

predictionsRf.fold(0 to 0){(ok, nok), (pred, real) ->  if(pred.roundToInt() == real.toString().toDouble().toInt())ok + 1 to nok else ok to nok + 1}.let { (ok, nok) -> println("Accuracy: ${ok.toDouble() / (ok + nok)}") }

    val hours = intArrayOf(10)  // Example hour
    val daysOfWeek = intArrayOf(1)  // Example day of week (Monday)

    val newData = DataFrame.of(
        IntVector.of("hour", hours),
        IntVector.of("dayOfWeek", daysOfWeek)
    )

    val timeWindowPrediction = rf2.predict(newData)

    // Output the predicted highest risk period
    println("Predicted High-Risk Time Window: ${timeWindowPrediction[0]}")


val dt = DecisionTree.fit(Formula.lhs("class"), trainData)
println("OOB error = ${dt}")

val predictionsDT: MutableList<Pair<Int, Any>> = testData.stream().map { row ->
    val tuple = Tuple.of(row.toArray(), bdata.schema())
    (dt.predict(tuple) to tuple.get("class")).also { println(it) }
}.toList()

predictionsDT.fold(0 to 0){(ok, nok), (pred, real) ->  if(pred == real.toString().toDouble().toInt())ok + 1 to nok else ok to nok + 1}.let { (ok, nok) -> println("Accuracy: ${ok.toDouble() / (ok + nok)}") }



[hour: float, dayOfWeek: float, class: byte nominal[Breakfast-Time, Lunch-Time, Dinner-Time]]
+----+---------+--------------+
|hour|dayOfWeek|         class|
+----+---------+--------------+
|   9|        1|Breakfast-Time|
|  12|        1|    Lunch-Time|
|  21|        1|   Dinner-Time|
|   9|        2|Breakfast-Time|
|  10|        2|Breakfast-Time|
|  14|        2|    Lunch-Time|
|  20|        2|   Dinner-Time|
|   8|        3|Breakfast-Time|
|  14|        3|    Lunch-Time|
|  22|        3|   Dinner-Time|
+----+---------+--------------+
71 more rows...

[column: String, count: long, min: double, avg: double, max: double]
+---------+-----+---+---------+---+
|   column|count|min|      avg|max|
+---------+-----+---+---------+---+
|     hour|   81|  6|14,691358| 23|
|dayOfWeek|   81|  1|  4,08642|  7|
+---------+-----+---+---------+---+

Train count: 64, 81
OOB error = {
  fit time: 8,738 ms,
  score time: 0,748 ms,
  validation data size:: 64,
  RSS: 3,1456,
  MSE: 0,0491,
  RMSE: 0,2217,


In [137]:
import io.kotest.assertions.print.printWithType
import smile.regression.RandomForest

val burglaries = read.csv("/Users/urs/development/github/ai/kotlin-ai-talk/langchain4j/src/main/resources/dataset/burglaries_small.csv")
println(burglaries)
println(burglaries.summary())

val (trainData, testData) = trainTestSplit(burglaries, 0.8)

val rf2 = DecisionTree.fit(Formula.lhs("hour"), trainData)
println("OOB error = ${rf2}")

val predictionsRf: MutableList<Pair<Int, Any>> = testData.stream().map { row ->
    val tuple = Tuple.of(row.toArray(), burglaries.schema())
    (rf2.predict(tuple) to tuple.get("hour")).also { println(it) }
}.toList()

predictionsRf.fold(0 to 0){(ok, nok), (pred, real) ->  if(pred == real.toString().toDouble().toInt())ok + 1 to nok else ok to nok + 1}.let { (ok, nok) -> println("Accuracy: ${ok.toDouble() / (ok + nok)}") }

val daysOfWeek = intArrayOf(5)  // Example day of week (Monday)

val newData = DataFrame.of(
    IntVector.of("dayOfWeek", daysOfWeek)
)

val predictedHour = rf2.predict(newData)

// Output the predicted highest risk period
println("Predicted High-Risk Time Window: ${predictedHour}")

[hour: int, dayOfWeek: int]
+----+---------+
|hour|dayOfWeek|
+----+---------+
|   9|        1|
|  12|        1|
|  21|        1|
|   9|        2|
|  10|        2|
|  14|        2|
|  20|        2|
|   8|        3|
|  14|        3|
|  22|        3|
+----+---------+
71 more rows...

[column: String, count: long, min: double, avg: double, max: double]
+---------+-----+---+---------+---+
|   column|count|min|      avg|max|
+---------+-----+---+---------+---+
|     hour|   81|  6|14,691358| 23|
|dayOfWeek|   81|  1|  4,08642|  7|
+---------+-----+---+---------+---+

Train count: 64, 81
OOB error = n=64
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 64 327,24 11 (0,025000 0,050000 0,087500 0,075000 0,025000 0,050000 0,11250 0,050000 0,050000 0,037500 0,062500 0,15000 0,11250 0,050000 0,037500 0,025000)
 2) dayOfWeek<=5,50000 46 241,88 2 (0,032258 0,048387 0,11290 0,080645 0,032258 0,064516 0,11290 0,064516 0,064516 0,048387 0,064516 0,096774 0,064516 0,032258 0,048387 