## 0. Prepare and import data

#### 0. Clone the repository. Run: `git clone https://github.com/vidardb/util.git && cd util`

#### 1. Install PostgreSQL. Run: `bash ./util-bash/install-pg-madlib.sh install_pg`

#### 2. Create the chicago_taxi_trips database, then import the data. Run: `psql -U postgres -W -f ./madlib-notebook/kmeans/import-data_chicago_taxi_trips.sql`

#### 3. Install MADlib into the chicago_taxi_trips database. Run: `bash ./util-bash/install-pg-madlib.sh install_madlib`

## 1. Load the SQL extension and connect to the database

In [1]:
# Load the `sql` IPython extension so the %sql / %%sql magics used in the cells below are available.
%load_ext sql

In [2]:
# Connect to the local PostgreSQL server (localhost:5432) as user `postgres`,
# selecting the chicago_taxi_trips database.
# NOTE(review): the password is embedded in the URL in plain text; presumably acceptable
# for a local demo only. Confirm before pointing this at a shared server.
%sql postgresql://postgres:postgres@localhost:5432/chicago_taxi_trips

'Connected: postgres@chicago_taxi_trips'

In [3]:
# Ask MADlib for its version string to confirm it is callable in this database.
%sql SELECT madlib.version();

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
1 rows affected.


version
"MADlib version: 1.16, git revision: unknown, cmake configuration time: Tue Jul 2 20:42:19 UTC 2019, build type: Release, build system: Linux-4.9.125-linuxkit, C compiler: gcc 7, C++ compiler: g++ 7"


## 2. Prepare data for MADlib

In [4]:
%%sql
-- Preview five rows of the raw trips table, lowest taxi_id first.
SELECT *
FROM chicago_taxi_trips
ORDER BY taxi_id
LIMIT 5;

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
5 rows affected.


taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
3,2016-12-21 23:00:00,2016-12-21 23:00:00,360,1.6,,607,32.0,8.0,7.5,0.0,0.0,0.0,7.5,Cash,43,18,610,767,733
3,2016-12-03 02:00:00,2016-12-03 02:15:00,300,0.4,,225,8.0,8.0,5.0,0.0,0.0,0.0,5.0,Cash,43,170,351,754,410
7,2016-12-14 15:45:00,2016-12-14 16:30:00,2760,0.0,,959,76.0,8.0,45.75,12.4,0.0,4.0,62.15,Credit Card,109,225,6,167,754
7,2016-12-29 20:00:00,2016-12-29 20:15:00,660,0.7,,225,8.0,8.0,7.25,3.0,0.0,1.5,11.75,Credit Card,109,210,470,754,410
20,2016-12-20 23:15:00,2016-12-20 23:30:00,420,0.0,,792,8.0,8.0,6.75,0.0,0.0,1.0,7.75,Cash,107,454,453,419,615


In [5]:
%%sql
-- Rebuild the k-means input table from scratch so this cell can be re-run.
DROP TABLE IF EXISTS chicago_taxi_trips_change;

CREATE TABLE chicago_taxi_trips_change (
    row_id           serial,              -- Auto-generated row identifier
    taxi_id          int,
    pickup_latitude  decimal(10, 2),
    pickup_longitude decimal(10, 2),
    row_vec          double precision[]   -- Point as [pickup_latitude, pickup_longitude]
);

-- Copy the pickup coordinates from the raw table and pack each pair into a
-- two-element point. The array constructor replaces array_cat() of two
-- single-element arrays, and the cast makes the conversion to
-- double precision[] explicit instead of relying on an assignment cast.
INSERT INTO chicago_taxi_trips_change (taxi_id, pickup_latitude, pickup_longitude, row_vec)
SELECT
    taxi_id,
    pickup_latitude,
    pickup_longitude,
    ARRAY[pickup_latitude, pickup_longitude]::double precision[]
FROM chicago_taxi_trips;


 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
Done.
Done.
999 rows affected.


[]

## 3. Train k-means and store the model in the km_result table:

In [6]:
%%sql
-- Drop any previous model so this cell can be re-run.
DROP TABLE IF EXISTS km_result;

-- Run MADlib's kmeanspp on the prepared points and persist the returned model row.
CREATE TABLE km_result AS
SELECT km.*
FROM madlib.kmeanspp(
    'chicago_taxi_trips_change',    -- source table
    'row_vec',                      -- column holding each point's coordinates
    5,                              -- number of centroids to compute
    'madlib.squared_dist_norm2',    -- distance function
    'madlib.avg',                   -- aggregate function
    20,                             -- iteration limit
    0.001                           -- reassigned-fraction threshold for continuing to iterate
) AS km;

-- Show the fitted model.
SELECT * FROM km_result;


 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
Done.
1 rows affected.
1 rows affected.


centroids,cluster_variance,objective_fn,frac_reassigned,num_iterations
"[[55.9051094890511, 629.069343065693], [707.757062146893, 296.005649717514], [566.664150943396, 597.543396226415], [201.411764705882, 125.426470588235], [234.65306122449, 426.680272108844]]","[1794859.21532847, 2629291.5480226, 7745150.86037737, 2356920.20588235, 1993107.27891156]",16519329.1085224,0.0,5


## 4. Calculate the simplified silhouette coefficient:

In [7]:
%%sql
-- Score the clustering with MADlib's simplified silhouette coefficient.
SELECT *
FROM madlib.simple_silhouette(
    'chicago_taxi_trips_change',          -- table of input points
    'row_vec',                            -- column containing the points
    (SELECT centroids FROM km_result),    -- centroids taken from the fitted model
    'madlib.dist_norm2'                   -- distance function
);

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
1 rows affected.


simple_silhouette
0.582995460000751


## 5. Find the cluster assignment for each point:

In [8]:
%%sql
-- Label each point with the index of its closest centroid and show the ten
-- rows with the highest row_id.
-- km_result holds a single model row, so the explicit CROSS JOIN attaches its
-- centroids to every point (replacing the implicit comma join).
-- Column references are qualified so neither table can make them ambiguous.
SELECT
    data.*,
    (madlib.closest_column(km.centroids, data.row_vec)).column_id AS cluster_id
FROM chicago_taxi_trips_change AS data
CROSS JOIN km_result AS km
ORDER BY data.row_id DESC
LIMIT 10;


 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
10 rows affected.


row_id,taxi_id,pickup_latitude,pickup_longitude,row_vec,cluster_id
999,7040,18.0,610.0,"[18.0, 610.0]",0
998,7145,411.0,545.0,"[411.0, 545.0]",2
997,7864,433.0,757.0,"[433.0, 757.0]",2
996,6620,167.0,754.0,"[167.0, 754.0]",0
995,393,170.0,351.0,"[170.0, 351.0]",4
994,1082,688.0,206.0,"[688.0, 206.0]",1
993,55,18.0,610.0,"[18.0, 610.0]",0
992,7749,754.0,410.0,"[754.0, 410.0]",1
991,7564,744.0,605.0,"[744.0, 605.0]",2
990,7065,210.0,470.0,"[210.0, 470.0]",4
