## 0. Prepare and import data

#### 0. Clone the utility repository: https://github.com/vidardb/util.git

#### 1. Install PostgreSQL. Run: `bash install-pg-madlib.sh install_pg`

#### 2. Create the chicago_taxi_trips database, then import the data. Run: `psql -U postgres -W -f import-data_chicago_taxi_trips.sql`

#### 3. Install the MADlib plugin into the chicago_taxi_trips database. Run: `bash install-pg-madlib.sh install_madlib`

## 1. Load the SQL extension and connect to the database

In [23]:
# Load the `sql` IPython extension so the %sql / %%sql magics used by the cells below are available.
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [11]:
# Select the database that the %sql / %%sql magics talk to.
# Two targets are listed; only the uncommented URL is used.

# Greenplum 4.3.10.0
#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib

# PostgreSQL local
# NOTE(review): the password is embedded in the URL in plain text. Presumably this is
# a throwaway local instance; confirm before sharing or committing the notebook.
%sql postgresql://postgres:postgres@localhost:5432/chicago_taxi_trips

'Connected: postgres@chicago_taxi_trips'

In [12]:
%sql select madlib.version();
#%sql select version();

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
1 rows affected.


version
"MADlib version: 1.16, git revision: unknown, cmake configuration time: Tue Jul 2 20:42:19 UTC 2019, build type: Release, build system: Linux-4.9.125-linuxkit, C compiler: gcc 7, C++ compiler: g++ 7"


## 2. Prepare data for MADlib

In [13]:
%%sql
-- Peek at the taxi id and the pickup coordinates of a few source rows.
SELECT
    taxi_id,
    pickup_latitude,
    pickup_longitude
FROM chicago_taxi_trips
LIMIT 5;

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
5 rows affected.


taxi_id,pickup_latitude,pickup_longitude
5400,688,206
1257,618,407
5998,64,231
2538,170,351
5856,767,733


In [14]:
%%sql
-- Recreate an empty table of integer ids paired with double precision vectors.
-- NOTE(review) no later cell visible in this notebook reads or writes mat, confirm it is still needed.
DROP TABLE IF EXISTS mat;
CREATE TABLE mat (
    id      INTEGER,
    row_vec DOUBLE PRECISION[]
);


 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
Done.
Done.


[]

In [15]:
%%sql
-- Show every trip column plus the pickup coordinates packed into one composite value.
SELECT
    *,
    ROW(pickup_latitude, pickup_longitude)
FROM chicago_taxi_trips
ORDER BY taxi_id
LIMIT 5;


 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
5 rows affected.


taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,row
3,2016-12-21 23:00:00,2016-12-21 23:00:00,360,1.6,,607,32.0,8.0,7.5,0.0,0.0,0.0,7.5,Cash,43,18,610,767,733,"(18,610)"
3,2016-12-03 02:00:00,2016-12-03 02:15:00,300,0.4,,225,8.0,8.0,5.0,0.0,0.0,0.0,5.0,Cash,43,170,351,754,410,"(170,351)"
7,2016-12-14 15:45:00,2016-12-14 16:30:00,2760,0.0,,959,76.0,8.0,45.75,12.4,0.0,4.0,62.15,Credit Card,109,225,6,167,754,"(225,6)"
7,2016-12-29 20:00:00,2016-12-29 20:15:00,660,0.7,,225,8.0,8.0,7.25,3.0,0.0,1.5,11.75,Credit Card,109,210,470,754,410,"(210,470)"
20,2016-12-20 23:15:00,2016-12-20 23:30:00,420,0.0,,792,8.0,8.0,6.75,0.0,0.0,1.0,7.75,Cash,107,454,453,419,615,"(454,453)"


In [16]:
%%sql
-- Stage the clustering inputs together with a generated row_id.
-- taxi_id repeats across trips, so it cannot serve as the per-row key.
DROP TABLE IF EXISTS t_source_change;

CREATE TABLE t_source_change (
    row_id           SERIAL,
    taxi_id          INT,
    pickup_latitude  DECIMAL(10, 2),
    pickup_longitude DECIMAL(10, 2)
);

-- row_id is left out of the column list so the serial default fills it in.
INSERT INTO t_source_change (taxi_id, pickup_latitude, pickup_longitude)
SELECT
    taxi_id,
    pickup_latitude,
    pickup_longitude
FROM chicago_taxi_trips;


 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
Done.
Done.
999 rows affected.


[]

In [17]:
%sql select * from t_source_change limit 5;

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
5 rows affected.


row_id,taxi_id,pickup_latitude,pickup_longitude
1,5400,688.0,206.0
2,1257,618.0,407.0
3,5998,64.0,231.0
4,2538,170.0,351.0
5,5856,767.0,733.0


In [18]:
%%sql
-- Build the k-means input table with one point per staged row, keyed by row_id.
DROP TABLE IF EXISTS km_sample;
CREATE TABLE km_sample (
    id      INTEGER,
    row_vec DOUBLE PRECISION[]
);

-- The target columns are named so the insert does not depend on column order.
-- The two-element point is built directly instead of concatenating two
-- single-element arrays, and the cast makes the decimal to double precision
-- conversion explicit.
INSERT INTO km_sample (id, row_vec)
SELECT
    row_id,
    CAST(ARRAY[pickup_latitude, pickup_longitude] AS DOUBLE PRECISION[])
FROM t_source_change;

-- Ordered by id so the preview is the same on every run.
SELECT
    id,
    row_vec
FROM km_sample
ORDER BY id
LIMIT 10;

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
Done.
Done.
999 rows affected.
10 rows affected.


id,row_vec
1,"[688.0, 206.0]"
2,"[618.0, 407.0]"
3,"[64.0, 231.0]"
4,"[170.0, 351.0]"
5,"[767.0, 733.0]"
6,"[294.0, 113.0]"
7,"[225.0, 6.0]"
8,"[618.0, 407.0]"
9,"[411.0, 545.0]"
10,"[18.0, 610.0]"


In [19]:
%%sql
DROP TABLE IF EXISTS km_result;

-- Fit k-means on the pickup points and keep the fitted model in km_result
CREATE TABLE km_result AS
SELECT
    centroids,
    cluster_variance,
    objective_fn,
    frac_reassigned,
    num_iterations
FROM madlib.kmeanspp(
    'km_sample',                  -- source table holding the points
    'row_vec',                    -- column with each point's coordinates
    5,                            -- number of centroids to compute
    'madlib.squared_dist_norm2',  -- distance function
    'madlib.avg',                 -- aggregate function
    20,                           -- number of iterations
    0.001                         -- fraction of centroids reassigned to keep iterating
);

-- Display the fitted model
SELECT
    centroids,
    cluster_variance,
    objective_fn,
    frac_reassigned,
    num_iterations
FROM km_result;


 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
Done.
1 rows affected.
1 rows affected.


centroids,cluster_variance,objective_fn,frac_reassigned,num_iterations
"[[726.239583333333, 425.940972222222], [421.256830601093, 554.431693989071], [56.5418181818182, 628.832727272727], [56.3877551020408, 267.142857142857], [235.705882352941, 258.509803921569]]","[9595950.4652778, 1795763.82513661, 1829624.57454545, 285355.632653061, 7578729.33333333]",21085423.8309463,0.0,4


## 3. Calculate the simplified silhouette coefficient:

In [20]:
%%sql
-- Score the clustering with the centroids stored in km_result.
SELECT simple_silhouette
FROM madlib.simple_silhouette(
    'km_sample',                         -- table of input points
    'row_vec',                           -- column holding each point
    (SELECT centroids FROM km_result),   -- centroids from the fitted model
    'madlib.dist_norm2'                  -- distance function
);

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
1 rows affected.


simple_silhouette
0.595379697575799


## 4. Find the cluster assignment for each point:

In [21]:
%%sql
-- Label each point with the index of its closest centroid and show the ten highest ids.
-- km_result holds a single model row, so the cross join attaches the centroids
-- to every point. It is written out explicitly, rather than as a comma join,
-- so the unconditioned pairing is visibly intentional.
SELECT
    pts.id,
    pts.row_vec,
    (madlib.closest_column(model.centroids, pts.row_vec)).column_id AS cluster_id
FROM km_sample AS pts
CROSS JOIN km_result AS model
ORDER BY pts.id DESC
LIMIT 10;

 * postgresql://postgres:***@localhost:5432/chicago_taxi_trips
10 rows affected.


id,row_vec,cluster_id
999,"[18.0, 610.0]",2
998,"[411.0, 545.0]",1
997,"[433.0, 757.0]",1
996,"[167.0, 754.0]",2
995,"[170.0, 351.0]",4
994,"[688.0, 206.0]",0
993,"[18.0, 610.0]",2
992,"[754.0, 410.0]",0
991,"[744.0, 605.0]",0
990,"[210.0, 470.0]",4
