In [25]:
import time
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans, DBSCAN
from sklearn.cluster import MiniBatchKMeans
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.io
import math
import json
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline  

import pandas as pd

In [26]:
def frange(start, stop, step):
    i = start
    while i < stop:
        yield i
        i += step

In [27]:
with open('data/tweets_1M.json','r') as f:
    tweets = json.load(f)

In [28]:
X = np.array([[tweets[x]['lat'],tweets[x]['lng']] for x in range(0, len(tweets))])
#100K subset
sample = 100000
total = len(X)
subset = X[0::int(total/sample)]

Part 1.1 K-means

In [7]:
# use this to get a n (that is close to 60 secs)
n = 370
## initialize with K-means++, a good way of speeding up convergence
k_means = KMeans(init='k-means++', n_clusters=n, n_init=10)
## record the current time
t_km = time.time()
# start clustering!
k_means.fit(X)
## get the time to finish clustering
t_fin_km = time.time() - t_km

KeyboardInterrupt: 

In [None]:
t_fin_km

In [8]:
#detect maximum k
# start with relatively reasonable n 
n = 35
t_fin_km = 0
while t_fin_km <= 60:
    print ('testing n equal to ' + str(n))
    ## initialize with K-means++, a good way of speeding up convergence
    k_means = KMeans(init='k-means++', n_clusters=n, n_init=10)
    ## record the current time
    t_km = time.time()
    # start clustering!
    k_means.fit(X)
    ## get the time to finish clustering
    t_fin_km = time.time() - t_km
    #
    print (t_fin_km)
    n += 1

#print (t_fin_km)
# max k = 380    

testing n equal to 35
54.9270977973938
testing n equal to 36
60.50568079948425


In [None]:
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
ft = (k_means_labels, k_means_cluster_centers, k_means_labels_unique)

Part 1.2 MiniBatch k-means

In [31]:
n = 850
perc = 0.01
t_mini_batch = 0
batch_size=int(len(X)*perc)
while t_mini_batch <= 60:
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=n, batch_size=batch_size,
                          n_init=10, max_no_improvement=10, verbose=0)
    t0 = time.time()
    mbk.fit(X)
    t_mini_batch = time.time() - t0
    n += 100
    print (n, t_mini_batch)

950 38.39891600608826
1050 42.738019943237305
1150 44.84724998474121
1250 55.631016969680786
1350 60.259560108184814


In [None]:
# picked a relatively ideal perc based on previous calculation; not entirely sure why...
for perc in [0.1]:
    print ('perc is '+ str(perc))
    batch_size=int(len(X)*perc)
    n = 100
    t_mini_batch = 0
    while t_mini_batch <= 60:
        print ('testing n equal to ' + str(n))
        mbk = MiniBatchKMeans(init='k-means++', n_clusters=n, batch_size=batch_size,
                            n_init=10, max_no_improvement=10, verbose=0)
        t0 = time.time()
        mbk.fit(X)
        t_mini_batch = time.time() - t0
        n += 10
        print (t_mini_batch)
# max n = 1490

In [None]:
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

part 1.3 DBSCAN

In [None]:
import utm

In [None]:
for n in range(0, len(X)):
    meters = utm.from_latlon(X[n][0],X[n][1])
    X[n][0] = meters[0]
    X[n][1] = meters[1]

In [None]:
#recordedtime = {}
ncluster = {}
for eps in frange(0.0005, 0.004, 0.0005):
    
    t_db = time.time()
    db = DBSCAN(eps=eps, min_samples=100).fit(X)
    t_fin_db = time.time() - t_db
    
    db_labels = db.labels_
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    n_clusters_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)
    ncluster[eps] = n_clusters_
    #db_labels_unique = np.unique(db_labels)
    #recordedtime[eps] = t_fin_db

In [None]:
t_db = time.time()
db = DBSCAN(eps=0.001, min_samples=100).fit(X_meter)
t_fin_db = time.time() - t_db

#array of numbers, one number represents one cluster
db_labels = db.labels_
db_labels_unique = set(db_labels)
# minus if there are unclustered noises
n_clusters_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)
db_labels_unique = np.unique(db_labels)


In [None]:
X = np.array([[tweets[x]['lat'],tweets[x]['lng']] for x in range(0, len(tweets))])

In [None]:
# get colors and plot all the points, color-coded by cluster (or gray if not in any cluster, aka noise)
fig, ax = plt.subplots(figsize=[8,10])
ax.set_xlim([-130, -112])
ax.set_ylim([32, 44])
colors = plt.cm.rainbow(np.linspace(0, 1, len(db_labels_unique)))

# for each cluster label and color, plot the cluster's points
for db_label, color in zip(db_labels_unique, colors):
    
    size = 10
    if db_label == -1: #make the noise (which is labeled -1) appear as smaller gray points
        color = 'gray'
        size = 3
        alpha=0.01
    
    # plot the points that match the current cluster label
    x_coords = X[db_labels==db_label][:,1]
    y_coords = X[db_labels==db_label][:,0]
    ax.scatter(x=x_coords, y=y_coords, c=color, edgecolor='', s=size, alpha=0.5)

ax.set_title('Number of clusters: {}'.format(n_clusters_)) #string concatenation
plt.show()

Part 2. Clustering: Scalability 

In [None]:
records = {}
for n in range(100, 100100, 100):
    
    sample = n
    total = len(X)
    subset = X[0::int(total/sample)]
    
    ## initialize with K-means++, a good way of speeding up convergence
    k_means = KMeans(init='k-means++', n_clusters=100, n_init=10)
    ## record the current time
    t_km = time.time()
    # start clustering!
    k_means.fit(subset)
    ## get the time to finish clustering
    t_fin_km = time.time() - t_km
    
    records[sample] = t_fin_km

In [None]:
records_df = pd.DataFrame.from_dict(records, orient = 'index')
records_df.to_csv('data/part2_1.csv')

In [None]:
fig, ax = plt.subplots(figsize=[8,10])
ax.scatter(x=list(records.keys()), y=list(records.values()))
#plt.savefig('part2.pdf')

In [None]:
#K-means....need to rerun
#Number of requested clusters k (consider the range of 2 to the k_max)
k_max = 380
records = {}
for n in range(2, k_max+1):

    ## initialize with K-means++, a good way of speeding up convergence
    k_means = KMeans(init='k-means++', n_clusters=n, n_init=10)
    ## record the current time
    t_km = time.time()
    # start clustering!
    k_means.fit(X)
    ## get the time to finish clustering
    t_fin_km = time.time() - t_km
    
    records[n] = t_fin_km

In [None]:
records_df = pd.DataFrame.from_dict(records, orient = 'index')
records_df.to_csv('data/part2_1_b_Kmeans.csv')

In [None]:
fig, ax = plt.subplots(figsize=[8,10])
ax.scatter(x=list(records.keys()), y=list(records.values()))
#plt.savefig('part2_.pdf')

In [None]:
#mini batch k_means

In [15]:
records = {}
for n in range(2000, 101000, 1000):
    
    print ('begin ' + str(n))
    sample = n
    total = len(X)
    subset = X[0::int(total/sample)]

    perc=0.0005
    batch_size=int(len(subset)*perc)

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=100, batch_size=batch_size,
                          n_init=10, max_no_improvement=10, verbose=0)
    t0 = time.time()
    mbk.fit(subset)
    t_mini_batch = time.time() - t0
    
    records[n] = t_mini_batch

begin 2000
begin 3000
begin 4000

  init_size=init_size)
  init_size=init_size)



begin 5000
begin 6000
begin 7000
begin 8000

  init_size=init_size)
  init_size=init_size)



begin 9000
begin 10000
begin 11000
begin 12000

  init_size=init_size)
  init_size=init_size)



begin 13000
begin 14000
begin 15000
begin 16000

  init_size=init_size)
  init_size=init_size)



begin 17000
begin 18000
begin 19000
begin 20000

  init_size=init_size)
  init_size=init_size)



begin 21000
begin 22000
begin 23000
begin 24000

  init_size=init_size)
  init_size=init_size)



begin 25000
begin 26000
begin 27000
begin 28000

  init_size=init_size)
  init_size=init_size)



begin 29000
begin 30000
begin 31000
begin 32000

  init_size=init_size)
  init_size=init_size)



begin 33000
begin 34000
begin 35000
begin 36000

  init_size=init_size)
  init_size=init_size)



begin 37000
begin 38000
begin 39000

  init_size=init_size)
  init_size=init_size)



begin 40000
begin 41000
begin 42000
begin 43000
begin 44000

  init_size=init_size)
  init_size=init_size)



begin 45000
begin 46000
begin 47000
begin 48000

  init_size=init_size)
  init_size=init_size)



begin 49000
begin 50000
begin 51000
begin 52000
begin 53000

  init_size=init_size)
  init_size=init_size)



begin 54000
begin 55000
begin 56000
begin 57000
begin 58000
begin 59000

  init_size=init_size)
  init_size=init_size)



begin 60000
begin 61000
begin 62000
begin 63000
begin 64000
begin 65000
begin 66000
begin 67000
begin 68000
begin 69000
begin 70000
begin 71000
begin 72000
begin 73000
begin 74000
begin 75000
begin 76000
begin 77000
begin 78000
begin 79000
begin 80000
begin 81000
begin 82000
begin 83000
begin 84000
begin 85000
begin 86000
begin 87000
begin 88000
begin 89000
begin 90000
begin 91000
begin 92000
begin 93000
begin 94000
begin 95000
begin 96000
begin 97000
begin 98000
begin 99000
begin 100000


  init_size=init_size)


In [16]:
records_df = pd.DataFrame.from_dict(records, orient = 'index')
records_df.to_csv('data/part2_1_a_MiniBatchK_0.0005.csv')

In [None]:
fig, ax = plt.subplots(figsize=[8,10])
ax.scatter(x=list(records.keys()), y=list(records.values()))
plt.savefig('part2_1_a_minibatch.pdf')

In [32]:
# minibatch kmeans
#Number of requested clusters k (consider the range of 2 to the k_max)
k_max = 1350
records = {}
for n in range(2, k_max+1, 100):

    perc=0.01
    batch_size=int(len(X)*perc)
    
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=n, batch_size=batch_size,
                          n_init=10, max_no_improvement=10, verbose=0)
    t0 = time.time()
    mbk.fit(X)
    t_mini_batch = time.time() - t0
    
    records[n] = t_mini_batch

In [33]:
records_df = pd.DataFrame.from_dict(records, orient = 'index')
records_df.to_csv('data/part21MiniBatchKCluster_0.01.csv')

In [None]:
fig, ax = plt.subplots(figsize=[8,10])
ax.scatter(x=list(records.keys()), y=list(records.values()))
plt.savefig('part2_1_b_minibatch.pdf')

In [None]:
list(y)

In [None]:
recordedtime = {}
#ncluster = {}
for n in range(100, 100100, 100):
    eps=0.001
    
    sample = n
    total = len(X)
    subset = X[0::int(total/sample)]
    
    t_db = time.time()
    db = DBSCAN(eps=eps, min_samples=100).fit(subset)
    t_fin_db = time.time() - t_db
    
    recordedtime[n] = t_fin_db

In [None]:
recordedtime_df = pd.DataFrame.from_dict(recordedtime, orient = 'index')
recordedtime_df.to_csv('data/part2_2.csv')

In [None]:
fig, ax = plt.subplots(figsize=[8,10])
ax.scatter(x=list(recordedtime.keys()), y=list(recordedtime.values()))
plt.savefig('part2_2.pdf')