# Размещение баннеров

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift
from geopy.distance import great_circle

## Load data

In [2]:
# Load the raw check-in dump. The file is pipe-delimited; its first two lines
# are skipped and the column names are supplied here instead. A field that is
# exactly one space is treated as missing.
df = pd.read_csv(
    'data/checkins.dat',
    sep='|',
    skiprows=2,
    na_values=' ',
    names=['id', 'user_id', 'venue_id', 'latitude', 'longitude', 'created_at'],
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984301,2041916.0,5222.0,,,2012-04-21 17:39:01
1,984222,15824.0,5222.0,38.8951118,-77.0363658,2012-04-21 17:43:47
2,984315,1764391.0,5222.0,,,2012-04-21 17:37:18
3,984234,44652.0,5222.0,33.800745,-84.41052,2012-04-21 17:43:43
4,984249,2146840.0,5222.0,,,2012-04-21 17:42:58


## Transforming data

In [4]:
# Coordinates may have been read as text containing spaces, so render both
# columns as strings and strip every space. A row is kept only when neither
# coordinate is empty or 'nan' (the text form of a missing value); the
# surviving values are then converted to float64.
coord_text = df[['latitude', 'longitude']].astype(str).apply(
    lambda col: col.str.replace(' ', '')
)
has_coords = ~coord_text.isin(['', 'nan']).any(axis=1)
df = df.loc[has_coords].assign(
    latitude=coord_text.loc[has_coords, 'latitude'].astype('float64'),
    longitude=coord_text.loc[has_coords, 'longitude'].astype('float64'),
)

In [5]:
# Row count after dropping check-ins without coordinates.
df.shape[0]

396634

In [6]:
# Preview the cleaned frame: latitude/longitude are now float columns.
df.head(n=5)

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
1,984222,15824.0,5222.0,38.895112,-77.036366,2012-04-21 17:43:47
3,984234,44652.0,5222.0,33.800745,-84.41052,2012-04-21 17:43:43
7,984291,105054.0,5222.0,45.523452,-122.676207,2012-04-21 17:39:22
9,984318,2146539.0,5222.0,40.764462,-111.904565,2012-04-21 17:35:46
10,984232,93870.0,380645.0,33.448377,-112.074037,2012-04-21 17:38:18


## Clustering

### Subset first 100k rows

In [9]:
# Clustering input: only the two coordinate columns, limited to the first
# 100,000 rows (by position) of the cleaned frame.
x = df[['latitude', 'longitude']].head(100000)

### Train MeanShift

In [10]:
# Fit on the two coordinate columns explicitly. A later cell adds a `labels`
# column to `x`; selecting the columns here keeps this cell safe to re-run
# afterwards (otherwise the labels would be clustered as a third feature).
# bandwidth=0.1 is expressed in the units of the inputs, i.e. degrees of
# latitude/longitude.
ms = MeanShift(bandwidth=0.1)
ms.fit(x[['latitude', 'longitude']])

MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

### Get labels and clusters' centers

In [11]:
# Pull the fitted results off the model, tag every clustered row with the id
# of the cluster it was assigned to, and preview the outcome.
cluster_centers, labels = ms.cluster_centers_, ms.labels_
x['labels'] = labels
x.head(n=5)

Unnamed: 0,latitude,longitude,labels
1,38.895112,-77.036366,5
3,33.800745,-84.41052,7
7,45.523452,-122.676207,30
9,40.764462,-111.904565,66
10,33.448377,-112.074037,1


### Counting unique clusters

In [12]:
# Number of distinct cluster ids produced by the fit.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size
n_clusters_

3229

### Subset clusters with more than 15 items

In [13]:
# idx maps each cluster label to the number of check-ins assigned to it.
idx = x['labels'].value_counts()
# Keep only the labels of clusters with more than 15 check-ins.
good_labels = idx.index[idx > 15]
# Labels are row positions in the fitted model's centre array. Index the
# model's array rather than `cluster_centers` itself so that re-running this
# cell does not filter an already-filtered array.
cluster_centers = ms.cluster_centers_[good_labels]

### Add office locations

In [14]:
# Office coordinates, one (latitude, longitude) pair per row.
office = np.array([
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
])

### Calculate distance

In [15]:
# One [distance_in_miles, cluster_center] entry for every (cluster center,
# office) pair, in the same order as nested loops over centers then offices.
store_dist = [
    [great_circle(center, site).miles, center]
    for center in cluster_centers
    for site in office
]

### Sort by miles from offices

In [16]:
# Order by distance only. Each entry is [miles, ndarray]; the default list
# ordering would fall through to comparing the arrays whenever two distances
# tie, and that comparison raises ValueError (ambiguous truth value).
store_dist.sort(key=lambda entry: entry[0])
# Nearest (distance, cluster center) pair.
store_dist[0]

[0.5109844004230676, array([ 52.37296399,   4.89231722])]

### Write answer

In [20]:
# store_dist is sorted by ascending distance, so index 0 is the cluster center
# closest to any office (index 1 would be the runner-up). The context manager
# closes the file even if the write fails.
with open("answer.txt", "w") as text_file:
    text_file.write(str(store_dist[0]))