# Размещение баннеров

In [203]:
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift
from geopy.distance import great_circle

## Load data

In [170]:
# Load the pipe-delimited check-in dump. The first two lines of the file are
# skipped and explicit column names are supplied; a field consisting of a
# single space is treated as missing.
checkin_columns = ['id', 'user_id', 'venue_id', 'latitude', 'longitude', 'created_at']
df = pd.read_csv(
    "data/checkins.dat",
    sep='|',
    skiprows=2,
    names=checkin_columns,
    na_values=' ',
)

In [171]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984301,2041916.0,5222.0,,,2012-04-21 17:39:01
1,984222,15824.0,5222.0,38.8951118,-77.0363658,2012-04-21 17:43:47
2,984315,1764391.0,5222.0,,,2012-04-21 17:37:18
3,984234,44652.0,5222.0,33.800745,-84.41052,2012-04-21 17:43:43
4,984249,2146840.0,5222.0,,,2012-04-21 17:42:58


## Transforming data

In [174]:
# Clean the coordinate columns and drop rows that lack either coordinate.
#
# Each column is rendered as text, all spaces are removed, blank fields are
# mapped to 'nan' and the result is parsed as float64; rows with a missing
# latitude or longitude are then dropped. `assign` builds a new frame, so no
# values are written into a filtered slice of the original (the source of
# pandas' SettingWithCopyWarning).
coordinate_columns = ['latitude', 'longitude']
cleaned_coordinates = {
    column: (
        df[column]
        .astype(str)
        .str.replace(' ', '')
        .replace('', 'nan')
        .astype('float64')
    )
    for column in coordinate_columns
}
df = df.assign(**cleaned_coordinates).dropna(subset=coordinate_columns)

In [177]:
# Number of rows left after dropping check-ins without coordinates.
df.shape[0]

396634

In [178]:
# Preview the cleaned frame.
# NOTE(review): the saved output below shows `created_at` with its spaces and
# first character removed (e.g. `012-04-2117:43:47`), which nothing in the
# visible transform cell does - presumably stale output from an earlier
# version of that cell; re-run to confirm.
df.head()

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
1,984222,15824.0,5222.0,38.895112,-77.036366,012-04-2117:43:47
3,984234,44652.0,5222.0,33.800745,-84.41052,012-04-2117:43:43
7,984291,105054.0,5222.0,45.523452,-122.676207,012-04-2117:39:22
9,984318,2146539.0,5222.0,40.764462,-111.904565,012-04-2117:35:46
10,984232,93870.0,380645.0,33.448377,-112.074037,012-04-2117:38:18


## Clustering

### Subset first 100k rows

In [None]:
# Work on the first 100k coordinate rows. An explicit copy is taken because a
# `labels` column is added to `x` later; without it that assignment targets a
# slice of `df` and triggers pandas' SettingWithCopyWarning.
x = df[['latitude', 'longitude']].iloc[:100000].copy()

### Train MeanShift

In [184]:
# Fit mean-shift on the two coordinate columns only. A later cell adds a
# `labels` column to `x`; selecting the columns explicitly keeps this cell
# re-runnable without that column leaking into the features.
ms = MeanShift(bandwidth=0.1)
ms.fit(x[['latitude', 'longitude']])

MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

### Get labels and clusters' centers

In [207]:
# Pull the per-row cluster labels and the cluster centres out of the fitted
# model, then attach the labels to the coordinates.
labels = ms.labels_
cluster_centers = ms.cluster_centers_
# `assign` returns a new frame instead of writing a column into `x` in place,
# which avoids SettingWithCopyWarning when `x` is a slice of `df`.
x = x.assign(labels=labels)
x.head()

Unnamed: 0,latitude,longitude,labels
1,38.895112,-77.036366,5
3,33.800745,-84.41052,7
7,45.523452,-122.676207,30
9,40.764462,-111.904565,66
10,33.448377,-112.074037,1


### Counting unique clusters

In [208]:
# Distinct cluster labels and how many of them there are.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size
n_clusters_

3229

### Subset clusters with more than 15 items

In [None]:
# Keep only the centres of clusters with more than 15 members.
# The original cell stored the counts in `idc` but read them back as `idx`,
# which raises NameError; one name is used throughout here.
idx = x['labels'].value_counts()
good_labels = idx.index[idx > 15]
# Index the model's own centre array rather than the already-filtered
# `cluster_centers`, so re-running this cell does not filter twice.
# Row order follows `value_counts`, i.e. largest cluster first.
cluster_centers = ms.cluster_centers_[good_labels]

### Add offices locations

In [210]:
# Office locations, one [latitude, longitude] pair per row.
office = np.array([
    [33.751277, -118.188740],
    [25.867736, -80.324116],
    [51.503016, -0.075479],
    [52.378894, 4.885084],
    [39.366487, 117.036146],
    [-33.868457, 151.205134],
])

### Calculate distance

In [217]:
# One [distance in miles, cluster centre] entry for every
# (cluster centre, office) pair, centres in the outer position.
store_dist = [
    [great_circle(center, office_location).miles, center]
    for center in cluster_centers
    for office_location in office
]

### Sort by miles from offices

In [254]:
# Order the entries by distance only. A bare `sort()` compares whole
# [miles, centre] lists, so two equal distances would fall through to comparing
# the numpy centre arrays and raise "truth value of an array is ambiguous".
store_dist.sort(key=lambda entry: entry[0])

### Select the twenty closest locations and sort them by latitude

In [248]:
# Take the centres of the 20 nearest entries (`store_dist` must already be
# sorted by distance) and order them by latitude, the first coordinate.
# The loop variables are deliberately not called `x`: that name holds the
# coordinates DataFrame, and on Python 2 a list-comprehension variable leaks
# into the enclosing scope and would overwrite it.
answer = [entry[1] for entry in store_dist[:20]]
answer.sort(key=lambda center: center[0])
answer

[array([ -33.86063043,  151.20477593]),
 array([ 25.70534972, -80.28342874]),
 array([ 25.78581242, -80.21793804]),
 array([ 25.84567226, -80.3188906 ]),
 array([ 26.01009825, -80.19999059]),
 array([ 26.12086266, -80.15890668]),
 array([ 26.13884379, -80.33434684]),
 array([ 26.20058464, -80.25071613]),
 array([  33.65089599, -117.75207397]),
 array([  33.67430266, -117.85878927]),
 array([  33.80987796, -118.14892381]),
 array([  33.81730643, -117.89124917]),
 array([  33.87298601, -118.36209115]),
 array([  33.88832534, -118.04892817]),
 array([  33.97257482, -118.16837067]),
 array([  33.98393587, -118.00740497]),
 array([  34.03548695, -118.43899772]),
 array([  34.06039755, -118.24870903]),
 array([ 51.50299126,  -0.12553729]),
 array([ 52.37296399,   4.89231722])]

### Write answer

In [249]:
# Flatten the selected centres into one space-separated string
# "lat lon lat lon ...". Iterating over `answer` directly (instead of fixed
# range(20) / range(2) indices) avoids IndexError when fewer centres are
# available, and `join` replaces repeated string concatenation.
# Unlike the original, the result has no leading space; later cells only use
# `final.strip()`, which is unchanged.
final = ' '.join(str(coordinate) for center in answer for coordinate in center)

In [250]:
# Write the answer string to disk. The context manager closes the file even
# if the write raises.
with open("answer.txt", "w") as text_file:
    text_file.write(final.strip())

In [252]:
# Display the exact string that was written to answer.txt.
final.strip()

'-33.8606304286 151.204775929 25.7053497211 -80.2834287382 25.78581242 -80.2179380368 25.8456722643 -80.3188905964 26.0100982493 -80.1999905857 26.1208626586 -80.1589066802 26.1388437868 -80.3343468368 26.200584641 -80.2507161256 33.6508959929 -117.752073973 33.6743026598 -117.858789268 33.8098779553 -118.148923807 33.8173064339 -117.891249171 33.8729860116 -118.362091147 33.8883253428 -118.048928172 33.9725748214 -118.168370667 33.983935874 -118.007404973 34.0354869531 -118.438997719 34.0603975546 -118.248709027 51.5029912609 -0.12553728871 52.3729639903 4.89231722258'