### Read-in downloaded data

In [1]:
import pandas as pd

In [2]:
## 2021-09-16

In [3]:
data = pd.read_csv("0e9cd673-8e06-4810-b370-8b9930ee312a.csv", index_col = False)

In [4]:
data.head()

Unnamed: 0,viewer_id,broadcaster_id,viewer_age,viewer_gender,viewer_longitude,viewer_latitude,viewer_lang,viewer_country,broadcaster_age,broadcaster_gender,broadcaster_longitude,broadcaster_latitude,broadcaster_lang,broadcaster_country,duration,viewer_network,broadcaster_network,count
0,pof:333765857,pof:306180728,30.0,male,-96.7,32.9,en,US,37.0,female,-90.0,35.0,en,US,11891,pof,pof,7
1,meetme:277560515,meetme:234128276,40.0,female,151.2071,-33.8671,en,US,33.0,male,-118.3801,34.0939,en,US,173,meetme,meetme,2
2,pof:290015550,meetme:318374085,25.0,male,-82.4,29.6,en,US,43.0,female,-82.5995,29.6476,en,US,248,pof,meetme,2
3,meetme:57803631,meetme:272213699,24.0,male,-77.6036,-9.3225,es,ES,33.0,female,-71.349,-33.0503,es,CL,10544,meetme,meetme,3
4,meetme:177581880,skout:178541757,39.0,male,-95.5769,30.724,en,US,36.0,female,-117.210999,34.132999,en,US,98,meetme,skout,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5243852 entries, 0 to 5243851
Data columns (total 18 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   viewer_id              object 
 1   broadcaster_id         object 
 2   viewer_age             float64
 3   viewer_gender          object 
 4   viewer_longitude       float64
 5   viewer_latitude        float64
 6   viewer_lang            object 
 7   viewer_country         object 
 8   broadcaster_age        float64
 9   broadcaster_gender     object 
 10  broadcaster_longitude  float64
 11  broadcaster_latitude   float64
 12  broadcaster_lang       object 
 13  broadcaster_country    object 
 14  duration               int64  
 15  viewer_network         object 
 16  broadcaster_network    object 
 17  count                  int64  
dtypes: float64(6), int64(2), object(10)
memory usage: 720.1+ MB


In [6]:
import os
import tempfile
from typing import Dict, Text
import pprint 

In [7]:
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [8]:
data.shape

(5243852, 18)

### Clearning, transforming, and preparing the data

In [9]:
df_geo = data[['broadcaster_latitude', 'broadcaster_longitude']]

In [10]:
df_geo.head()

Unnamed: 0,broadcaster_latitude,broadcaster_longitude
0,35.0,-90.0
1,34.0939,-118.3801
2,29.6476,-82.5995
3,-33.0503,-71.349
4,34.132999,-117.210999


In [11]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5243852 entries, 0 to 5243851
Data columns (total 2 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   broadcaster_latitude   float64
 1   broadcaster_longitude  float64
dtypes: float64(2)
memory usage: 80.0 MB


In [12]:
missing_values = df_geo.isnull().values.any()
missing_values

False

In [13]:
df_geo[df_geo.isnull().any(axis=1)]

Unnamed: 0,broadcaster_latitude,broadcaster_longitude


In [14]:
data_train = df_geo.values.astype('float32')

In [15]:
data_train

array([[  35.    ,  -90.    ],
       [  34.0939, -118.3801],
       [  29.6476,  -82.5995],
       ...,
       [  40.8   ,  -73.9   ],
       [  28.5883,  -81.3312],
       [  30.2   ,  -81.5   ]], dtype=float32)

In [16]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [21]:
from datetime import datetime

In [22]:
kmeans = KMeans(n_clusters = 10, verbose = 0, n_init=32, n_jobs=1, random_state=2021)

In [31]:
job_name = "kmeans-geo-job{}".format(datetime.now().strftime('%Y%m%d%H%M%S'))
job_name

'kmeans-geo-job20210916180243'

In [33]:
# %%time
# kmeans.fit(kmeans.record_set(data_train), jobname=job_name)

In [34]:
kmeans.fit(data_train)



KMeans(n_clusters=10, n_init=32, n_jobs=1, random_state=2021)

In [35]:
labels = kmeans.labels_

In [36]:
labels

array([0, 6, 5, ..., 5, 5, 5], dtype=int32)

In [39]:
df_geo['label'] = labels
data = df_geo.groupby(['label']).mean().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [40]:
data

Unnamed: 0,broadcaster_latitude,broadcaster_longitude
0,33.415877,-95.937398
1,23.41483,77.411977
2,50.518293,-0.999632
3,-22.635131,-49.617258
4,12.772511,115.607486
5,37.788638,-79.562197
6,36.809697,-119.097408
7,-33.517721,148.077296
8,7.586199,-74.437513
9,30.286964,38.766805


In [41]:
data.to_csv("centroid.csv", index=False)