In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geopy.geocoders import ArcGIS

%matplotlib inline

**warning**: este notebook usa un servicio de geocoding y tarda **mucho** en correr. Para facilitar las cosas guardo los datos obtenidos en `geo.csv` y hago el análisis en un notebook separado (`analyze_location.ipynb`).

In [2]:
# Load only the columns needed for geocoding, drop tweets without a location
# and normalise the location text to lower case. Everything is done in one
# chain so no column is assigned on a filtered slice (which raises
# SettingWithCopyWarning).
data = (
    pd.read_csv('../dataset/train.csv', index_col=0, usecols=['id', 'location', 'target'])
    .dropna(subset=['location'])
    .assign(location=lambda frame: frame['location'].str.lower())
)
data

Unnamed: 0_level_0,location,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
48,birmingham,1
49,est. september 2012 - bristol,0
50,africa,1
52,"philadelphia, pa",0
53,"london, uk",0
...,...,...
10826,tn,0
10829,#newcastleupontyne #uk,0
10831,"vancouver, canada",0
10832,london,0


### Coordenadas
Se puede ver que algunas ubicaciones son coordenadas. Trato de identificarlas y buscar sus datos.

In [3]:
def to_coord(loc):
    """Try to read a location string as a ``latitude, longitude`` pair.

    The string must contain exactly one comma. The first number found on each
    side of the comma (optional leading ``-``, digits and dots) is taken as the
    latitude and the longitude respectively; any surrounding text is ignored.

    Parameters
    ----------
    loc : str
        Raw location text.

    Returns
    -------
    tuple of (float, float) or float
        ``(lat, lon)`` when both numbers parse and satisfy ``-90 < lat < 90``
        and ``-180 < lon < 180``; ``np.nan`` otherwise.
    """
    number = r'-?[\d\.]+'
    try:
        lat_text, lon_text = loc.split(',')
        lat = float(re.findall(number, lat_text)[0])
        if not (-90 < lat < 90):
            return np.nan
        lon = float(re.findall(number, lon_text)[0])
        if not (-180 < lon < 180):
            return np.nan
        return lat, lon
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing failures mean "not a coordinate": a non-string input,
        # a wrong number of commas, no number on one side, or a malformed
        # number. Anything else (e.g. KeyboardInterrupt) must propagate.
        return np.nan

# Parse every location and keep only the rows that were recognised as a
# coordinate pair (unparseable locations come back as NaN).
parsed_locations = data['location'].map(to_coord)
coords = parsed_locations[parsed_locations.notna()]
coords

id
196                       (19.600858, -99.047821)
1350                      (30.307558, -81.403118)
1433                      (39.982988, -75.261624)
1973                     (-27.499212, 153.011072)
2460                      (-6.152261, 106.775995)
2499                      (41.252426, -96.072013)
2568                     (21.462446, -158.022017)
2616                       (52.479722, 62.184971)
2984                      (-26.695807, 27.837865)
3101     (10.614817868480726, 12.195582811791382)
3114                        (46.950109, 7.439469)
3256                                   (0.0, 0.0)
3389                      (36.142163, -95.979189)
3402                      (40.736324, -73.990062)
3569                      (33.209923, -87.545328)
3725                            (6.4682, 3.18287)
3787                       (19.123127, 72.825133)
4065                        (48.870833, 2.399227)
4847                       (43.631838, -79.55807)
4849                         (44.41451, 8.94249

In [4]:
# The ArcGIS account credentials are read from the environment instead of being
# committed with the notebook. Set ARCGIS_USERNAME and ARCGIS_PASSWORD before
# starting the kernel; a missing variable raises KeyError here.
geocoder = ArcGIS(
    timeout=500,
    user_agent='tp1_datos_1c2020',
    username=os.environ['ARCGIS_USERNAME'],
    password=os.environ['ARCGIS_PASSWORD'],
    referer='www.datos_1c2020.com',
)

In [5]:
def geo_info_by_coords(lat, lon, geocoder):
    """Reverse-geocode a coordinate pair into country and city.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude to look up.
    geocoder : object
        Geocoder exposing ``reverse(query=...)``; the result is expected to
        carry the service payload in ``.raw``.

    Returns
    -------
    dict
        Keys ``lat``, ``lon``, ``country``, ``city`` and ``match_score``.
        ``lat``/``lon`` are always the values passed in. When the lookup
        yields a result ``match_score`` is 100 and ``country``/``city`` come
        from the ``CountryCode``/``City`` fields (NaN if a field is absent).
        When the lookup yields no result, ``country`` and ``city`` are NaN and
        ``match_score`` is 0, mirroring the no-match record of
        ``geo_info_by_string``.
    """
    response = geocoder.reverse(query=f'{lat},{lon}')
    # A missing result (None) or one without a payload is treated as no match
    # instead of crashing the whole batch on a single bad point.
    raw = getattr(response, 'raw', None)
    if not raw:
        return {
            'lat': lat,
            'lon': lon,
            'country': np.nan,
            'city': np.nan,
            'match_score': 0,
        }
    return {
        'lat': lat,
        'lon': lon,
        'country': raw.get('CountryCode', np.nan),
        'city': raw.get('City', np.nan),
        'match_score': 100,
    }

In [6]:
# One reverse-geocoding request per parsed coordinate pair; the resulting
# records keep the tweet ids of `coords` as their index.
coord_records = [geo_info_by_coords(lat, lon, geocoder) for lat, lon in coords]
coords_info = pd.DataFrame(coord_records, index=coords.index)
coords_info

Unnamed: 0_level_0,lat,lon,country,city,match_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
196,19.600858,-99.047821,MEX,Ecatepec de Morelos,100
1350,30.307558,-81.403118,USA,Neptune Beach,100
1433,39.982988,-75.261624,USA,Philadelphia,100
1973,-27.499212,153.011072,AUS,Brisbane,100
2460,-6.152261,106.775995,IDN,Grogol Petamburan,100
2499,41.252426,-96.072013,USA,Omaha,100
2568,21.462446,-158.022017,USA,Mililani,100
2616,52.479722,62.184971,KAZ,Денисов ауданы,100
2984,-26.695807,27.837865,ZAF,Emfuleni,100
3101,10.614818,12.195583,NGA,Biu,100


### Dirección
El resto de las ubicaciones se buscan como string

In [7]:
def suspicious(name):
    """Return True when `name` contains any of a fixed set of marker substrings.

    The markers are symbols, URL fragments and a few words; a location
    containing one of them is filtered out before geocoding. The check is a
    plain case-sensitive substring test.
    """
    markers = (
        '#', '?', 'narnia', 'world', 'where', 'here', 'there', 'slytherin',
        '@', 'www', 'http', 'wonderland', 'home', 'webcam', 'house',
    )
    return any(marker in name for marker in markers)

In [8]:
# Attach the parsed coordinates to the frame (aligned on tweet id), then keep
# the locations that have no coordinates and do not look suspicious; these are
# the ones geocoded as free text.
data['coords'] = coords
without_coords = data.loc[data['coords'].isnull(), 'location']
flagged = without_coords.map(suspicious).astype(bool)
names = without_coords[~flagged]
names

id
48                          birmingham
49       est. september 2012 - bristol
50                              africa
52                    philadelphia, pa
53                          london, uk
                     ...              
10825                           global
10826                               tn
10831                vancouver, canada
10832                          london 
10833                          lincoln
Name: location, Length: 4631, dtype: object

In [9]:
def geo_info_by_string(query, geocoder):
    """Geocode a free-text location into coordinates, country, city and score.

    Parameters
    ----------
    query : str
        Location text to look up.
    geocoder : object
        Geocoder exposing ``geocode(query=..., out_fields=...)``.

    Returns
    -------
    dict
        Keys ``lat``, ``lon``, ``country``, ``city`` and ``match_score``.
        If the result lacks the expected attributes (for instance because it
        is ``None``) or has no ``'attributes'`` entry in ``raw``, every field
        is NaN and ``match_score`` is 0.
    """
    no_match = {
        'lat': np.nan,
        'lon': np.nan,
        'country': np.nan,
        'city': np.nan,
        'match_score': 0,
    }
    result = geocoder.geocode(query=query, out_fields=['Country', 'Score', 'City'])
    try:
        point = result.point
        attributes = result.raw['attributes']
        found = {
            'lat': point.latitude,
            'lon': point.longitude,
            'country': attributes.get('Country', np.nan),
            'city': attributes.get('City', np.nan),
            'match_score': attributes.get('Score', np.nan),
        }
    except (KeyError, AttributeError):
        return no_match
    return found

In [10]:
# One geocoding request per remaining location string; the resulting records
# keep the tweet ids of `names` as their index.
name_records = [geo_info_by_string(location, geocoder) for location in names]
names_info = pd.DataFrame(name_records, index=names.index)
names_info

Unnamed: 0_level_0,lat,lon,country,city,match_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
48,52.478910,-1.905920,GBR,Birmingham,100.0
49,51.453790,-2.591680,GBR,Bristol,73.0
50,7.188100,21.093750,,,100.0
52,39.952220,-75.162180,USA,Philadelphia,100.0
53,51.506420,-0.127210,GBR,London,100.0
...,...,...,...,...,...
10825,-25.109990,30.752180,ZAF,Thaba Chweu,100.0
10826,34.116318,9.608516,TUN,,100.0
10831,49.260380,-123.113360,CAN,Vancouver,100.0
10832,51.506420,-0.127210,GBR,London,100.0


In [11]:
# Combine the text-geocoded and coordinate-geocoded records, attach the
# original location string (aligned on tweet id) and save the result.
# The frame is named `geo_info` rather than `all` so the built-in `all()` is
# not shadowed for the rest of the kernel session.
geo_info = pd.concat([names_info, coords_info])
geo_info['location_string'] = data['location']
geo_info.to_csv('../dataset/geo.csv')
geo_info

Unnamed: 0_level_0,lat,lon,country,city,match_score,location_string
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
48,52.478910,-1.905920,GBR,Birmingham,100.0,birmingham
49,51.453790,-2.591680,GBR,Bristol,73.0,est. september 2012 - bristol
50,7.188100,21.093750,,,100.0,africa
52,39.952220,-75.162180,USA,Philadelphia,100.0,"philadelphia, pa"
53,51.506420,-0.127210,GBR,London,100.0,"london, uk"
...,...,...,...,...,...,...
8685,42.910975,-78.865828,USA,Buffalo,100.0,"ìït: 42.910975,-78.865828"
9081,27.913602,-81.607853,USA,Lake Wales,100.0,"ìït: 27.9136024,-81.6078532"
9161,19.600858,-99.047821,MEX,Ecatepec de Morelos,100.0,"19.600858, -99.047821"
10192,1.502250,103.742992,MYS,Johor Bahru,100.0,"ìït: 1.50225,103.742992"
