In [None]:
# Download the training archive (train.csv.zip) from Google Drive by file id.
!gdown --id 1iwTU12iQhT8kPPMcG9z06OvL1IJdINFv

Downloading...
From: https://drive.google.com/uc?id=1iwTU12iQhT8kPPMcG9z06OvL1IJdINFv
To: /content/train.csv.zip
1.67GB [00:16, 98.8MB/s]


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import train_test_split

from toolz import curry

In [None]:
# Load only the first 1,000,000 rows of the zipped CSV.
# `on_bad_lines='warn'` drops malformed lines with a warning, which is what the
# removed `error_bad_lines=False` flag (with its default warning) used to do.
df = pd.read_csv('train.csv.zip', compression='zip', nrows=1000000,
                 on_bad_lines='warn')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

Fazendo o pré-processamento mínimo pra ter a base igual.

In [None]:
@curry
def swap_long_lat(df, coluna, coluna_swap, lim_inf, lim_sup):
  """Swap ``coluna`` and ``coluna_swap`` on rows where doing so repairs ``coluna``.

  Rows are split into three groups and concatenated in this order:

  1. rows whose ``coluna`` is strictly inside ``(lim_inf, lim_sup)`` or equal
     to 0 -- returned untouched;
  2. rows whose ``coluna`` is out of range but whose ``coluna_swap`` value is
     in range -- returned with the two columns' values exchanged;
  3. rows where neither value fits the range -- returned untouched.

  Row order in the result follows that grouping, not the input order; the
  original index labels are kept.
  """
  dentro = f"{lim_inf}<{coluna}<{lim_sup}"
  # Exchanging the two labels is equivalent to exchanging the two columns' values.
  troca = {coluna: coluna_swap, coluna_swap: coluna}

  mantidas = df.query(f"{dentro} or {coluna}==0")

  # Candidate rows, viewed with the two column labels exchanged.
  trocadas = df.query(f"(not ({dentro})) and {coluna}!=0").rename(columns=troca)

  # The exchange fixed these rows: keep them exchanged.
  corrigidas = trocadas.query(dentro)

  # The exchange did not help: undo it so the rows come back as they were.
  sem_conserto = trocadas.query(f"not({dentro})").rename(columns=troca)

  return pd.concat([mantidas, corrigidas, sem_conserto])

@curry
def flag_erro(df, lim_inf_lat, lim_sup_lat, lim_inf_long, lim_sup_long):
  """Return a copy of ``df`` with a 0/1 ``flag_error`` column.

  A row is flagged with 1 when at least one of its pickup/dropoff longitudes
  is not strictly inside ``(lim_inf_long, lim_sup_long)`` or one of its
  pickup/dropoff latitudes is not strictly inside ``(lim_inf_lat, lim_sup_lat)``.
  """
  fora_long = [f"(not({lim_inf_long}<{col}<{lim_sup_long}))"
               for col in ['pickup_longitude', 'dropoff_longitude']]
  fora_lat = [f"(not({lim_inf_lat}<{col}<{lim_sup_lat}))"
              for col in ['pickup_latitude', 'dropoff_latitude']]
  query = " or ".join(fora_long + fora_lat)

  df_aux = df.copy()
  df_aux['flag_error'] = 0
  # Evaluate the condition on the frame being processed. The previous version
  # queried the global `df_train`, which flagged the wrong rows and failed for
  # any frame with a different index (e.g. the test split).
  df_aux.loc[df.query(query).index, 'flag_error'] = 1
  return df_aux

@curry
def fill_error(df, coluna, lim_inf, lim_sup):
  """Replace out-of-range values of ``coluna`` with a median.

  Values of ``coluna`` that are not strictly inside ``(lim_inf, lim_sup)`` are
  set to NaN in a copy of ``df``; NaNs in ``coluna`` are then filled with the
  median of that column in the module-level ``df_train`` (not in ``df``).
  """
  fora_do_intervalo = df.query(f"(not({lim_inf}<{coluna}<{lim_sup}))").index

  resultado = df.copy()
  resultado.loc[fora_do_intervalo, coluna] = np.nan

  # NOTE(review): the median is read from the global training frame --
  # presumably so other splits are filled with a train statistic; confirm.
  mediana_treino = df_train[[coluna]].median()
  return resultado.fillna(dict(mediana_treino))

In [None]:
def preprocess_latlong(df, lim_inf_lat, lim_sup_lat, lim_inf_long, lim_sup_long):
  """Run the coordinate clean-up pipeline and return the resulting frame.

  Steps, in order: fill NaNs with 0; try to repair each coordinate column by
  swapping it with its latitude/longitude partner; add the ``flag_error``
  column; replace values that are still out of range via ``fill_error``.
  """
  # (column, partner column, lower bound, upper bound), in processing order.
  etapas = [
      ('pickup_longitude', 'pickup_latitude', lim_inf_long, lim_sup_long),
      ('pickup_latitude', 'pickup_longitude', lim_inf_lat, lim_sup_lat),
      ('dropoff_longitude', 'dropoff_latitude', lim_inf_long, lim_sup_long),
      ('dropoff_latitude', 'dropoff_longitude', lim_inf_lat, lim_sup_lat),
  ]

  resultado = df.fillna(0)

  for coluna, parceira, inferior, superior in etapas:
    resultado = swap_long_lat(resultado, coluna=coluna, coluna_swap=parceira,
                              lim_inf=inferior, lim_sup=superior)

  resultado = flag_erro(resultado, lim_inf_lat=lim_inf_lat, lim_sup_lat=lim_sup_lat,
                        lim_inf_long=lim_inf_long, lim_sup_long=lim_sup_long)

  for coluna, _, inferior, superior in etapas:
    resultado = fill_error(resultado, coluna=coluna, lim_inf=inferior, lim_sup=superior)

  return resultado

In [None]:
# Latitude/longitude bounds passed to the clean-up pipeline; the values in the
# trailing comments are presumably a tighter alternative box.
lim_inf_lat, lim_sup_lat =     39, 42  #  40.3,  41.2
lim_inf_long, lim_sup_long = -76, -72  # -74.3, -73.5

# NOTE: this rebinds df_train, so re-running the cell preprocesses the already
# processed frame. fill_error reads the module-level df_train, which during
# this call is still the frame as it was before the cell ran.
df_train = preprocess_latlong(df_train, lim_inf_lat, lim_sup_lat, lim_inf_long, lim_sup_long)

## Construindo variável de CEP a partir de uma consulta.

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Reverse geocoder backed by the public Nominatim service.
geolocator = Nominatim(user_agent='datalab')
# The public Nominatim usage policy allows at most one request per second, so
# wait at least 1 s between calls (0.1 s would allow up to 10 requests/s).
geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1)

In [None]:
def fun(row):
  """Build the ``"<latitude> , <longitude>"`` query string for a row's pickup point.

  Both coordinates are rounded to 6 decimal places before being joined.
  """
  coordenadas = row[['pickup_latitude','pickup_longitude']]
  arredondadas = coordenadas.apply(lambda valor: np.round(valor, decimals=6))
  textos = arredondadas.astype(str).to_list()
  return ' , '.join(textos)

In [None]:
# Draw 1000 training rows to send to the geocoding API. The fixed seed makes
# the sample (and everything derived from it) reproducible across runs.
df_aux = df_train.sample(1000, random_state=42).copy()
# Query string ("lat , lon") for each sampled pickup point.
df_aux['api'] = df_aux.apply(fun, axis=1)

In [None]:
# Preview the first three "latitude , longitude" query strings.
df_aux['api'].head(3)

379876    40.717063 , -74.012197
619528      40.7677 , -73.966187
189652     40.74982 , -73.983218
Name: api, dtype: object

In [None]:
def try_except(loc, info):
  """Return ``loc['address'][info]``, or the string ``"NaN"`` when it is unavailable.

  Only lookup failures are treated as "missing": a missing key/index or a
  ``loc`` that cannot be subscripted (e.g. ``None``). Unlike a bare ``except``,
  this lets ``KeyboardInterrupt`` and other unrelated errors propagate.
  """
  try:
    return loc['address'][info]
  except (LookupError, TypeError):
    return "NaN"

def geolocar(x):
  """Reverse-geocode a row's pickup point and return ``[suburb, postcode]``.

  ``x`` must provide ``pickup_latitude``, ``pickup_longitude`` and the query
  string ``api``. Each element of the result is the string ``"NaN"`` when the
  corresponding information is not available: either coordinate is 0, the
  geocoder returned no result, or the field is absent from the answer.
  """
  infos = ['suburb', 'postcode']
  sem_resposta = ['NaN' for _ in infos]

  # Coordinates equal to 0 are placeholders, not real locations: skip the API call.
  if x['pickup_latitude'] == 0 or x['pickup_longitude'] == 0:
    return sem_resposta

  resposta = geocode(x['api'])
  # The geocoder can return None (nothing found, or a swallowed request error);
  # accessing `.raw` on it would abort the whole apply.
  if resposta is None:
    return sem_resposta

  loc = resposta.raw
  return [try_except(loc, info) for info in infos]

Aplicando a consulta de fato.

In [None]:
# Reverse-geocode every sampled row (one rate-limited API call per row, with a
# progress bar); each cell of 'response_api' holds the [suburb, postcode] list.
# %%time
df_aux['response_api'] = df_aux.progress_apply(geolocar, axis=1)

100%|██████████| 1000/1000 [16:07<00:00,  1.03it/s]


In [None]:
# Unpack the [suburb, postcode] answers; only the first 5 characters of the
# postcode are kept.
df_aux['suburb'] = df_aux['response_api'].map(lambda resposta: resposta[0])
df_aux['postcode'] = df_aux['response_api'].map(lambda resposta: resposta[1][:5])

Mantendo apenas os CEPs que aparecem em pelo menos 2% das consultas (20 de 1000); os demais são agrupados no código genérico '10000'.

In [None]:
# Collapse every postcode seen fewer than 20 times into the catch-all label '10000'.
counts = df_aux['postcode'].value_counts()
repl = counts.index[counts < 20]
df_aux['postcode'] = df_aux['postcode'].replace(repl, '10000')

## Criando modelo de vizinhos para preencher os missings das consultas

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

In [None]:
# Features: pickup coordinates. Target: the postcode label of each sampled row.
X = df_aux[['pickup_latitude','pickup_longitude']]
y = df_aux['postcode']

Métrica específica levando em conta a distância no globo.

In [None]:
from geopy.distance import distance

def geodesic_distance(X1,X2):
  """Distance in kilometres between two points, as computed by geopy's ``distance``.

  Only the first two components of each argument are used; they are passed
  to ``distance`` in the order given (the notebook supplies latitude, longitude).
  """
  origem = (X1[0], X1[1])
  destino = (X2[0], X2[1])
  return distance(origem, destino).km

In [None]:
# 1-nearest-neighbour classifier using the geodesic metric, fitted on the first
# 900 sampled rows; the remaining 100 rows are left out of the fit.
kkn = KNeighborsClassifier(n_neighbors=1, metric=geodesic_distance)
kkn.fit(X.head(900), y.head(900))

KNeighborsClassifier(algorithm='auto', leaf_size=30,
                     metric=<function geodesic_distance at 0x7f16d5382170>,
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [None]:
%%time
# Accuracy on the last 100 rows, which were not used to fit the classifier.
accuracy_score(y.tail(100), kkn.predict(X.tail(100)))

CPU times: user 17.7 s, sys: 564 ms, total: 18.3 s
Wall time: 17 s


0.74

Resultados OK! Acertamos o CEP exato em cerca de 75% das vezes. Quando erramos, pela própria construção do problema, provavelmente estamos escolhendo um CEP próximo, então está tudo bem.

O problema é que essa distância demora pra calcular. Por sorte, não temos muitos prejuízos em usar a distância euclidiana, como vemos a seguir:

In [None]:
# Same 1-nearest-neighbour model, but with scikit-learn's default metric,
# evaluated with 5-fold cross-validation on all sampled rows.
cv_results = cross_validate(KNeighborsClassifier(n_neighbors=1), X, y, cv=5)

# Mean test score across folds, followed by the per-fold table.
print(cv_results['test_score'].mean())
pd.DataFrame(cv_results)

0.7799999999999999


Unnamed: 0,fit_time,score_time,test_score
0,0.005822,0.008013,0.78
1,0.002793,0.007355,0.765
2,0.002634,0.007269,0.82
3,0.002756,0.007313,0.775
4,0.002627,0.007411,0.76


Fluxo para preenchimento dos missings que vamos utilizar no notebook principal.

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter

# geolocator = Nominatim(user_agent='datalab')

class filling_zipcode:
  """Assign zip codes to pickup/dropoff coordinates.

  ``consulta_fit`` reverse-geocodes the pickup point of a few rows through
  Nominatim and fits a 1-nearest-neighbour classifier on the answers;
  ``predict`` then labels any coordinate pair without further API calls.
  """

  def __init__(self, user_agent='datalab', min_delay_seconds=1):
    """Create the rate-limited reverse geocoder.

    Parameters
    ----------
    user_agent : str
        User agent sent to Nominatim.
    min_delay_seconds : float
        Minimum wait between two API calls. Defaults to 1 s because the public
        Nominatim usage policy allows at most one request per second.
    """
    self.geocode = RateLimiter(Nominatim(user_agent=user_agent).reverse,
                               min_delay_seconds=min_delay_seconds)

  def fun(self, row):
    """Build the ``"<latitude> , <longitude>"`` query string (6 decimals) for a row's pickup point."""
    coordenadas = row[['pickup_latitude','pickup_longitude']]
    arredondadas = coordenadas.apply(lambda valor: np.round(valor, decimals=6))
    return ' , '.join(arredondadas.astype(str).to_list())

  def try_except(self, loc, info):
    """Return ``loc['address'][info]``, or ``"NaN"`` when that lookup is not possible."""
    try:
      return loc['address'][info]
    except (LookupError, TypeError):
      # Missing key/index or a non-subscriptable `loc`; other errors propagate.
      return "NaN"

  def geolocar(self, x):
    """Reverse-geocode a row's pickup point and return ``[suburb, postcode]``.

    Each element is the string ``"NaN"`` when the information is unavailable:
    a coordinate equal to 0, no geocoder result, or a field absent from the answer.
    """
    infos = ['suburb', 'postcode']
    sem_resposta = ['NaN' for _ in infos]

    # Coordinates equal to 0 are placeholders: skip the API call.
    if x['pickup_latitude'] == 0 or x['pickup_longitude'] == 0:
      return sem_resposta

    resposta = self.geocode(x['api'])
    # The geocoder can return None; accessing `.raw` on it would abort the apply.
    if resposta is None:
      return sem_resposta

    loc = resposta.raw
    return [self.try_except(loc, info) for info in infos]

  def consulta_fit(self, df, n_consultas=1000, frequencia_min=20):
    """Query the API for the first ``n_consultas`` rows of ``df`` and fit the classifier.

    Postcodes (truncated to 5 characters) that occur fewer than
    ``frequencia_min`` times among the queried rows are collapsed into the
    label ``'10000'``. ``frequencia_min`` is an absolute count, so it should
    be lowered when ``n_consultas`` is small.

    Requires ``tqdm.pandas()`` to have been called (uses ``progress_apply``).
    Returns ``self`` so calls can be chained.
    """
    df_aux = df.head(n_consultas).copy()
    df_aux['api'] = df_aux.apply(self.fun, axis=1)

    # One rate-limited API call per row.
    df_aux['response_api'] = df_aux.progress_apply(self.geolocar, axis=1)

    df_aux['suburb'] = df_aux['response_api'].map(lambda resposta: resposta[0])
    df_aux['postcode'] = df_aux['response_api'].map(lambda resposta: resposta[1][:5])

    counts = df_aux['postcode'].value_counts()
    repl = counts.index[counts < frequencia_min]
    df_aux['postcode'] = df_aux['postcode'].replace(repl, '10000')

    X = df_aux[['pickup_latitude','pickup_longitude']]
    y = df_aux['postcode']

    self.knn = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    return self

  def predict(self, df):
    """Return a copy of ``df`` with ``pickup_zipcode`` and ``dropoff_zipcode`` columns.

    Both columns come from the classifier fitted in ``consulta_fit``, which
    must have been called first.
    """
    df_aux = df.copy()
    df_aux['pickup_zipcode'] = self.knn.predict(df_aux[['pickup_latitude','pickup_longitude']])
    df_aux['dropoff_zipcode'] = self.knn.predict(df_aux[['dropoff_latitude','dropoff_longitude']])
    return df_aux

In [None]:
# Instantiate the zip-code filler; this sets up its rate-limited reverse geocoder.
preenchedor = filling_zipcode()

In [None]:
%%time
# Fit on only 10 queried rows. With the default frequencia_min of 20, no
# postcode can reach the threshold, so every label collapses to '10000'.
preenchedor.consulta_fit(df_train, 10)

100%|██████████| 10/10 [00:09<00:00,  1.05it/s]

CPU times: user 143 ms, sys: 23.4 ms, total: 167 ms
Wall time: 9.52 s





In [None]:
%%time
# Add pickup_zipcode / dropoff_zipcode columns to a copy of the training frame.
preenchedor.predict(df_train)

CPU times: user 44.8 s, sys: 771 ms, total: 45.6 s
Wall time: 44.7 s


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,flag_error,pickup_zipcode,dropoff_zipcode
566853,2010-11-06 14:32:00.000000216,4.9,2010-11-06 14:32:00 UTC,-73.984177,40.733533,-73.991077,40.724042,2,0,10000,10000
382311,2009-07-17 20:54:00.000000158,11.3,2009-07-17 20:54:00 UTC,-73.996732,40.720380,-73.984070,40.760597,1,0,10000,10000
241519,2013-03-22 12:16:00.000000124,11.0,2013-03-22 12:16:00 UTC,-73.973800,40.753053,-73.986327,40.752865,2,0,10000,10000
719220,2014-04-07 08:56:49.0000004,5.0,2014-04-07 08:56:49 UTC,-73.957089,40.774476,-73.957199,40.783044,1,0,10000,10000
905718,2012-11-23 19:44:56.0000002,5.0,2012-11-23 19:44:56 UTC,-73.956669,40.784126,-73.953913,40.778918,3,0,10000,10000
...,...,...,...,...,...,...,...,...,...,...,...
70511,2009-10-20 11:51:00.0000009,10.1,2009-10-20 11:51:00 UTC,-73.981783,40.752695,-73.980137,40.753155,2,1,10000,10000
452282,2011-10-27 09:43:00.00000044,7.7,2011-10-27 09:43:00 UTC,-73.981783,40.752695,-73.980137,40.753155,1,1,10000,10000
66433,2010-07-28 09:22:00.000000254,7.3,2010-07-28 09:22:00 UTC,-73.981783,40.752695,-73.980137,40.753155,5,1,10000,10000
675302,2009-12-22 06:19:00.00000026,9.3,2009-12-22 06:19:00 UTC,-73.981783,40.752695,-73.980137,40.753155,5,1,10000,10000


Como a consulta nos dá o endereço:

In [None]:
# Example: raw reverse-geocoding response for a single "latitude, longitude" string.
location = geolocator.reverse("40.721319, -73.844311")
location.raw

{'address': {'city': 'New York',
  'city_district': 'Queens County',
  'country': 'United States',
  'country_code': 'us',
  'postcode': '11375',
  'road': 'Queens Boulevard',
  'shop': 'The Vitamin Shoppe',
  'state': 'New York',
  'suburb': 'Queens'},
 'boundingbox': ['40.7211907', '40.7212907', '-73.8443553', '-73.8442553'],
 'display_name': 'The Vitamin Shoppe, Queens Boulevard, Queens, Queens County, New York, 11375, United States',
 'lat': '40.7212407',
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'lon': '-73.8443053',
 'osm_id': 8572566493,
 'osm_type': 'node',
 'place_id': 310837130}