In [1]:
import re
import numpy as np
import pandas as pd
import math
import cPickle as pickle

In [2]:
# Load the per-user home coordinates from the TSV file (no header row is
# expected: the column names are supplied here) and drop incomplete rows.
home_columns = ['uid', 'longitude', 'latitude']
homes = pd.read_csv('../data/homeInferences_user.tsv', sep='\t', names=home_columns)
homes = homes.dropna()
# Only longitude is explicitly converted to float; latitude is left as parsed.
homes['longitude'] = homes['longitude'].map(float)
homes.head()

Unnamed: 0,uid,longitude,latitude
0,15,-122.385897,37.616909
1,18,-116.349939,33.754546
2,76,-121.775986,36.231914
3,87,-119.359636,36.352733
4,259,-122.399341,37.803324


In [3]:
# Check the column types after loading (longitude was explicitly cast to float).
homes.dtypes

uid            int64
longitude    float64
latitude     float64
dtype: object

### Reverse Geolocation
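The country code for each (latitude, longitude) pair was obtained by reverse geocoding with the GeoNames `countryCode` service: http://api.geonames.org/countryCode?lat={}&lng={}&username=vrmpx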

In [4]:
# Country code per row as returned by the reverse-geocoding lookups
# (presumably the gold standard, "GS"); the first CSV column becomes the index.
gs_path = '../data/countryData/GS.csv'
geo = pd.read_csv(gs_path, index_col=0)
geo.head()

Unnamed: 0,Country
0,US
1,US
2,US
3,US
4,US


In [5]:
# Fraction of rows in geo labelled "US". The mean of the boolean mask is the
# same ratio as count / len, but it is computed vectorized instead of looping
# over the Series in Python, and it yields NaN (not ZeroDivisionError) when
# geo is empty.
(geo['Country'] == 'US').mean()

0.9337561663143058

In [6]:
# Discard rows whose Country value contains "ERR" (failed lookups).
lookup_failed = geo['Country'].str.contains("ERR")
geo = geo[np.logical_not(lookup_failed)]

In [7]:
# List the distinct country codes currently present in geo.
geo['Country'].unique()

array(['US', 'MX', 'BR', 'AR', 'PH'], dtype=object)

In [8]:
# Number of rows currently in geo.
len(geo)

2838

## Learn
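We split the coordinates into two groups (the code below does this with a supervised classifier rather than a clustering algorithm); to see whether that works, we compare the result against the gold-standard (G.S.) labels in `geo`.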

In [9]:
def score(df, gs, colname, colnamegs, positive="US", negative="MX"):
    """Print and return the accuracy of predicted labels against reference labels.

    ``df`` and ``gs`` are inner-joined on their indexes, so only rows present
    in both frames are evaluated. A row is counted only when both its
    predicted and its reference label are one of ``positive`` / ``negative``;
    rows carrying any other label are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predicted labels in column ``colname``.
    gs : pandas.DataFrame
        Frame holding the reference labels in column ``colnamegs``.
    colname : str
        Name of the prediction column in ``df``.
    colnamegs : str
        Name of the reference column in ``gs``.
    positive, negative : optional
        The two labels being compared. Defaults are "US" and "MX".

    Returns
    -------
    float
        ``(tp + tn) / (tp + tn + fp + fn)``, or NaN when no row qualifies
        (instead of raising ZeroDivisionError).
    """
    merged = pd.merge(df, gs, left_index=True, right_index=True)
    predicted = merged[colname]
    actual = merged[colnamegs]

    pred_pos = predicted == positive
    pred_neg = predicted == negative
    true_pos = actual == positive
    true_neg = actual == negative

    # Vectorized counts; int() keeps the arithmetic below in plain Python numbers.
    tp = int((pred_pos & true_pos).sum())
    tn = int((pred_neg & true_neg).sum())
    fp = int((pred_pos & true_neg).sum())
    fn = int((pred_neg & true_pos).sum())

    total = tp + tn + fp + fn
    accuracy = (tp + tn) / float(total) if total else float("nan")

    # Single-argument print works as a statement (Python 2) and as a function
    # (Python 3); the two spaces reproduce the original "Acc: ", value output.
    print("Acc:  %s" % accuracy)
    return accuracy

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Training set: home coordinates joined by row index with the country labels
# in geo; rows with any missing value are dropped.
tmp = pd.merge(homes, geo, left_index=True, right_index=True).dropna()
data = tmp[['longitude', 'latitude']]
Y = tmp['Country']

# sklearn.tree.DecisionTreeClassifier is the alternative previously noted in
# this cell. random_state is fixed so that re-running the notebook fits the
# same model.
clf = GradientBoostingClassifier(random_state=0)
clf = clf.fit(data, Y)

# Predict a country for every home, labelled or not.
homes['tree'] = clf.predict(homes[['longitude', 'latitude']])
# NOTE: score() only evaluates rows that also appear in geo, i.e. rows the
# model was fitted on, so this figure is a training accuracy.
score(homes, geo, 'tree', 'Country')

Acc:  0.967648127953


In [11]:
# Preview homes, which now carries the predicted country in the 'tree' column.
homes.head()

Unnamed: 0,uid,longitude,latitude,tree
0,15,-122.385897,37.616909,US
1,18,-116.349939,33.754546,US
2,76,-121.775986,36.231914,US
3,87,-119.359636,36.352733,US
4,259,-122.399341,37.803324,US


In [12]:
# Fraction of homes predicted as "US". The mean of the boolean mask is the
# same ratio as count / len, but it is computed vectorized instead of looping
# over every row in Python, and it yields NaN (not ZeroDivisionError) when
# homes is empty.
(homes['tree'] == 'US').mean()

0.9677582543590435

In [None]:
# Write the homes table, including the predicted 'tree' column, to a
# tab-separated file without the row index.
country_inference_path = '../data/countryInference_user.tsv'
homes.to_csv(country_inference_path, sep='\t', index=False)

### Test set 
We obtain roughly another 500 (exact count unconfirmed) country lookups to see how far off we are.

In [13]:
# Row labels of the homes sampled for the test set.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
# Binary mode plus a context manager: the handle is closed deterministically
# and the pickle is read as bytes regardless of platform.
with open('../data/countryData/sample.idx.pickle', 'rb') as idx_file:
    sampled_df_idx = pickle.load(idx_file)
# .loc replaces the deprecated .ix; homes has an integer index, so the lookup
# was already label-based.
sampled_df = homes.loc[sampled_df_idx]
sampled_df.head()

Unnamed: 0,uid,longitude,latitude,tree
99487,316931323,-121.798275,36.631606,US
79617,212447118,-117.1587,32.746894,US
114946,395151697,-120.87475,37.383071,US
134539,533816927,-117.716254,34.047615,US
139332,569677531,-121.665114,36.73379,US


In [14]:
# Country codes looked up for the sampled homes, one per line, in the same
# order as sampled_df (the index is copied over so the two frames can be joined).
sampled_results = pd.read_csv('../data/countryData/testResults.txt', names=['Country'])
sampled_results.index = sampled_df.index
# Drop every failed lookup, using the same rule applied to the training
# labels (any value containing "ERR"), not just the single
# 'ERR:15:no country code found' message. Missing values are kept as before.
is_error = sampled_results['Country'].str.contains("ERR", na=False)
sampled_results = sampled_results[~is_error]
sampled_results.head()

Unnamed: 0,Country
99487,US
79617,US
114946,US
134539,US
139332,US


In [15]:
res = pd.merge(sampled_df, sampled_results, left_index=True, right_index=True)
print "Acc: ", sum(res['tree'] == res['Country']) / float(len(res))

Acc:  0.991886409736


In [16]:
# Test rows predicted as US whose looked-up country is MX.
predicted_us = res['tree'] == "US"
actually_mx = res['Country'] == 'MX'
res[predicted_us & actually_mx]

Unnamed: 0,uid,longitude,latitude,tree,Country
39676,50267031,-117.025943,32.52234,US,MX
103908,338849597,-116.938693,32.523069,US,MX
204751,2269344084,-117.020522,32.471028,US,MX


In [None]:
# sklearn.externals.joblib is deprecated and absent from newer scikit-learn
# releases; prefer the standalone joblib package and fall back to the vendored
# copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted classifier to disk.
joblib.dump(clf, '../models/GBM_country.pickle')