# A preliminary pipeline to collect features

In [1]:
import jusipy
import pandas as pd

In [2]:
# Load the LandMatrix deals and both feature collections.
LM = jusipy.deals.LandMatrix()
A_country = jusipy.country_features.All()
A_latlong = jusipy.latlong_features.All()

# Reuse the dataset A_latlong already holds at position 2 of its dataset list
# instead of loading it a second time.
# NOTE(review): the positional index 2 is assumed to be the GLCF 8km dataset;
# confirm it stays stable if the dataset list in jusipy changes.
GLCF_8km = A_latlong.datasets[2]

Loading GLCF(8km)                                                                                                     reas)                                                                                                      

In [3]:
# GIS helpers used by the cells below:
#   GC - point lookup via GC.latlong(lat, long), backed by a FireDict cache
#   CC - country lookup whose entries expose an `.iso3` attribute
GC = jusipy.GIS.GoogleCode(
    cache=jusipy.utils.FireDict(sub='googlecode'),
)
CC = jusipy.GIS.CountryCode()

## Select the points, and get the relevant data

### Positive points from LandMatrix

In [None]:
# Positive examples: the LandMatrix deal locations, with the target country
# column renamed to `country` and the two label flags attached.
positive_points = (
    LM.M[['lat', 'long', 'target_country_iso3', 'year']]
    .rename(columns={'target_country_iso3': 'country'})
    .assign(positive=1, random=0)
)

### Random points selected from around the earth

In [None]:
# The random sample was generated once with the (disabled) call below and is
# reloaded from disk on every run.
#random_points = pd.DataFrame(jusipy.GIS.sample.random_latlong(land=True, glcf=GLCF_8km, size=10000),
#                             columns=['lat', 'long'])

# NOTE: unpickling executes arbitrary code; only load a file you created yourself.
random_points = pd.read_pickle('data/10000_random_points.pkl')


def _iso3_for_point(row):
    """Return the iso3 code for one point.

    Looks the point up with GC.latlong, reads its 'country_political' entry
    (falling back to the placeholder 'XXXXXXX' when that key is absent) and
    maps the result through CC.
    NOTE(review): what CC yields for the placeholder is not visible here;
    presumably a missing/NaN iso3 - confirm.
    """
    located = GC.latlong(row.lat, row.long)
    country_name = located.get('country_political', 'XXXXXXX')
    return CC[country_name].iso3


# For each random point, get the iso3 country ID
rp_iso3 = random_points.apply(_iso3_for_point, axis=1)

random_points['country'] = rp_iso3
random_points['year'] = ['newest'] * len(random_points)
random_points['positive'] = 0
random_points['random'] = 1

In [22]:
# NOTE(review): disabled scratch cell. It is the only place in the visible
# notebook where `random_points_5000` is defined, so any cell that still uses
# that name fails on a fresh kernel. Either re-enable this cell or remove its
# dependants.
#random_points_5000 = random_points.iloc[:5000]
#iso3 = random_points_5000.apply(lambda r: CC[GC.latlong(r.lat, r.long).get('country_political', 'XXXXXXX')].iso3,
#                                axis=1)
#random_points_5000['country'] = iso3
#random_points_5000 = random_points_5000[~pd.isna(random_points_5000.country)]
#random_points_5000.to_pickle('data/random_points_with_country.pkl')

In [21]:
# Shape of the subset of points whose country lookup produced a value.
# NOTE(review): `random_points_5000` is only defined in a commented-out cell,
# so this cell depends on leftover kernel state.
random_points_5000[pd.notna(random_points_5000.country)].shape

(4860, 6)

### Combine the two stacks of data

In [None]:
# Stack the LandMatrix positives on top of the random background points.
# pandas has no `zcat`; `pd.concat` is the row-wise stacking function.
# NOTE(review): each frame keeps its own index, so index labels may repeat in
# the result. Pass ignore_index=True if downstream steps need a unique row
# index - confirm against how the feature lookups use it.
all_points = pd.concat([positive_points, random_points])
all_points

## Visualize the Spatial data

In [None]:
# Scatter every point by longitude/latitude, coloured by the `positive` label.
# NOTE(review): `plt` is not imported in any visible cell; the imports cell
# needs `import matplotlib.pyplot as plt` for this to run on a fresh kernel.
plt.figure(figsize=(10, 5))
plt.scatter(
    x=all_points['long'],
    y=all_points['lat'],
    c=all_points['positive'],
    s=2,
    zorder=1,
)

## Collect features for each point

In [None]:
# Look up both feature families for every point in `all_points`.
country_features = jusipy.country_features.get(
    all_points,
    A_country,
)
latlong_features = jusipy.latlong_features.get(
    all_points,
    A_latlong,
)

# Combine them side by side; `join` aligns the two results on their index.
# NOTE(review): assumes both results are indexed like `all_points` - confirm.
all_features = country_features.join(latlong_features)

## Store the output

In [None]:
# Persist the labelled points and their features as two separate pickles.
all_points.to_pickle(path='data/all_labels.pkl')
all_features.to_pickle(path='data/all_features.pkl')

In [9]:
# Debug probe: number of keys currently held in the GoogleCode lookup cache.
# NOTE(review): reaches into private attributes (`_cache._cache`), so it may
# break if the jusipy cache implementation changes.
len(GC._cache._cache.keys())

5010

In [12]:
# Display the random-point table for a visual sanity check.
random_points

Unnamed: 0,lat,long,year,positive,random
0,21.097739,-98.915843,newest,0,1
1,46.687457,64.653645,newest,0,1
2,59.078491,12.060274,newest,0,1
3,-30.945640,-69.034903,newest,0,1
4,14.954319,38.412615,newest,0,1
5,-44.450818,-65.659892,newest,0,1
6,-25.466296,126.171925,newest,0,1
7,-79.398492,139.786460,newest,0,1
8,21.366154,75.127272,newest,0,1
9,20.243429,-100.795276,newest,0,1


In [13]:
# Write the random sample to the same file that the loading cell reads back.
# NOTE(review): this cell sits after its consumer; on a fresh checkout the
# pickle must already exist before the notebook can run top to bottom.
random_points.to_pickle(path='data/10000_random_points.pkl')