# Anonymize data

Anonymize the fixtures used for regression tests.

## Read Data

In [None]:
import os
import pandas as pd
from uuid import uuid4
import numpy
from pathlib import Path

In [None]:
# Locate the three fixture CSVs. Paths are resolved against the current
# working directory, so the notebook must be launched from the directory
# that contains the `test/` tree.
src = os.getcwd()
srcpath = 'test/components/fixtures/sample_workspace/sample'
file_name1, file_name2, file_name3 = 'schools.csv', 'fiber.csv', 'cellular.csv'
file1, file2, file3 = (
    os.path.join(src, srcpath, leaf)
    for leaf in (file_name1, file_name2, file_name3)
)

In [None]:
def uniform_iid_generator():
    """Return one sample drawn from uniform(-0.1, 0.1).

    Used as the per-value offset added to coordinates; every call draws a
    fresh sample from numpy's global random state.
    """
    lower, upper = -0.1, 0.1
    return numpy.random.uniform(lower, upper)

In [None]:
def random_number_generator():
    """Return a random integer in the range 0..49 (upper bound 50 excluded).

    Draws from numpy's global random state on every call.
    """
    return numpy.random.randint(low=0, high=50)

## Schools

**Schools:**
- id, school_id, giga_id_school, and name should all be replaced with uuids (e.g. uuid4 works well here)
- lon/lat should be slightly perturbed: randomly sample iid uniform(-0.1, 0.1) and add each sample to lat/lon
- education_level, country, country_id, admin_name_1, admin_name_2, admin_name_3, admin_name_4 should be renamed to "test" in all rows

In [None]:
# Load the schools fixture into a DataFrame.
schools_df = pd.read_csv(filepath_or_buffer=file1)

In [None]:
# Row count as loaded (compare with the count shown before writing back).
schools_df.shape[0]

In [None]:
# Drop the 'Unnamed: 0' column. NOTE(review): presumably this is the index
# column that an earlier to_csv() call wrote out (pandas writes the index by
# default) — confirm against the fixture.
schools_df = schools_df.drop(columns='Unnamed: 0')

In [None]:
# Preview the first five rows before anonymizing.
schools_df.head(n=5)

In [None]:
# id, school_id, giga_id_school, and name are replaced with a fresh UUID per
# row. Each value is stored as its string form (the same way the cellular
# 'Site ID' column is anonymized) so the columns hold plain strings rather
# than uuid.UUID objects; the CSV written later is unaffected.
cols = ['id', 'school_id', 'giga_id_school', 'name']
for column in cols:
    # The lambda ignores the original value: only one new UUID per row matters.
    schools_df[column] = schools_df[column].apply(lambda _: str(uuid4()))

In [None]:
# country_id gets an independent random integer per row (see
# random_number_generator), not the literal "test" placeholder.
cols = ['country_id']
for id_column in cols:
    schools_df[id_column] = schools_df[id_column].apply(
        lambda _: random_number_generator()
    )

In [None]:
# Jitter the coordinates: every lon and every lat value gets its own
# independent uniform(-0.1, 0.1) offset. lon is processed before lat.
for coord in ('lon', 'lat'):
    schools_df[coord] = schools_df[coord].apply(
        lambda value: value + uniform_iid_generator()
    )

In [None]:
# country and the admin-level name columns are overwritten with the
# placeholder "test" in every row.
#
# The spec for this section lists admin levels 1-4, but this list previously
# named admin_5_name and skipped admin_4_name. That would leave admin level 4
# un-anonymized and add a stray admin_5_name column when none existed.
# admin_4_name is now included. admin_5_name is kept in the list so a fixture
# that already carries it is still scrubbed.
# NOTE(review): confirm the real column names against the fixture header.
temp = [
    'country',
    'admin_1_name',
    'admin_2_name',
    'admin_3_name',
    'admin_4_name',
    'admin_5_name',
]

for column in temp:
    # Only touch columns that exist: assigning to a missing name would add a
    # column that is not part of the fixture's schema.
    if column in schools_df.columns:
        schools_df[column] = "test"


In [None]:
# education_level is set to the constant "Other" in every row.
temp = ['education_level']
for level_column in temp:
    schools_df[level_column] = "Other"

In [None]:
# Display the column labels of the anonymized schools frame.
schools_df.columns

In [None]:
# Preview country_id for the first five rows to inspect the replaced values.
schools_df['country_id'].head()

In [None]:
# Row count after anonymization, shown just before writing back.
schools_df.shape[0]

In [None]:
# Overwrite the schools fixture in place. With to_csv defaults the DataFrame
# index is written as the first (unnamed) column.
schools_df.to_csv(path_or_buf=file1)

## Cellular
**Cellular:**
- Site ID should be replaced with a uuid in all rows
- State, Region, Ownership of site, Site Type, Site power topology should all be replaced with "test" in all rows
- Indoor /outdoor should be replaced with "Outdoor" in all rows
- Site power topology should be replaced with "Grid" in all rows
- Tower Height should be replaced with a random sample height taken from integer(uniform(5, 50)) for each row independently 
- Technology should be replaced with "4G" in all rows
- lon/lat should be slightly perturbed: randomly sample iid uniform(-0.1, 0.1) and add each sample to lat/lon

In [None]:
def tower_value_generator():
    """Return a random tower height: a uniform(5, 50) draw truncated to int.

    Draws from numpy's global random state on every call.
    """
    return int(numpy.random.uniform(5, 50))

In [None]:
# Load the cellular fixture into a DataFrame.
cellular_df = pd.read_csv(filepath_or_buffer=file3)

In [None]:
# Row count as loaded (compare with the count shown before writing back).
cellular_df.shape[0]

In [None]:
# Drop the 'Unnamed: 0' column. NOTE(review): presumably this is the index
# column that an earlier to_csv() call wrote out (pandas writes the index by
# default) — confirm against the fixture.
cellular_df = cellular_df.drop(columns='Unnamed: 0')

In [None]:
# Display the column labels of the cellular frame before anonymizing.
cellular_df.columns

In [None]:
def _fresh_site_id(_original):
    """Return a new UUID string; the original Site ID is ignored."""
    return str(uuid4())


# Every row receives its own freshly generated UUID string as Site ID.
cellular_df['Site ID'] = cellular_df['Site ID'].apply(_fresh_site_id)

In [None]:
# Descriptive site attributes are overwritten with the placeholder "test" in
# every row.
temp = ['State', 'Region', 'Ownership of site', 'Site Type', 'Site power topology']

for site_column in temp:
    cellular_df[site_column] = "test"

In [None]:
# Columns that receive one fixed value in every row, applied in this order.
for column, constant in (
    ('Indoor /outdoor', "Outdoor"),
    ('Site power topology', "Grid"),
    ('Technology', "4G"),
):
    cellular_df[column] = constant

In [None]:
def _random_tower_height(_original):
    """Return a freshly sampled tower height; the original value is ignored."""
    return tower_value_generator()


# Each row gets an independently sampled integer tower height.
cellular_df['Tower Height'] = cellular_df['Tower Height'].apply(_random_tower_height)

In [None]:
# Jitter the coordinates: every Longitude and every Latitude value gets its
# own independent uniform(-0.1, 0.1) offset. Longitude is processed first.
for coord in ('Longitude', 'Latitude'):
    cellular_df[coord] = cellular_df[coord].apply(
        lambda value: value + uniform_iid_generator()
    )

In [None]:
# Row count after anonymization, shown just before writing back.
cellular_df.shape[0]

In [None]:
# Overwrite the cellular fixture in place. With to_csv defaults the DataFrame
# index is written as the first (unnamed) column.
cellular_df.to_csv(path_or_buf=file3)

## Fiber

**Fiber:**
- lon/lat should be slightly perturbed: randomly sample iid uniform(-0.1, 0.1) and add each sample to lat/lon

In [None]:
# Load the fiber fixture into a DataFrame.
fiber_df = pd.read_csv(filepath_or_buffer=file2)

In [None]:
# Row count as loaded (compare with the count shown before writing back).
fiber_df.shape[0]

In [None]:
# Preview the first five rows before anonymizing.
fiber_df.head(n=5)

In [None]:
# Drop the 'Unnamed: 0' column. NOTE(review): presumably this is the index
# column that an earlier to_csv() call wrote out (pandas writes the index by
# default) — confirm against the fixture.
fiber_df = fiber_df.drop(columns='Unnamed: 0')

In [None]:
# Jitter the coordinates: every lon and every lat value gets its own
# independent uniform(-0.1, 0.1) offset. lon is processed before lat.
for coord in ('lon', 'lat'):
    fiber_df[coord] = fiber_df[coord].apply(
        lambda value: value + uniform_iid_generator()
    )

In [None]:
# Preview the first five rows after the coordinates have been perturbed.
fiber_df.head(n=5)

In [None]:
# Row count after anonymization, shown just before writing back.
fiber_df.shape[0]

In [None]:
# Overwrite the fiber fixture in place. With to_csv defaults the DataFrame
# index is written as the first (unnamed) column.
fiber_df.to_csv(path_or_buf=file2)