In [1]:
import numpy as np
import pandas as pd


def check_vals_in_arr(data, ref):
    """Flag which items of ``data`` are contained in ``ref``.

    Parameters
    ----------
    data : iterable
        Values to test, visited in iteration order.
    ref : container
        Collection probed with the ``in`` operator.

    Returns
    -------
    list of bool
        One flag per item of ``data``; ``True`` when that item is in ``ref``.
    """
    membership_flags = []
    for item in data:
        membership_flags.append(item in ref)
    return membership_flags

def approx_cords(df, key, approx):
    """Round the values stored under ``df[key]`` and return them as strings.

    Parameters
    ----------
    df : mapping-like (e.g. a DataFrame)
        Object indexed with ``key`` to obtain the iterable of numbers.
    key : hashable
        Column / key whose values are rounded.
    approx : int
        Number of decimals passed to ``round``.

    Returns
    -------
    list of str
        ``str(round(value, approx))`` for every value, in order.
    """
    return list(map(lambda coordinate: str(round(coordinate, approx)), df[key]))

def notArray(arr):
    """Return the element-wise ``~`` of ``arr`` after converting it to a NumPy array.

    For boolean input this is a logical NOT; for integer input it is the
    bitwise complement, exactly as the ``~`` operator behaves on arrays.
    """
    as_array = np.array(arr)
    return np.invert(as_array)

def removeSampleValues(df, sample):
    """Drop from ``df`` the rows that match values already present in ``sample``.

    A row is removed when its ``'Coordenada X'`` value occurs among the
    sample's ``'Coordenada X'`` values AND its ``'Coordenada Y'`` value occurs
    among the sample's ``'Coordenada Y'`` values AND its ``'Unnamed: 11'``
    value occurs among the sample's ``'Unnamed: 11'`` values. Membership is
    evaluated per column, not per (X, Y, Unnamed: 11) tuple.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; must contain the three columns named above.
    sample : pandas.DataFrame
        Previously drawn sample with the same three columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not matched, with their original index.
    """
    # The three conditions must be combined element-wise. Chaining Python
    # lists with `and` simply yields the last non-empty list, which silently
    # ignores the coordinate conditions.
    in_sample = (
        df['Coordenada X'].isin(sample['Coordenada X'].values)
        & df['Coordenada Y'].isin(sample['Coordenada Y'].values)
        & df['Unnamed: 11'].isin(sample['Unnamed: 11'].values)
    )
    return df[~in_sample]
    

## __Estimation of the population__

In [None]:
# Inputs for the population estimate.
# NOTE(review): the meaning of these figures is not stated in the notebook;
# presumably p1 and p2 are population counts observed dt time units apart
# and totalN is the total population — confirm.
totalN = 22324
p2 = 10011
p1 = 4527
dt = 4
# `np.log()` without an argument raises TypeError, which stops Run All.
# NOTE(review): assumed to be the exponential growth rate ln(p2 / p1) / dt —
# confirm this is the intended model.
k = np.log(p2 / p1) / dt

## __Calculation of the sample size__

### Urban Residential Sample sizes


In [34]:
# Share of the sample assigned to each stratum.
weights = [0.02,0.37,0.59,0.02]
sdv = 0.2 # kg/day
mean = 0.53 # kg/day
N = 15900
Z = 1.96
# Tolerated error: 8 % of the mean.
E = 0.08 * mean
# Finite-population sample size, inflated by 10 % and rounded with int(x + 1).
numerator = Z**2 * N * sdv**2
denominator = (N-1) * E**2 + (Z**2 * sdv**2)
sample_size = int(1.1*numerator/denominator + 1)
# Per-stratum allocation, each rounded with the same int(x + 1) rule.
sample_sizes = [int(weight*sample_size+1) for weight in weights]
print(sample_size)
sample_sizes

94


[2, 35, 56, 2]

### Rural Residential Sample sizes


In [35]:
# Share of the sample assigned to each stratum.
weights = [0.08,0.62,0.22,0.05,0.02,0.01]
sdv = 0.2 # kg/day
mean = 0.4737 # kg/day
N = 5391
Z = 1.96
# Tolerated error: 8 % of the mean.
E = 0.08 * mean
# Finite-population sample size, inflated by 10 % and rounded with int(x + 1).
numerator = Z**2 * N * sdv**2
denominator = (N-1) * E**2 + (Z**2 * sdv**2)
sample_size = int(1.1*numerator/denominator + 1)
# Per-stratum allocation, each rounded with the same int(x + 1) rule.
sample_sizes = [int(weight*sample_size+1) for weight in weights]
print(sample_size)
sample_sizes

116


[10, 72, 26, 6, 3, 2]

### Non Residential Sample sizes


In [36]:
sdv = 0.2 # kg/day
mean = 0.4737 # kg/day
N = 1886
Z = 1.96
# Tolerated error: 8 % of the mean.
E = 0.08 * mean
# Finite-population sample size, inflated by 10 % and rounded with int(x + 1).
numerator = Z**2 * N * sdv**2
denominator = (N-1) * E**2 + (Z**2 * sdv**2)
sample_size = int(1.1*numerator/denominator + 1)
print(sample_size)
# No per-stratum allocation is computed for the non-residential group.

112


# __Urban Sample__

In [None]:
# NOTE(review): file_path is assigned here but never used in this cell.
file_path = r'db_guarne.xlsx'
# NOTE(review): this is the urban section, yet the workbook read is
# 'rural_raw.xlsx' — the same file the rural section loads. Confirm which
# file holds the urban records.
urban_raw = pd.read_excel(r'rural_raw.xlsx')
# Keep only residential records, drop the FID column and give the two
# coordinate columns explicit X / Y names.
is_residential = urban_raw['Uso'] == 'Residencial'
urban_df = (
    urban_raw[is_residential]
    .drop(columns=['FID'])
    .rename(columns={'Coordenada':'Coordenada X','Coordena_1':'Coordenada Y'})
)

### Filtering the data to ensure that no spatially redundant data is present in the dataset

In [None]:
# Number of decimals kept when rounding the coordinates.
approx = 8
# Replace each coordinate column with its rounded, string-formatted values so
# that coordinates equal up to `approx` decimals compare as identical.
for coord_col in ('Coordenada X', 'Coordenada Y'):
    urban_df[coord_col] = approx_cords(urban_df, coord_col, approx)
# Keep one row per (X, Y) pair. The result must be stored back in urban_df:
# assigning it to rural_df left urban_df with its duplicates and overwrote
# the rural frame instead.
urban_df = urban_df.drop_duplicates(subset=['Coordenada X', 'Coordenada Y'])

### Choosing the sample using the filtered data

In [None]:
# Column order of the final sample: 'codeid' first, then the original columns.
keys = ['codeid'] + [i for i in urban_df.keys()]
# NOTE(review): six strata, these sizes and the 'RR' code prefix are identical
# to the rural sampling cell, while the urban sample-size cell computes four
# strata ([2, 35, 56, 2]) — confirm the values intended for the urban sample.
sizes = [10,73,26,6,2,2]
# Fixed seed so that Restart & Run All draws the same sample every time.
RANDOM_SEED = 42

# Collect one sample per stratum and concatenate once at the end, instead of
# growing a frame with pd.concat inside the loop starting from an empty frame.
stratum_samples = []
for stratum, sample_size in enumerate(sizes, start=1):
    stratified_df = urban_df[urban_df['Estrato'] == stratum]
    # Raises ValueError if the stratum holds fewer than sample_size rows.
    sample = stratified_df.sample(n=sample_size, random_state=RANDOM_SEED + stratum)
    # Codes look like 'RR1001', 'RR1002', ...: stratum * 1000 + running number.
    codeid = ['RR'+str(stratum*1000 + cod + 1) for cod in range(sample_size)]
    sample.insert(1,'codeid',codeid,False)
    stratum_samples.append(sample)

urban_sample = pd.concat(stratum_samples).reindex(columns=keys)
# Coordinates were stored as rounded strings for de-duplication; restore floats.
urban_sample = urban_sample.astype({'Coordenada X': float, 'Coordenada Y': float})

### Writing the sample to an Excel file

In [None]:
# urban_sample.to_excel(r'urban_sample.xlsx',index=False)

# __Rural Sample__

In [None]:
# NOTE(review): file_path is assigned here but never used in this cell.
file_path = r'db_guarne.xlsx'
rural_raw = pd.read_excel(r'rural_raw.xlsx')
# Keep only residential records, drop the FID column and give the two
# coordinate columns explicit X / Y names.
is_residential = rural_raw['Uso'] == 'Residencial'
rural_df = (
    rural_raw[is_residential]
    .drop(columns=['FID'])
    .rename(columns={'Coordenada':'Coordenada X','Coordena_1':'Coordenada Y'})
)

### Filtering the data to ensure that no spatially redundant data is present in the dataset

In [None]:
# Number of decimals kept when rounding the coordinates.
approx = 8
# Replace each coordinate column with its rounded, string-formatted values so
# that coordinates equal up to `approx` decimals compare as identical.
for coord_col in ('Coordenada X', 'Coordenada Y'):
    rural_df[coord_col] = approx_cords(rural_df, coord_col, approx)
# Keep one row per (X, Y) pair.
rural_df = rural_df.drop_duplicates(subset=['Coordenada X', 'Coordenada Y'])

### Choosing the sample using the filtered data

In [None]:
# Column order of the final sample: 'codeid' first, then the original columns.
keys = ['codeid'] + [i for i in rural_df.keys()]
# Rows to draw from strata 1..6, in that order.
# NOTE(review): the rural sample-size cell prints [10, 72, 26, 6, 3, 2];
# these hard-coded sizes differ in strata 2 and 5 — confirm that is intended.
sizes = [10,73,26,6,2,2]
# Fixed seed so that Restart & Run All draws the same sample every time.
RANDOM_SEED = 42

# Collect one sample per stratum and concatenate once at the end, instead of
# growing a frame with pd.concat inside the loop starting from an empty frame.
stratum_samples = []
for stratum, sample_size in enumerate(sizes, start=1):
    stratified_df = rural_df[rural_df['Estrato'] == stratum]
    # Raises ValueError if the stratum holds fewer than sample_size rows.
    sample = stratified_df.sample(n=sample_size, random_state=RANDOM_SEED + stratum)
    # Codes look like 'RR1001', 'RR1002', ...: stratum * 1000 + running number.
    codeid = ['RR'+str(stratum*1000 + cod + 1) for cod in range(sample_size)]
    sample.insert(1,'codeid',codeid,False)
    stratum_samples.append(sample)

rural_sample = pd.concat(stratum_samples).reindex(columns=keys)
# Coordinates were stored as rounded strings for de-duplication; restore floats.
rural_sample = rural_sample.astype({'Coordenada X': float, 'Coordenada Y': float})

### Writing the sample to an Excel file

In [None]:
# rural_sample.to_excel(r'rural_sample.xlsx',index=False)

## **Non Residential Sample**
In the next cell, the dataframe is filtered to keep only the rows whose use (`Uso`) is *Industrial*, *Comercial* or *Oficial*.

In [139]:
df = pd.read_excel('db_raw_guarne.xlsx')
# NOTE(review): the sample-size cell for this group prints 112; 140 is
# hard-coded here — confirm the intended size.
non_residential_sample_size = 140
# Fixed seed so that Restart & Run All draws the same sample every time.
RANDOM_SEED = 42

# Keep only the rows whose 'Uso' is one of the non-residential categories.
non_residential_uses = ['Industrial', 'Comercial', 'Oficial']
nr_df = df[df['Uso'].isin(non_residential_uses)]

# Simple random sample without replacement; raises ValueError if nr_df holds
# fewer rows than requested.
nr_sample = nr_df.sample(
    n=non_residential_sample_size,
    replace=False,
    random_state=RANDOM_SEED,
)
# nr_sample.to_excel(r'Muestra no residencial guarne.xlsx',index=False)