In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import os

In [None]:
# BPS data: province and kabupaten/kota reference tables.
provinsi = pd.read_csv("../datawilayah/provinsi.csv")
# Read the kabupaten/kota `id` as text. These are dotted codes; when parsed
# as floats the trailing zero is lost (the code after 11.09 shows up as 11.1),
# which corrupts the identifier for any later join or export.
kota = pd.read_csv("../datawilayah/kabupaten-kota.csv", dtype={"id": str})
# Wikipedia data: province populations (2020, per the file name).
provinsi_populasi = pd.read_csv("../datawilayah/provinsi-and-populasi-2020.csv")

In [None]:
# Print the province table's column names, dtypes, and non-null counts.
provinsi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      38 non-null     int64 
 1   name    38 non-null     object
dtypes: int64(1), object(1)
memory usage: 736.0+ bytes


In [None]:
# Print the kabupaten/kota table's column names, dtypes, and non-null counts.
kota.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 514 entries, 0 to 513
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      514 non-null    float64
 1   name    514 non-null    object 
dtypes: float64(1), object(1)
memory usage: 8.2+ KB


In [None]:
# Preview the first 38 rows of the kabupaten/kota table.
kota.head(38)

Unnamed: 0,id,name
0,11.01,Aceh Selatan
1,11.02,Aceh Tenggara
2,11.03,Aceh Timur
3,11.04,Aceh Tengah
4,11.05,Aceh Barat
5,11.06,Aceh Besar
6,11.07,Pidie
7,11.08,Aceh Utara
8,11.09,Simeulue
9,11.1,Aceh Singkil


In [None]:
# Preview the first 38 rows of the province table (it has 38 entries, so this shows all of them).
provinsi.head(38)

Unnamed: 0,id,name
0,11,Aceh (NAD)
1,12,Sumatera Utara
2,13,Sumatera Barat
3,14,Riau
4,15,Jambi
5,16,Sumatera Selatan
6,17,Bengkulu
7,18,Lampung
8,19,Kepulauan Bangka Belitung
9,21,Kepulauan Riau


### Merge provinsi.csv with kabupaten-kota.csv (count kabupaten/kota per province)

In [None]:
# Clean up header names and make both id columns strings so they can be joined.
provinsi.columns = provinsi.columns.str.strip()
provinsi['id'] = provinsi['id'].astype(str)

kota.columns = kota.columns.str.strip()
kota['id'] = kota['id'].astype(str)

# The province code is the part of the kabupaten/kota id before the dot.
kota['provinsi_id'] = kota['id'].str.split('.').str[0]

# Number of kabupaten/kota rows per province code.
city_counts = (
    kota.groupby('provinsi_id')
    .size()
    .rename('jumlah_kota')
    .reset_index()
)

# Left-join onto the province table so every province is kept; provinces
# without any matching rows get a count of 0.
result = provinsi.merge(city_counts, how='left', left_on='id', right_on='provinsi_id')
result = result.assign(total_city=result['jumlah_kota'].fillna(0).astype(int))
result = result.sort_values('total_city', ascending=False)

# Show provinces ordered from most to fewest kabupaten/kota.
print(result[['id', 'name', 'total_city']])

    id                       name  total_city
14  35                 Jawa Timur          38
12  33                Jawa Tengah          35
1   12             Sumatera Utara          33
11  32                 Jawa Barat          27
26  73           Sulawesi Selatan          24
0   11                 Aceh (NAD)          23
18  53  Nusa Tenggara Timur (NTT)          22
2   13             Sumatera Barat          19
27  74          Sulawesi Tenggara          17
5   16           Sumatera Selatan          17
24  71             Sulawesi Utara          15
7   18                    Lampung          15
20  62          Kalimantan Tengah          14
19  61           Kalimantan Barat          14
21  63         Kalimantan Selatan          13
25  72            Sulawesi Tengah          13
3   14                       Riau          12
4   15                      Jambi          11
30  81                     Maluku          11
31  82               Maluku Utara          10
6   17                   Bengkulu 

In [None]:
# Sanity check: number of distinct provinces and the overall kabupaten/kota tally.
print(f"Total provinces: {result['id'].nunique()}")
print(f"Total kabupaten/kota: {result['total_city'].sum()}")

Total provinces: 38
Total kabupaten/kota: 514


### Merge provinsi.csv with provinsi-and-populasi-2020.csv

In [None]:
# Trim surrounding whitespace from the names, since the join key is the name itself.
provinsi["name"] = provinsi["name"].str.strip()
provinsi_populasi["name"] = provinsi_populasi["name"].str.strip()

# Attach population to each province (left join keeps provinces that have
# no population match), then order from most to least populous.
merged = (
    provinsi
    .merge(provinsi_populasi, how="left", on="name")
    .sort_values("population", ascending=False)
)

print(merged)

    id                       name  population
11  32                 Jawa Barat  48274162.0
14  35                 Jawa Timur  40665696.0
12  33                Jawa Tengah  36516035.0
1   12             Sumatera Utara  14799361.0
15  36                     Banten  11904562.0
10  31                DKI Jakarta  10562088.0
26  73           Sulawesi Selatan   9073509.0
7   18                    Lampung   9007848.0
5   16           Sumatera Selatan   8467432.0
3   14                       Riau   6394087.0
2   13             Sumatera Barat   5534472.0
19  61           Kalimantan Barat   5414390.0
18  53  Nusa Tenggara Timur (NTT)   5325566.0
17  52  Nusa Tenggara Barat (NTB)   5320092.0
0   11                 Aceh (NAD)   5274871.0
16  51                       Bali   4317404.0
32  91                      Papua   4303707.0
21  63         Kalimantan Selatan   4073584.0
22  64           Kalimantan Timur   3766039.0
13  34              DI Yogyakarta   3668719.0
4   15                      Jambi 

### Combine total_city and population into one table

In [None]:
# One row per province carrying both its kabupaten/kota count and its population.
final = result[['id', 'name', 'total_city']].merge(
    merged[['id', 'name', 'population']],
    how='left',
    on=['id', 'name'],
)

In [None]:
# proportion_i = population_i / total_population
# (sum() skips missing populations, so they do not affect the total)
total_pop = final['population'].sum()

# Rounded proportions are kept for display only.
final['proporsi'] = (final['population'] / total_pop).round(3)
final['proporsi (%)'] = (final['proporsi'] * 100).round(2)
final = final.sort_values(by='population', ascending=False)

# sample_i = proportion_i x n
# Allocate from the unrounded proportions using largest-remainder rounding so
# the per-province sample sizes are whole numbers that add up to exactly n.
# Allocating from the 3-decimal `proporsi` made the total drift to 1002.
n = 1000
quota = final['population'] / total_pop * n
sampel = np.floor(quota)
# Units still unassigned after flooring go to the largest fractional parts;
# rows with a missing population are excluded and keep a NaN sample.
leftover = int(round(n - sampel.sum()))
bonus_idx = (quota - sampel).dropna().nlargest(leftover).index
sampel.loc[bonus_idx] += 1
final["sample"] = sampel

final.head(38)

Unnamed: 0,id,name,total_city,population,proporsi,proporsi (%),sample
3,32,Jawa Barat,27,48274162.0,0.179,17.9,179.0
0,35,Jawa Timur,38,40665696.0,0.151,15.1,151.0
1,33,Jawa Tengah,35,36516035.0,0.135,13.5,135.0
2,12,Sumatera Utara,33,14799361.0,0.055,5.5,55.0
27,36,Banten,8,11904562.0,0.044,4.4,44.0
34,31,DKI Jakarta,6,10562088.0,0.039,3.9,39.0
4,73,Sulawesi Selatan,24,9073509.0,0.034,3.4,34.0
11,18,Lampung,15,9007848.0,0.033,3.3,33.0
9,16,Sumatera Selatan,17,8467432.0,0.031,3.1,31.0
16,14,Riau,12,6394087.0,0.024,2.4,24.0


In [None]:
scaling_factor = 1.2  # 20% increase over the base allocation
# Scale, round to whole units, and store as nullable Int64 so rows with a
# missing sample stay missing instead of raising on the integer cast.
final["sample_scaled"] = (final["sample"] * scaling_factor).round().astype("Int64")
final.head(38)


Unnamed: 0,id,name,total_city,population,proporsi,proporsi (%),sample,sample_scaled
3,32,Jawa Barat,27,48274162.0,0.179,17.9,179.0,215.0
0,35,Jawa Timur,38,40665696.0,0.151,15.1,151.0,181.0
1,33,Jawa Tengah,35,36516035.0,0.135,13.5,135.0,162.0
2,12,Sumatera Utara,33,14799361.0,0.055,5.5,55.0,66.0
27,36,Banten,8,11904562.0,0.044,4.4,44.0,53.0
34,31,DKI Jakarta,6,10562088.0,0.039,3.9,39.0,47.0
4,73,Sulawesi Selatan,24,9073509.0,0.034,3.4,34.0,41.0
11,18,Lampung,15,9007848.0,0.033,3.3,33.0,40.0
9,16,Sumatera Selatan,17,8467432.0,0.031,3.1,31.0,37.0
16,14,Riau,12,6394087.0,0.024,2.4,24.0,29.0


In [None]:
# Overall sample size before and after applying the scaling factor.
sample_before, sample_after = final["sample"].sum(), final["sample_scaled"].sum()
print(f"sample keseluruhan sebelum scale: {sample_before}")
print(f"sample keseluruhan setelah scale: {sample_after}")


sample keseluruhan sebelum scale: 1002.0
sample keseluruhan setelah scale: 1204


#### 