In [123]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler


In [355]:
df = pd.read_csv('Pendapatan.csv')

In [356]:
df.isnull().any()

id                       False
Umur                     False
Kelas Pekerja            False
Berat Akhir              False
Pendidikan               False
Jmlh Tahun Pendidikan    False
Status Perkawinan        False
Pekerjaan                False
Jenis Kelamin            False
Keuntungan Kapital       False
Kerugian Capital         False
Jam per Minggu           False
Gaji                     False
dtype: bool

In [357]:
df['Kelas Pekerja'].value_counts()

Wiraswasta                       26589
Pekerja Bebas Bukan Perusahan     3072
Pemerintah Lokal                  2454
?                                 2204
Pemerintah Negara                 1579
Pekerja Bebas Perusahaan          1345
Pemerintah Provinsi               1128
Tanpa di Bayar                      16
Tidak Pernah Bekerja                 6
Name: Kelas Pekerja, dtype: int64

In [358]:
df[(df['Kelas Pekerja'] != '?') & (df['Pekerjaan'] == '?')]
df['Pekerjaan'] = np.where(((df['Kelas Pekerja'] != '?') & (df['Pekerjaan'] == '?')),'Pengangguran',df['Pekerjaan'])
df['Pekerjaan'].value_counts() 

Spesialis                4911
Ekesekutif Managerial    4790
Perbaikan Kerajinan      4788
Pemuka Agama             4408
Sales                    4323
Servis Lainnya           3859
Mesin Inspeksi           2380
?                        2204
Supir                    1859
Pembersih                1650
Petani                   1150
Tech-support             1122
Penjaga                   734
Asisten Rumah Tangga      198
Tentara                    11
Pengangguran                6
Name: Pekerjaan, dtype: int64

In [359]:
df = df[(df['Kelas Pekerja'] != '?') & (df['Pekerjaan'] != '?')]

In [360]:
df['Jmlh Tahun Pendidikan'].value_counts()

9     11789
10     7869
13     6115
14     2045
11     1546
7      1266
12     1199
6       974
15      648
4       636
5       554
16      464
8       464
3       385
2       180
1        55
Name: Jmlh Tahun Pendidikan, dtype: int64

In [361]:
df = df.drop(['Kerugian Capital', 'Keuntungan Kapital'], axis= 1)
df

Unnamed: 0,id,Umur,Kelas Pekerja,Berat Akhir,Pendidikan,Jmlh Tahun Pendidikan,Status Perkawinan,Pekerjaan,Jenis Kelamin,Jam per Minggu,Gaji
0,27247,59,Pemerintah Negara,139616,Master,14,Menikah,Ekesekutif Managerial,Laki2,50.0,1
1,1640,52,Wiraswasta,158993,SMA,9,Cerai,Servis Lainnya,Perempuan,38.0,0
2,45206,52,Pekerja Bebas Bukan Perusahan,284648,SMA,9,Cerai,Ekesekutif Managerial,Perempuan,99.0,1
3,16154,45,Wiraswasta,132847,SMA,9,Belum Pernah Menikah,Pembersih,Perempuan,40.0,0
4,43023,28,Wiraswasta,103432,SMA,9,Belum Pernah Menikah,Supir,Laki2,45.0,1
...,...,...,...,...,...,...,...,...,...,...,...
38388,46220,47,Pekerja Bebas Bukan Perusahan,148169,SMA,9,Menikah,Perbaikan Kerajinan,Laki2,40.0,0
38389,33268,69,Pekerja Bebas Perusahaan,264722,D3,12,Menikah,Sales,Laki2,40.0,1
38390,44845,24,Pekerja Bebas Bukan Perusahan,31606,Sarjana,13,Menikah,Spesialis,Perempuan,20.0,1
38391,4517,47,Wiraswasta,197836,SMA,9,Menikah,Sales,Laki2,45.0,0


In [362]:
education_mapping = {
    'SMA':0, 
    'Pendidikan Tinggi':1, 
    'Sarjana':2, 
    'Master':3, 
    'D4':4, 
    '11th':5,
    'D3':6,
    '10th':7,
    'Sekolah Professional':8,
    '7th-8th':9,
    '9th':10,
    '12th':11,
    'Doktor':12,
    '5th-6th':13,
    '1st-4th':14,
    'SD':15
}
df['Pendidikan'] = df['Pendidikan'].map(education_mapping)

In [363]:
df['Pendidikan'].value_counts()

0     11789
1      7869
2      6115
3      2045
4      1546
5      1266
6      1199
7       974
8       648
9       636
10      554
12      464
11      464
13      385
14      180
15       55
Name: Pendidikan, dtype: int64

In [364]:
sex_mapping = {
    'Perempuan':0, 
    'Laki2':1
}
df['Jenis Kelamin'] = df['Jenis Kelamin'].map(sex_mapping)

In [365]:
marital_mapping = {
    'Menikah':0, 
    'Belum Pernah Menikah':1, 
    'Cerai':2, 
    'Berpisah':3, 
    'Janda':4, 
    'Menikah LDR':5
}
df['Status Perkawinan'] = df['Status Perkawinan'].map(marital_mapping)

In [366]:
job_mapping = {
    'Spesialis':0, 
    'Ekesekutif Managerial':1, 
    'Perbaikan Kerajinan':2, 
    'Pemuka Agama':3, 
    'Sales':4, 
    'Servis Lainnya':5,
    'Mesin Inspeksi':6,
    'Supir':7,
    'Pembersih':8,
    'Petani':9,
    'Tech-support':10,
    'Penjaga':11,
    'Asisten Rumah Tangga':12,
    'Tentara':13,
    'Pengangguran':14
}
df['Pekerjaan'] = df['Pekerjaan'].map(job_mapping)

In [367]:
df['Kelas Pekerja'].value_counts()

Wiraswasta                       26589
Pekerja Bebas Bukan Perusahan     3072
Pemerintah Lokal                  2454
Pemerintah Negara                 1579
Pekerja Bebas Perusahaan          1345
Pemerintah Provinsi               1128
Tanpa di Bayar                      16
Tidak Pernah Bekerja                 6
Name: Kelas Pekerja, dtype: int64

In [368]:
working_class_mapping = {
    'Wiraswasta':0, 
    'Pekerja Bebas Bukan Perusahan':1, 
    'Pemerintah Lokal':2, 
    'Pemerintah Negara':3, 
    'Pekerja Bebas Perusahaan':4, 
    'Pemerintah Provinsi':5,
    'Tanpa di Bayar':6,
    'Tidak Pernah Bekerja':7,
}
df['Kelas Pekerja'] = df['Kelas Pekerja'].map(working_class_mapping)

In [369]:
df

Unnamed: 0,id,Umur,Kelas Pekerja,Berat Akhir,Pendidikan,Jmlh Tahun Pendidikan,Status Perkawinan,Pekerjaan,Jenis Kelamin,Jam per Minggu,Gaji
0,27247,59,3,139616,3,14,0,1,1,50.0,1
1,1640,52,0,158993,0,9,2,5,0,38.0,0
2,45206,52,1,284648,0,9,2,1,0,99.0,1
3,16154,45,0,132847,0,9,1,8,0,40.0,0
4,43023,28,0,103432,0,9,1,7,1,45.0,1
...,...,...,...,...,...,...,...,...,...,...,...
38388,46220,47,1,148169,0,9,0,2,1,40.0,0
38389,33268,69,4,264722,6,12,0,4,1,40.0,1
38390,44845,24,1,31606,2,13,0,0,0,20.0,1
38391,4517,47,0,197836,0,9,0,4,1,45.0,0


In [370]:
std_scaller = StandardScaler()
df[['Jam per Minggu']] = std_scaller.fit_transform(df[['Jam per Minggu']])
df[['Umur']] = std_scaller.fit_transform(df[['Umur']])
df[['Berat Akhir']] = std_scaller.fit_transform(df[['Berat Akhir']])

In [371]:
X = df.drop(['id', 'Gaji'], axis= 1)
y = df['Gaji']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state= 21, stratify= y)

model_knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': np.arange(5, 30),
    'weights': ['uniform', 'distance']
}
rscv = RandomizedSearchCV(model_knn, param_grid, cv=10, scoring= 'roc_auc', n_iter=50)
rscv.fit(X_train, y_train)

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='deprecated', n_iter=50, n_jobs=None,
                   param_distributions={'n_neighbors': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29]),
                                        'weights': ['uniform', 'distance']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='roc_auc', verbose=0)

In [372]:
rscv.best_score_

0.8696634634720798

In [373]:
rscv.best_params_

{'weights': 'uniform', 'n_neighbors': 28}

In [374]:
y_pred_val = rscv.predict_proba(X_test)

In [375]:
y_pred_val

array([[0.89285714, 0.10714286],
       [1.        , 0.        ],
       [1.        , 0.        ],
       ...,
       [0.92857143, 0.07142857],
       [0.78571429, 0.21428571],
       [0.35714286, 0.64285714]])

In [376]:
y_pred_val_above_umr = y_pred_val[:, 1]

In [377]:
roc_auc_score(y_test, y_pred_val_above_umr)

0.869099771586425

In [378]:
df_test = pd.read_csv('Pendapatan_test.csv')

In [379]:
df_test['Kelas Pekerja'].value_counts()

Wiraswasta                       6714
Pekerja Bebas Bukan Perusahan     732
Pemerintah Lokal                  624
?                                 552
Pemerintah Negara                 376
Pekerja Bebas Perusahaan          316
Pemerintah Provinsi               279
Tanpa di Bayar                      4
Tidak Pernah Bekerja                2
Name: Kelas Pekerja, dtype: int64

In [380]:
df_test['Pekerjaan'] = np.where(((df_test['Kelas Pekerja'] != '?') & (df_test['Pekerjaan'] == '?')),'Pengangguran',df_test['Pekerjaan'])
df_test = df_test.drop(['Kerugian Capital', 'Keuntungan Kapital'], axis= 1)
df_test['Pekerjaan'] = df_test['Pekerjaan'].replace('?', 'Pengangguran')
df_test['Kelas Pekerja'] = df_test['Kelas Pekerja'].replace('?', 'Tidak Pernah Bekerja')

In [381]:
education_mapping = {
    'SMA':0, 
    'Pendidikan Tinggi':1, 
    'Sarjana':2, 
    'Master':3, 
    'D4':4, 
    '11th':5,
    'D3':6,
    '10th':7,
    'Sekolah Professional':8,
    '7th-8th':9,
    '9th':10,
    '12th':11,
    'Doktor':12,
    '5th-6th':13,
    '1st-4th':14,
    'SD':15
}
df_test['Pendidikan'] = df_test['Pendidikan'].map(education_mapping)

In [382]:
sex_mapping = {
    'Perempuan':0, 
    'Laki2':1
}
df_test['Jenis Kelamin'] = df_test['Jenis Kelamin'].map(sex_mapping)


In [383]:
marital_mapping = {
    'Menikah':0, 
    'Belum Pernah Menikah':1, 
    'Cerai':2, 
    'Berpisah':3, 
    'Janda':4, 
    'Menikah LDR':5
}
df_test['Status Perkawinan'] = df_test['Status Perkawinan'].map(marital_mapping)

In [384]:
working_class_mapping = {
    'Wiraswasta':0, 
    'Pekerja Bebas Bukan Perusahan':1, 
    'Pemerintah Lokal':2, 
    'Pemerintah Negara':3, 
    'Pekerja Bebas Perusahaan':4, 
    'Pemerintah Provinsi':5,
    'Tanpa di Bayar':6,
    'Tidak Pernah Bekerja':7,
}
df_test['Kelas Pekerja'] = df_test['Kelas Pekerja'].map(working_class_mapping)


In [385]:
job_mapping = {
    'Spesialis':0, 
    'Ekesekutif Managerial':1, 
    'Perbaikan Kerajinan':2, 
    'Pemuka Agama':3, 
    'Sales':4, 
    'Servis Lainnya':5,
    'Mesin Inspeksi':6,
    'Supir':7,
    'Pembersih':8,
    'Petani':9,
    'Tech-support':10,
    'Penjaga':11,
    'Asisten Rumah Tangga':12,
    'Tentara':13,
    'Pengangguran':14
}
df_test['Pekerjaan'] = df_test['Pekerjaan'].map(job_mapping)

In [386]:
std_scaller = StandardScaler()
df_test[['Jam per Minggu']] = std_scaller.fit_transform(df_test[['Jam per Minggu']])
df_test[['Umur']] = std_scaller.fit_transform(df_test[['Umur']])
df_test[['Berat Akhir']] = std_scaller.fit_transform(df_test[['Berat Akhir']])


In [387]:
df_test

Unnamed: 0,id,Umur,Kelas Pekerja,Berat Akhir,Pendidikan,Jmlh Tahun Pendidikan,Status Perkawinan,Pekerjaan,Jenis Kelamin,Jam per Minggu
0,47933,-0.612932,2,-0.847976,4,11,2,3,0,-0.026687
1,44213,-0.758052,0,0.114735,0,9,0,6,1,0.779877
2,20823,-0.322692,5,0.093384,1,10,0,3,1,-0.026687
3,40565,0.910830,0,1.723954,1,10,0,1,1,-0.026687
4,9860,-1.338533,7,-0.425810,1,10,1,14,0,-0.026687
...,...,...,...,...,...,...,...,...,...,...
9594,13587,1.418751,3,-0.291090,10,5,0,9,1,-0.026687
9595,33231,0.475469,1,-0.431439,5,7,0,2,1,0.779877
9596,18614,-1.338533,0,0.457540,1,10,1,5,0,-0.671938
9597,37566,0.185229,2,0.554840,0,9,3,3,0,-0.026687


In [388]:
X = df_test.drop('id', axis= 1)
y_id = df_test['id']

In [389]:
X.isnull().sum()

Umur                     0
Kelas Pekerja            0
Berat Akhir              0
Pendidikan               0
Jmlh Tahun Pendidikan    0
Status Perkawinan        0
Pekerjaan                0
Jenis Kelamin            0
Jam per Minggu           0
dtype: int64

In [390]:
y_pred_test = rscv.predict(X)

In [391]:
y_pred_test

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [392]:
y_id

0       47933
1       44213
2       20823
3       40565
4        9860
        ...  
9594    13587
9595    33231
9596    18614
9597    37566
9598     6744
Name: id, Length: 9599, dtype: int64

In [393]:
df_res = pd.DataFrame({'id' : y_id, 'Gaji': y_pred_test}) 

In [394]:
df_res.to_csv('pred_res.csv', index= False)