In [1]:
import pandas as pd
import numpy as np

from lazypredict.Supervised import LazyRegressor, LazyClassifier
from sklearn.ensemble import  RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import  train_test_split
import joblib
import pickle

In [2]:
df = pd.read_csv("dummy_data_1000.csv")
df.head()

Unnamed: 0,customer_id,nama,gender,pendidikan,umur,alamat,nomor_telepon,status_pernikahan,pendapatan
0,CUS_000574,Pranata Wahyuni,wanita,SD,26,"Gang Tubagus Ismail No. 891\nMagelang, Kepulau...",+62872-4514-7223,Single,47000000
1,CUS_000750,"Chelsea Damanik, S.E.",pria,Sarjana,28,"Gang Peta No. 0\nBanjarbaru, KS 27475",+62816-4228-6548,Single,66000000
2,CUS_002206,"Putri Napitupulu, M.Farm",pria,Magister,33,"Gang M.H Thamrin No. 32\nSubulussalam, BT 04848",+62829-2265-1008,Sudah menikah,25000000
3,CUS_005022,Teddy Sihotang,pria,SMP,44,"Jalan Tebet Barat Dalam No. 8\nTarakan, PB 39605",+62897-7037-0869,Sudah menikah,96500000
4,CUS_005720,Tira Ardianto,pria,SD,58,"Jl. Erlangga No. 4\nPurwokerto, KI 74286",+62809-4995-4231,Sudah menikah,67000000


In [3]:
def generate_jumlah_anak(row):
    if row['status_pernikahan'] == 'Single':
        return np.nan
    else:
        return np.random.choice([np.nan,0, 1, 2, 3, 4])



In [4]:
df['jumlah_anak'] = df.apply(generate_jumlah_anak, axis=1)

In [5]:
acak_status_kredit= ['sangat baik', 'baik', 'buruk']
acak_riwayat_peminjaman = [np.nan,'baik','tidah pernah']
df['riwayat_peminjaman'] = np.random.choice(acak_riwayat_peminjaman, size=len(df))
df['status_kredit'] = np.random.choice(acak_status_kredit, size=len(df))

In [6]:
df.loc[0,"umur"] = np.nan

In [7]:
df

Unnamed: 0,customer_id,nama,gender,pendidikan,umur,alamat,nomor_telepon,status_pernikahan,pendapatan,jumlah_anak,riwayat_peminjaman,status_kredit
0,CUS_000574,Pranata Wahyuni,wanita,SD,,"Gang Tubagus Ismail No. 891\nMagelang, Kepulau...",+62872-4514-7223,Single,47000000,,,baik
1,CUS_000750,"Chelsea Damanik, S.E.",pria,Sarjana,28.00,"Gang Peta No. 0\nBanjarbaru, KS 27475",+62816-4228-6548,Single,66000000,,baik,sangat baik
2,CUS_002206,"Putri Napitupulu, M.Farm",pria,Magister,33.00,"Gang M.H Thamrin No. 32\nSubulussalam, BT 04848",+62829-2265-1008,Sudah menikah,25000000,2.00,baik,baik
3,CUS_005022,Teddy Sihotang,pria,SMP,44.00,"Jalan Tebet Barat Dalam No. 8\nTarakan, PB 39605",+62897-7037-0869,Sudah menikah,96500000,1.00,,buruk
4,CUS_005720,Tira Ardianto,pria,SD,58.00,"Jl. Erlangga No. 4\nPurwokerto, KI 74286",+62809-4995-4231,Sudah menikah,67000000,1.00,baik,baik
...,...,...,...,...,...,...,...,...,...,...,...,...
995,CUS_992065,Ina Samosir,pria,SMP,54.00,"Gg. Raya Setiabudhi No. 066\nCirebon, BA 91958",+62866-3385-0550,Single,31500000,,tidah pernah,sangat baik
996,CUS_992747,Ratna Pradana,pria,Sarjana,39.00,"Gang Sadang Serang No. 897\nKediri, Papua 19172",+62806-3493-4494,Single,89000000,,tidah pernah,sangat baik
997,CUS_997479,"Siska Rajata, S.I.Kom",pria,Sarjana,34.00,"Gang KH Amin Jasuta No. 9\nBanjarbaru, KB 80063",+62806-8492-6277,Sudah menikah,89000000,4.00,baik,buruk
998,CUS_999664,Aisyah Wahyuni,pria,SMA/SMK/MA,29.00,"Gang Indragiri No. 70\nKotamobagu, Kalimantan ...",+62891-4590-3389,Single,50000000,,tidah pernah,buruk


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         1000 non-null   object 
 1   nama                1000 non-null   object 
 2   gender              1000 non-null   object 
 3   pendidikan          1000 non-null   object 
 4   umur                999 non-null    float64
 5   alamat              1000 non-null   object 
 6   nomor_telepon       1000 non-null   object 
 7   status_pernikahan   1000 non-null   object 
 8   pendapatan          1000 non-null   int64  
 9   jumlah_anak         436 non-null    float64
 10  riwayat_peminjaman  1000 non-null   object 
 11  status_kredit       1000 non-null   object 
dtypes: float64(2), int64(1), object(9)
memory usage: 93.9+ KB


# Data Pipeline

In [9]:
df.isnull().sum()

customer_id             0
nama                    0
gender                  0
pendidikan              0
umur                    1
alamat                  0
nomor_telepon           0
status_pernikahan       0
pendapatan              0
jumlah_anak           564
riwayat_peminjaman      0
status_kredit           0
dtype: int64

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

In [11]:
class AgeImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):        
        imputer = SimpleImputer()
        X['umur'] = imputer.fit_transform(X[['umur']]).astype(int)
        return X

class JumlahAnakImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X): 
        imputer = SimpleImputer(strategy='mean')

        # Mengisi nilai berdasarkan kondisi dengan menggunakan fit_transform
        X['jumlah_anak'] = imputer.fit_transform(X[['jumlah_anak']]).astype(int)
        X.loc[df['status_pernikahan']=='Single', 'jumlah_anak'] = 0
        return X

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for label, content in X.items():
            if not pd.api.types.is_numeric_dtype(content):
                    # menambahkan +1 karena panda menyandikan kategori yang hilang sebagai -1
                    X[label] = pd.Categorical(content).codes+1  
        return X

class UselessFeature(BaseEstimator, TransformerMixin):
    def fit(self,X):
        return self

        
    def transform(self, X, y=None):

        return X.drop(['customer_id', 'nama','alamat','nomor_telepon'],axis=1)

In [12]:
from sklearn.pipeline import Pipeline

preprocessing_data = Pipeline([
    ('drop_feature',UselessFeature()),
    ('imput_umur',AgeImputer()),
    ('imput_jumlah_anak', JumlahAnakImputer()),
    ('imput_categorical_feature', CategoricalEncoder()),
])

## Klasifikasi

### Modelling

In [13]:
X = df.drop('status_kredit',axis=1)
y = df['status_kredit']

In [14]:
X = preprocessing_data.fit_transform(X)
X

Unnamed: 0,gender,pendidikan,umur,status_pernikahan,pendapatan,jumlah_anak,riwayat_peminjaman
0,2,3,40,1,47000000,0,2
1,1,6,28,1,66000000,0,1
2,1,2,33,2,25000000,2,1
3,1,5,44,2,96500000,1,2
4,1,3,58,2,67000000,1,1
...,...,...,...,...,...,...,...
995,1,5,54,1,31500000,0,3
996,1,6,39,1,89000000,0,3
997,1,6,34,2,89000000,4,1
998,1,4,29,1,50000000,0,3


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [16]:
model = LazyClassifier()

In [17]:
prediksi = model.fit(X_train, X_test, y_train, y_test)

 76%|███████▌  | 22/29 [00:00<00:00, 24.43it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 7
[LightGBM] [Info] Start training from score -1.053400
[LightGBM] [Info] Start training from score -1.183354
[LightGBM] [Info] Start training from score -1.064211


100%|██████████| 29/29 [00:01<00:00, 20.98it/s]


In [18]:
prediksi[0]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTreeClassifier,0.38,0.38,,0.38,0.01
ExtraTreeClassifier,0.37,0.37,,0.37,0.01
CalibratedClassifierCV,0.39,0.37,,0.34,0.13
NearestCentroid,0.35,0.36,,0.35,0.01
Perceptron,0.37,0.35,,0.34,0.01
NuSVC,0.34,0.34,,0.34,0.08
SGDClassifier,0.34,0.34,,0.34,0.01
LGBMClassifier,0.34,0.34,,0.34,0.32
DummyClassifier,0.36,0.33,,0.19,0.01
LabelSpreading,0.33,0.33,,0.33,0.03


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
model = RandomForestClassifier()
model.fit(X_train,y_train)
prediksi = model.predict(X_test)
print(classification_report(y_test, prediksi))

              precision    recall  f1-score   support

        baik       0.36      0.40      0.38        72
       buruk       0.23      0.18      0.20        57
 sangat baik       0.29      0.31      0.30        71

    accuracy                           0.30       200
   macro avg       0.29      0.30      0.29       200
weighted avg       0.30      0.30      0.30       200



### Model Pipeline

In [20]:
X = df.drop('status_kredit',axis=1)
y = df['status_kredit']

In [21]:
X = preprocessing_data.fit_transform(X)
X

Unnamed: 0,gender,pendidikan,umur,status_pernikahan,pendapatan,jumlah_anak,riwayat_peminjaman
0,2,3,40,1,47000000,0,2
1,1,6,28,1,66000000,0,1
2,1,2,33,2,25000000,2,1
3,1,5,44,2,96500000,1,2
4,1,3,58,2,67000000,1,1
...,...,...,...,...,...,...,...
995,1,5,54,1,31500000,0,3
996,1,6,39,1,89000000,0,3
997,1,6,34,2,89000000,4,1
998,1,4,29,1,50000000,0,3


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [23]:
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier 

In [24]:
features_pipeline = Pipeline([
    ('drop_feature',UselessFeature()),
    ('imput_umur',AgeImputer()),
    ('imput_jumlah_anak', JumlahAnakImputer()),
    ('imput_categorical_feature', CategoricalEncoder()),
])

In [25]:
clf = regressors = [
    SVC(),
    SGDClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    XGBClassifier()
]

In [26]:
clf_pipeline = Pipeline([
    ('classifier', clf[0])
])

In [27]:
clf_pipeline.fit(X_train, y_train)

In [28]:
prediksi = clf_pipeline.predict(X_test)
print(classification_report(y_test,prediksi))

              precision    recall  f1-score   support

        baik       0.33      0.31      0.32        72
       buruk       0.00      0.00      0.00        57
 sangat baik       0.34      0.65      0.45        71

    accuracy                           0.34       200
   macro avg       0.23      0.32      0.26       200
weighted avg       0.24      0.34      0.27       200



### Save model joblib

In [29]:
joblib.dump(model, 'klasifikasi_model.joblib')

['klasifikasi_model.joblib']

In [30]:
klasifikasi_model = joblib.load('klasifikasi_model.joblib')

In [31]:
test_data = {'gender': [2],
        'pendidikan': [3],
        'umur': [31],
        'status_pernikahan': [1],
        'pendapatan': [27000000],
        'jumlah_anak': [2],
        'riwayat_peminjaman': [2]}

In [32]:
klasifikasi_model.predict(pd.DataFrame(test_data))

array(['baik'], dtype=object)

### Save model pickle

In [33]:
pickle.dump(model, open('klasifikasi_model.pkl', 'wb'))
klasifikasi_model = pickle.load(open('klasifikasi_model.pkl', 'rb'))

In [34]:
klasifikasi_model.predict(pd.DataFrame(test_data))

array(['baik'], dtype=object)

## Regresi

### Modelling

In [35]:
df.columns

Index(['customer_id', 'nama', 'gender', 'pendidikan', 'umur', 'alamat',
       'nomor_telepon', 'status_pernikahan', 'pendapatan', 'jumlah_anak',
       'riwayat_peminjaman', 'status_kredit'],
      dtype='object')

In [36]:
X = df.drop('pendapatan',axis=1)
y = df['pendapatan']

X = preprocessing_data.fit_transform(X)
X

Unnamed: 0,gender,pendidikan,umur,status_pernikahan,jumlah_anak,riwayat_peminjaman,status_kredit
0,2,3,40,1,0,2,1
1,1,6,28,1,0,1,3
2,1,2,33,2,2,1,1
3,1,5,44,2,1,2,2
4,1,3,58,2,1,1,1
...,...,...,...,...,...,...,...
995,1,5,54,1,0,3,3
996,1,6,39,1,0,3,3
997,1,6,34,2,4,1,2
998,1,4,29,1,0,3,2


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [38]:
model = LazyRegressor()
prediksi = model.fit(X_train, X_test, y_train, y_test)

100%|██████████| 42/42 [00:42<00:00,  1.00s/it]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 7
[LightGBM] [Info] Start training from score 53752500.000000





In [39]:
prediksi[0]

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,-0.01,0.03,26503617.25,0.03
OrthogonalMatchingPursuit,-0.01,0.03,26511724.03,0.01
GradientBoostingRegressor,-0.01,0.02,26582646.19,0.06
LassoLarsIC,-0.03,0.01,26766849.68,0.01
LassoCV,-0.04,-0.0,26911586.95,0.03
LarsCV,-0.04,-0.0,26926011.6,0.02
LassoLarsCV,-0.04,-0.0,26926011.6,0.02
TweedieRegressor,-0.04,-0.0,26940371.83,0.01
GammaRegressor,-0.04,-0.0,26948189.55,0.01
ElasticNet,-0.04,-0.0,26951784.62,0.01


### Model Pipeline

In [40]:
X = df.drop('pendapatan',axis=1)
y = df['pendapatan']
X = preprocessing_data.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [41]:
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor 

regressors = [
    SVR(),
    SGDRegressor(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    ExtraTreeRegressor(),
    GradientBoostingRegressor(),
    AdaBoostRegressor(),
    XGBRegressor()
]

In [42]:
Regression_pipeline = Pipeline([
    ('regressor', regressors[0])
])

In [43]:
Regression_pipeline.fit(X_train, y_train)

In [44]:
prediksi = Regression_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, prediksi)
mse = mean_squared_error(y_test, prediksi)

# Menghitung nilai Root Mean Squared Error (RMSE) menggunakan np.sqrt
rmse = np.sqrt(mse)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


MAE: 22814999.899679326
MSE: 726654990984621.8
RMSE: 26956538.928145465


### Save model joblib

In [45]:
model = RandomForestRegressor()
model.fit(X_train,y_train)
prediksi = model.predict(X_test)
# Menghitung nilai Mean Squared Error (MSE)
mae = mean_absolute_error(y_test, prediksi)
mse = mean_squared_error(y_test, prediksi)

# Menghitung nilai Root Mean Squared Error (RMSE) menggunakan np.sqrt
rmse = np.sqrt(mse)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


MAE: 23549585.714285716
MSE: 784934397499669.8
RMSE: 28016680.700962234


In [46]:
joblib.dump(model, 'regresi_model.joblib')
regresi_model = joblib.load('regresi_model.joblib')

In [47]:
test_data = {'gender': [2],
        'pendidikan': [1],
        'umur': [45],
        'status_pernikahan': [1],
        'jumlah_anak': [3],
        'riwayat_peminjaman': [1],
        'status_kredit': [2]}

In [48]:
regresi_model.predict(pd.DataFrame(test_data))

array([52026750.])

### Save Model Pickle

In [49]:
pickle.dump(model, open('regresi_model.pkl', 'wb'))
regresi_model = pickle.load(open('regresi_model.pkl', 'rb'))
regresi_model.predict(pd.DataFrame(test_data))

array([52026750.])