In [159]:
import pandas as pd
import numpy as np

from lazypredict.Supervised import LazyRegressor, LazyClassifier
from sklearn.ensemble import  RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import  train_test_split
import joblib
import pickle

In [160]:
df = pd.read_csv("dummy_data_1000.csv")
df.head()

Unnamed: 0,customer_id,nama,gender,pendidikan,umur,alamat,nomor_telepon,status_pernikahan,pendapatan
0,CUS_000574,Pranata Wahyuni,wanita,SD,26,"Gang Tubagus Ismail No. 891\nMagelang, Kepulau...",+62872-4514-7223,Single,47000000
1,CUS_000750,"Chelsea Damanik, S.E.",pria,Sarjana,28,"Gang Peta No. 0\nBanjarbaru, KS 27475",+62816-4228-6548,Single,66000000
2,CUS_002206,"Putri Napitupulu, M.Farm",pria,Magister,33,"Gang M.H Thamrin No. 32\nSubulussalam, BT 04848",+62829-2265-1008,Sudah menikah,25000000
3,CUS_005022,Teddy Sihotang,pria,SMP,44,"Jalan Tebet Barat Dalam No. 8\nTarakan, PB 39605",+62897-7037-0869,Sudah menikah,96500000
4,CUS_005720,Tira Ardianto,pria,SD,58,"Jl. Erlangga No. 4\nPurwokerto, KI 74286",+62809-4995-4231,Sudah menikah,67000000


In [161]:
def generate_jumlah_anak(row):
    if row['status_pernikahan'] == 'Single':
        return np.nan
    else:
        return np.random.choice([np.nan,0, 1, 2, 3, 4])



In [162]:
df['jumlah_anak'] = df.apply(generate_jumlah_anak, axis=1)

In [163]:
acak_status_kredit= ['sangat baik', 'baik', 'buruk']
acak_riwayat_peminjaman = [np.nan,'baik','tidah pernah']
df['riwayat_peminjaman'] = np.random.choice(acak_riwayat_peminjaman, size=len(df))
df['status_kredit'] = np.random.choice(acak_status_kredit, size=len(df))

In [164]:
df.loc[0,"umur"] = np.nan

In [165]:
df

Unnamed: 0,customer_id,nama,gender,pendidikan,umur,alamat,nomor_telepon,status_pernikahan,pendapatan,jumlah_anak,riwayat_peminjaman,status_kredit
0,CUS_000574,Pranata Wahyuni,wanita,SD,,"Gang Tubagus Ismail No. 891\nMagelang, Kepulau...",+62872-4514-7223,Single,47000000,,baik,baik
1,CUS_000750,"Chelsea Damanik, S.E.",pria,Sarjana,28.00,"Gang Peta No. 0\nBanjarbaru, KS 27475",+62816-4228-6548,Single,66000000,,,sangat baik
2,CUS_002206,"Putri Napitupulu, M.Farm",pria,Magister,33.00,"Gang M.H Thamrin No. 32\nSubulussalam, BT 04848",+62829-2265-1008,Sudah menikah,25000000,2.00,baik,buruk
3,CUS_005022,Teddy Sihotang,pria,SMP,44.00,"Jalan Tebet Barat Dalam No. 8\nTarakan, PB 39605",+62897-7037-0869,Sudah menikah,96500000,0.00,,sangat baik
4,CUS_005720,Tira Ardianto,pria,SD,58.00,"Jl. Erlangga No. 4\nPurwokerto, KI 74286",+62809-4995-4231,Sudah menikah,67000000,1.00,,baik
...,...,...,...,...,...,...,...,...,...,...,...,...
995,CUS_992065,Ina Samosir,pria,SMP,54.00,"Gg. Raya Setiabudhi No. 066\nCirebon, BA 91958",+62866-3385-0550,Single,31500000,,,baik
996,CUS_992747,Ratna Pradana,pria,Sarjana,39.00,"Gang Sadang Serang No. 897\nKediri, Papua 19172",+62806-3493-4494,Single,89000000,,,buruk
997,CUS_997479,"Siska Rajata, S.I.Kom",pria,Sarjana,34.00,"Gang KH Amin Jasuta No. 9\nBanjarbaru, KB 80063",+62806-8492-6277,Sudah menikah,89000000,1.00,,baik
998,CUS_999664,Aisyah Wahyuni,pria,SMA/SMK/MA,29.00,"Gang Indragiri No. 70\nKotamobagu, Kalimantan ...",+62891-4590-3389,Single,50000000,,tidah pernah,sangat baik


In [166]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         1000 non-null   object 
 1   nama                1000 non-null   object 
 2   gender              1000 non-null   object 
 3   pendidikan          1000 non-null   object 
 4   umur                999 non-null    float64
 5   alamat              1000 non-null   object 
 6   nomor_telepon       1000 non-null   object 
 7   status_pernikahan   1000 non-null   object 
 8   pendapatan          1000 non-null   int64  
 9   jumlah_anak         427 non-null    float64
 10  riwayat_peminjaman  1000 non-null   object 
 11  status_kredit       1000 non-null   object 
dtypes: float64(2), int64(1), object(9)
memory usage: 93.9+ KB


# Data Pipeline

In [167]:
df.isnull().sum()

customer_id             0
nama                    0
gender                  0
pendidikan              0
umur                    1
alamat                  0
nomor_telepon           0
status_pernikahan       0
pendapatan              0
jumlah_anak           573
riwayat_peminjaman      0
status_kredit           0
dtype: int64

In [168]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

In [169]:
class AgeImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):        
        imputer = SimpleImputer()
        X['umur'] = imputer.fit_transform(X[['umur']]).astype(int)
        return X

class JumlahAnakImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X): 
        imputer = SimpleImputer(strategy='mean')

        # Mengisi nilai berdasarkan kondisi dengan menggunakan fit_transform
        X['jumlah_anak'] = imputer.fit_transform(X[['jumlah_anak']]).astype(int)
        X.loc[df['status_pernikahan']=='Single', 'jumlah_anak'] = 0
        return X

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for label, content in X.items():
            if not pd.api.types.is_numeric_dtype(content):
                    # menambahkan +1 karena panda menyandikan kategori yang hilang sebagai -1
                    X[label] = pd.Categorical(content).codes+1  
        return X

class UselessFeature(BaseEstimator, TransformerMixin):
    def fit(self,X):
        return self

        
    def transform(self, X, y=None):

        return X.drop(['customer_id', 'nama','alamat','nomor_telepon'],axis=1)

In [170]:
from sklearn.pipeline import Pipeline

preprocessing_data = Pipeline([
    ('drop_feature',UselessFeature()),
    ('imput_umur',AgeImputer()),
    ('imput_jumlah_anak', JumlahAnakImputer()),
    ('imput_categorical_feature', CategoricalEncoder()),
])

## Klasifikasi

### Modelling

In [171]:
X = df.drop('status_kredit',axis=1)
y = df['status_kredit']

In [172]:
X = preprocessing_data.fit_transform(X)
X

Unnamed: 0,gender,pendidikan,umur,status_pernikahan,pendapatan,jumlah_anak,riwayat_peminjaman
0,2,3,40,1,47000000,0,1
1,1,6,28,1,66000000,0,2
2,1,2,33,2,25000000,2,1
3,1,5,44,2,96500000,0,2
4,1,3,58,2,67000000,1,2
...,...,...,...,...,...,...,...
995,1,5,54,1,31500000,0,2
996,1,6,39,1,89000000,0,2
997,1,6,34,2,89000000,1,2
998,1,4,29,1,50000000,0,3


In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [174]:
model = LazyClassifier()

In [175]:
prediksi = model.fit(X_train, X_test, y_train, y_test)

100%|██████████| 29/29 [00:01<00:00, 23.79it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 7
[LightGBM] [Info] Start training from score -1.123930
[LightGBM] [Info] Start training from score -1.101115
[LightGBM] [Info] Start training from score -1.071484





In [176]:
prediksi[0]

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,0.39,0.4,,0.38,0.13
NuSVC,0.4,0.39,,0.39,0.08
PassiveAggressiveClassifier,0.39,0.39,,0.39,0.01
RidgeClassifier,0.36,0.37,,0.35,0.01
LinearSVC,0.36,0.37,,0.35,0.03
RidgeClassifierCV,0.35,0.37,,0.34,0.01
CalibratedClassifierCV,0.34,0.37,,0.26,0.14
LinearDiscriminantAnalysis,0.35,0.37,,0.35,0.01
LogisticRegression,0.35,0.37,,0.35,0.01
GaussianNB,0.36,0.37,,0.36,0.01


In [177]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
model = RandomForestClassifier()
model.fit(X_train,y_train)
prediksi = model.predict(X_test)
print(classification_report(y_test, prediksi))

              precision    recall  f1-score   support

        baik       0.37      0.29      0.33        78
       buruk       0.34      0.40      0.37        58
 sangat baik       0.24      0.27      0.25        64

    accuracy                           0.32       200
   macro avg       0.32      0.32      0.32       200
weighted avg       0.32      0.32      0.32       200



### Model Pipeline

In [178]:
X = df.drop('status_kredit',axis=1)
y = df['status_kredit']

In [179]:
X = preprocessing_data.fit_transform(X)
X

Unnamed: 0,gender,pendidikan,umur,status_pernikahan,pendapatan,jumlah_anak,riwayat_peminjaman
0,2,3,40,1,47000000,0,1
1,1,6,28,1,66000000,0,2
2,1,2,33,2,25000000,2,1
3,1,5,44,2,96500000,0,2
4,1,3,58,2,67000000,1,2
...,...,...,...,...,...,...,...
995,1,5,54,1,31500000,0,2
996,1,6,39,1,89000000,0,2
997,1,6,34,2,89000000,1,2
998,1,4,29,1,50000000,0,3


In [180]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [181]:
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier 

In [182]:
features_pipeline = Pipeline([
    ('drop_feature',UselessFeature()),
    ('imput_umur',AgeImputer()),
    ('imput_jumlah_anak', JumlahAnakImputer()),
    ('imput_categorical_feature', CategoricalEncoder()),
])

In [183]:
clf = [
    SVC(),
    SGDClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    XGBClassifier()
]

In [184]:
clf_pipeline = Pipeline([
    ('classifier', clf[0])
])

In [185]:
clf_pipeline.fit(X_train, y_train)

In [186]:
prediksi = clf_pipeline.predict(X_test)
print(classification_report(y_test,prediksi))

              precision    recall  f1-score   support

        baik       0.40      0.47      0.43        78
       buruk       0.00      0.00      0.00        58
 sangat baik       0.29      0.48      0.36        64

    accuracy                           0.34       200
   macro avg       0.23      0.32      0.27       200
weighted avg       0.25      0.34      0.28       200



### Save model joblib

In [187]:
X_train

Unnamed: 0,gender,pendidikan,umur,status_pernikahan,pendapatan,jumlah_anak,riwayat_peminjaman
29,1,1,56,1,25500000,0,2
535,2,3,45,1,34500000,0,3
695,1,6,35,2,57000000,0,3
557,2,4,48,1,73000000,0,3
836,2,6,55,1,31000000,0,3
...,...,...,...,...,...,...,...
106,2,6,49,1,37000000,0,3
270,1,6,36,1,49000000,0,2
860,1,5,34,2,7000000,2,2
435,1,5,37,1,66500000,0,2


In [188]:
joblib.dump(model, 'klasifikasi_model.joblib')

['klasifikasi_model.joblib']

In [189]:
klasifikasi_model = joblib.load('klasifikasi_model.joblib')

In [190]:
test_data = {'gender': [2],
        'pendidikan': [3],
        'umur': [31],
        'status_pernikahan': [1],
        'pendapatan': [27000000],
        'jumlah_anak': [2],
        'riwayat_peminjaman': [2]}

In [191]:
[2,3,31,1,2700000,2,2]

[2, 3, 31, 1, 2700000, 2, 2]

In [192]:
klasifikasi_model.predict(pd.DataFrame(test_data))

array(['sangat baik'], dtype=object)

### Save model pickle

In [193]:
pickle.dump(model, open('klasifikasi_model.pkl', 'wb'))
klasifikasi_model = pickle.load(open('klasifikasi_model.pkl', 'rb'))

In [194]:
klasifikasi_model.predict(pd.DataFrame(test_data))

array(['sangat baik'], dtype=object)

## Regresi

### Modelling

In [195]:
df.columns

Index(['customer_id', 'nama', 'gender', 'pendidikan', 'umur', 'alamat',
       'nomor_telepon', 'status_pernikahan', 'pendapatan', 'jumlah_anak',
       'riwayat_peminjaman', 'status_kredit'],
      dtype='object')

In [196]:
X = df.drop('pendapatan',axis=1)
y = df['pendapatan']

X = preprocessing_data.fit_transform(X)
X

Unnamed: 0,gender,pendidikan,umur,status_pernikahan,jumlah_anak,riwayat_peminjaman,status_kredit
0,2,3,40,1,0,1,1
1,1,6,28,1,0,2,3
2,1,2,33,2,2,1,2
3,1,5,44,2,0,2,3
4,1,3,58,2,1,2,1
...,...,...,...,...,...,...,...
995,1,5,54,1,0,2,1
996,1,6,39,1,0,2,2
997,1,6,34,2,1,2,1
998,1,4,29,1,0,3,3


In [197]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [198]:
model = LazyRegressor()
prediksi = model.fit(X_train, X_test, y_train, y_test)

100%|██████████| 42/42 [00:56<00:00,  1.34s/it]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 7
[LightGBM] [Info] Start training from score 53752500.000000





In [199]:
prediksi[0]

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,0.02,0.05,26179077.95,0.02
OrthogonalMatchingPursuitCV,-0.01,0.03,26511724.03,0.01
OrthogonalMatchingPursuit,-0.01,0.03,26511724.03,0.01
LassoLarsIC,-0.02,0.02,26678880.98,0.01
LarsCV,-0.02,0.01,26741295.07,0.02
LassoLarsCV,-0.02,0.01,26741295.07,0.01
LassoCV,-0.02,0.01,26743412.55,0.03
ElasticNet,-0.03,0.01,26792123.12,0.01
TweedieRegressor,-0.03,0.01,26812382.42,0.01
GammaRegressor,-0.03,0.01,26816203.31,0.01


### Model Pipeline

In [200]:
X = df.drop('pendapatan',axis=1)
y = df['pendapatan']
X = preprocessing_data.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [201]:
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor 

regressors = [
    SVR(),
    SGDRegressor(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    ExtraTreeRegressor(),
    GradientBoostingRegressor(),
    AdaBoostRegressor(),
    XGBRegressor()
]

In [202]:
Regression_pipeline = Pipeline([
    ('regressor', regressors[0])
])

In [203]:
Regression_pipeline.fit(X_train, y_train)

In [204]:
prediksi = Regression_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, prediksi)
mse = mean_squared_error(y_test, prediksi)

# Menghitung nilai Root Mean Squared Error (RMSE) menggunakan np.sqrt
rmse = np.sqrt(mse)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


MAE: 22814999.891966943
MSE: 726654990452121.5
RMSE: 26956538.91826845


### Save model joblib

In [205]:
model = RandomForestRegressor()
model.fit(X_train,y_train)
prediksi = model.predict(X_test)
# Menghitung nilai Mean Squared Error (MSE)
mae = mean_absolute_error(y_test, prediksi)
mse = mean_squared_error(y_test, prediksi)

# Menghitung nilai Root Mean Squared Error (RMSE) menggunakan np.sqrt
rmse = np.sqrt(mse)

print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


MAE: 23751812.579365082
MSE: 814806258545607.9
RMSE: 28544811.411981825


In [206]:
joblib.dump(model, 'regresi_model.joblib')
regresi_model = joblib.load('regresi_model.joblib')

In [207]:
test_data = {'gender': [2],
        'pendidikan': [1],
        'umur': [45],
        'status_pernikahan': [1],
        'jumlah_anak': [3],
        'riwayat_peminjaman': [1],
        'status_kredit': [2]}

In [208]:
regresi_model.predict(pd.DataFrame(test_data))

array([45391833.33333334])

### Save Model Pickle

In [209]:
pickle.dump(model, open('regresi_model.pkl', 'wb'))
regresi_model = pickle.load(open('regresi_model.pkl', 'rb'))
regresi_model.predict(pd.DataFrame(test_data))

array([45391833.33333334])