In [1]:
# Importing important libraries

# For data cleaning and visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# For preprocessing and building model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [2]:
# Load the retail dataset: training features, training labels, test features
# and the submission template, in that order.
# NOTE: `format` shadows the Python builtin of the same name; the name is kept
# because later cells read the submission template through it.
train_features, train_label, test, format = (
    pd.read_csv(csv_path)
    for csv_path in (
        'train_features.csv',
        'train_labels.csv',
        'test_features.csv',
        'submission_format.csv',
    )
)

# EDA & Cleaning

In [3]:
train_features.head(20)

Unnamed: 0,tahun_kelahiran,pendidikan,status_pernikahan,pendapatan,jumlah_anak_balita,jumlah_anak_remaja,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,keluhan,tanggal_menjadi_anggota
0,1979,Sarjana,Rencana Menikah,,0.0,1.0,,50575.0,260967.0,50575.0,20230.0,2.0,2.0,5.0,0.0,2014-05-05
1,1950,Sarjana,Rencana Menikah,84063000.0,,,70.0,6069.0,44506.0,80920.0,20230.0,9.0,6.0,4.0,0.0,2013-03-17
2,1966,Sarjana,Menikah,127532564.0,0.0,0.0,45.0,117611.0,265460.0,96341.0,145573.0,1.0,1.0,7.0,0.0,
3,1961,Magister,Rencana Menikah,165579620.0,0.0,0.0,90.0,206346.0,1613901.0,27725.0,125868.0,0.0,7.0,8.0,0.0,
4,1970,Sarjana,Rencana Menikah,117703159.0,1.0,1.0,78.0,90563.0,311757.0,40358.0,33875.0,7.0,6.0,5.0,0.0,
5,1952,Magister,Sendiri,94346105.0,0.0,0.0,9.0,33509.0,49228.0,0.0,,1.0,4.0,2.0,0.0,
6,1963,Magister,Sendiri,75313000.0,2.0,,96.0,0.0,14161.0,8092.0,2023.0,1.0,4.0,5.0,0.0,2013-05-03
7,1959,Sarjana,Rencana Menikah,137916316.0,0.0,0.0,21.0,38505.0,1146659.0,192956.0,240718.0,4.0,,6.0,0.0,
8,1957,Sarjana,Sendiri,138069883.0,0.0,0.0,26.0,56486.0,,193754.0,202737.0,3.0,9.0,5.0,0.0,
9,1975,Sarjana,Menikah,37567504.0,1.0,,67.0,19061.0,28067.0,22223.0,14146.0,3.0,2.0,1.0,0.0,


In [4]:
train_features.loc[train_features['jumlah_anak_balita'].isnull()]

Unnamed: 0,tahun_kelahiran,pendidikan,status_pernikahan,pendapatan,jumlah_anak_balita,jumlah_anak_remaja,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,keluhan,tanggal_menjadi_anggota
1,1950,Sarjana,Rencana Menikah,84063000.0,,,70.0,6069.0,44506.0,80920.0,20230.0,9.0,6.0,4.0,0.0,2013-03-17
13,1979,SMP,Menikah,14967000.0,,0.0,65.0,34391.0,30345.0,32368.0,0.0,0.0,5.0,2.0,0.0,2012-11-14
17,1973,Sarjana,Rencana Menikah,119766595.0,,1.0,16.0,20230.0,181594.0,10650.0,51288.0,0.0,7.0,8.0,0.0,
43,1954,Magister,Rencana Menikah,105339251.0,,1.0,58.0,4046.0,154472.0,15851.0,442.0,7.0,9.0,4.0,0.0,
52,1956,Magister,Rencana Menikah,190785436.0,,0.0,59.0,77286.0,1082787.0,112601.0,19302.0,0.0,9.0,6.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3731,1990,SMA,Menikah,125535217.0,,2.0,41.0,35834.0,,,103129.0,0.0,0.0,4.0,0.0,
3783,1966,Magister,Rencana Menikah,65730000.0,,1.0,10.0,0.0,52598.0,,0.0,2.0,4.0,,0.0,2012-09-07
3792,1956,Doktor,Rencana Menikah,136934000.0,,2.0,86.0,76874.0,,24276.0,0.0,5.0,13.0,3.0,0.0,2013-03-07
3795,1987,SMA,Menikah,165167035.0,,0.0,20.0,,674183.0,54447.0,53628.0,2.0,3.0,8.0,0.0,


In [5]:
train_features.isnull().sum()

tahun_kelahiran               0
pendidikan                  189
status_pernikahan           212
pendapatan                  190
jumlah_anak_balita          190
jumlah_anak_remaja          204
terakhir_belanja            172
belanja_buah                181
belanja_daging              178
belanja_ikan                193
belanja_kue                 214
pembelian_diskon            178
pembelian_web               165
pembelian_toko              169
keluhan                     196
tanggal_menjadi_anggota    2752
dtype: int64

In [6]:
train_features.columns

Index(['tahun_kelahiran', 'pendidikan', 'status_pernikahan', 'pendapatan',
       'jumlah_anak_balita', 'jumlah_anak_remaja', 'terakhir_belanja',
       'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue',
       'pembelian_diskon', 'pembelian_web', 'pembelian_toko', 'keluhan',
       'tanggal_menjadi_anggota'],
      dtype='object')

In [7]:
test.head()

Unnamed: 0,ID,tahun_kelahiran,pendidikan,status_pernikahan,pendapatan,jumlah_anak_balita,jumlah_anak_remaja,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,keluhan,tanggal_menjadi_anggota
0,2241,1957,Sarjana,,120660151.0,0.0,1.0,63.0,122277.0,541399.0,214192.0,84305.0,4.0,6.0,10.0,0.0,
1,2274,1968,Doktor,Menikah,163551821.0,0.0,1.0,58.0,35761.0,353335.0,63365.0,41112.0,2.0,5.0,10.0,0.0,
2,1107,1968,SMA,Menikah,29857000.0,0.0,0.0,34.0,8092.0,22253.0,30345.0,26299.0,2.0,0.0,5.0,0.0,2013-08-06
3,4478,1971,Doktor,Menikah,117949098.0,0.0,1.0,82.0,4872.0,126061.0,0.0,9745.0,3.0,5.0,7.0,0.0,
4,5080,1974,Sarjana,Rencana Menikah,164761134.0,0.0,0.0,28.0,343208.0,1416462.0,236196.0,107776.0,0.0,1.0,8.0,0.0,


In [8]:
test.columns

Index(['ID', 'tahun_kelahiran', 'pendidikan', 'status_pernikahan',
       'pendapatan', 'jumlah_anak_balita', 'jumlah_anak_remaja',
       'terakhir_belanja', 'belanja_buah', 'belanja_daging', 'belanja_ikan',
       'belanja_kue', 'pembelian_diskon', 'pembelian_web', 'pembelian_toko',
       'keluhan', 'tanggal_menjadi_anggota'],
      dtype='object')

In [9]:
train_features.dtypes

tahun_kelahiran              int64
pendidikan                  object
status_pernikahan           object
pendapatan                 float64
jumlah_anak_balita         float64
jumlah_anak_remaja         float64
terakhir_belanja           float64
belanja_buah               float64
belanja_daging             float64
belanja_ikan               float64
belanja_kue                float64
pembelian_diskon           float64
pembelian_web              float64
pembelian_toko             float64
keluhan                    float64
tanggal_menjadi_anggota     object
dtype: object

In [10]:
# Combine the training features and the training labels side by side
# (column-wise) into a single frame, then display it.
train = pd.concat(objs=[train_features, train_label], axis='columns')
train

Unnamed: 0,tahun_kelahiran,pendidikan,status_pernikahan,pendapatan,jumlah_anak_balita,jumlah_anak_remaja,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,keluhan,tanggal_menjadi_anggota,jumlah_promosi
0,1979,Sarjana,Rencana Menikah,,0.0,1.0,,50575.0,260967.0,50575.0,20230.0,2.0,2.0,5.0,0.0,2014-05-05,2
1,1950,Sarjana,Rencana Menikah,84063000.0,,,70.0,6069.0,44506.0,80920.0,20230.0,9.0,6.0,4.0,0.0,2013-03-17,0
2,1966,Sarjana,Menikah,127532564.0,0.0,0.0,45.0,117611.0,265460.0,96341.0,145573.0,1.0,1.0,7.0,0.0,,1
3,1961,Magister,Rencana Menikah,165579620.0,0.0,0.0,90.0,206346.0,1613901.0,27725.0,125868.0,0.0,7.0,8.0,0.0,,4
4,1970,Sarjana,Rencana Menikah,117703159.0,1.0,1.0,78.0,90563.0,311757.0,40358.0,33875.0,7.0,6.0,5.0,0.0,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3812,1955,Magister,Menikah,78199470.0,0.0,0.0,33.0,6069.0,25977.0,3856.0,5784.0,5.0,1.0,0.0,0.0,,5
3813,1947,Doktor,Rencana Menikah,109306000.0,0.0,1.0,44.0,0.0,50575.0,,0.0,3.0,6.0,3.0,0.0,2014-06-09,1
3814,1974,Magister,Menikah,104621000.0,0.0,2.0,68.0,2023.0,62713.0,8092.0,0.0,7.0,5.0,7.0,0.0,2013-11-07,0
3815,1957,SMA,Rencana Menikah,110850000.0,1.0,1.0,67.0,18207.0,70805.0,24276.0,,4.0,5.0,4.0,0.0,2013-06-30,0


In [11]:
train.isnull().sum()

tahun_kelahiran               0
pendidikan                  189
status_pernikahan           212
pendapatan                  190
jumlah_anak_balita          190
jumlah_anak_remaja          204
terakhir_belanja            172
belanja_buah                181
belanja_daging              178
belanja_ikan                193
belanja_kue                 214
pembelian_diskon            178
pembelian_web               165
pembelian_toko              169
keluhan                     196
tanggal_menjadi_anggota    2752
jumlah_promosi                0
dtype: int64

In [16]:
# Handle nulls in pendidikan and status_pernikahan by labelling them 'Unknown'.
# The filled columns are assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a selected column: that chained in-place form
# operates on a temporary object, triggers a pandas warning, and does not
# update the frame at all under Copy-on-Write.
categorical_cols = ['pendidikan', 'status_pernikahan']

train[categorical_cols] = train[categorical_cols].fillna('Unknown')
test[categorical_cols] = test[categorical_cols].fillna('Unknown')


In [17]:
# Handle nulls in belanja_buah, belanja_daging, belanja_ikan and belanja_kue:
# a missing spending amount is replaced by 0.
# Assigned back to the frame rather than filled in place on a selected column,
# because the chained `df[col].fillna(..., inplace=True)` form works on a
# temporary object and is not guaranteed to modify the frame.
spending_cols = ['belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue']

train[spending_cols] = train[spending_cols].fillna(0)
test[spending_cols] = test[spending_cols].fillna(0)

In [18]:
train.isnull().sum()

tahun_kelahiran               0
pendidikan                    0
status_pernikahan             0
pendapatan                  190
jumlah_anak_balita          190
jumlah_anak_remaja          204
terakhir_belanja            172
belanja_buah                  0
belanja_daging                0
belanja_ikan                  0
belanja_kue                   0
pembelian_diskon            178
pembelian_web               165
pembelian_toko              169
keluhan                     196
tanggal_menjadi_anggota    2752
jumlah_promosi                0
dtype: int64

In [19]:
# Handle nulls in jumlah_anak_balita, jumlah_anak_remaja, terakhir_belanja,
# pembelian_diskon, pembelian_web, pembelian_toko and keluhan: replace with 0.
# Assigned back to the frame rather than filled in place on a selected column,
# because the chained `df[col].fillna(..., inplace=True)` form works on a
# temporary object and is not guaranteed to modify the frame.
zero_fill_cols = [
    'jumlah_anak_balita',
    'jumlah_anak_remaja',
    'terakhir_belanja',
    'pembelian_diskon',
    'pembelian_web',
    'pembelian_toko',
    'keluhan',
]

train[zero_fill_cols] = train[zero_fill_cols].fillna(0)
test[zero_fill_cols] = test[zero_fill_cols].fillna(0)

In [22]:
# Handle nulls in pendapatan: each frame is filled with its own column mean.
# The filled column is assigned back instead of using the chained
# `df[col].fillna(..., inplace=True)` form, which acts on a temporary object
# and is not guaranteed to modify the frame.
mean_value = train['pendapatan'].mean()
train['pendapatan'] = train['pendapatan'].fillna(mean_value)

mean_value1 = test['pendapatan'].mean()
test['pendapatan'] = test['pendapatan'].fillna(mean_value1)

In [24]:
# Handle nulls in tanggal_menjadi_anggota: every missing join date is replaced
# by the most frequent join date of the same frame.
for frame in (train, test):
    join_dates = frame['tanggal_menjadi_anggota']
    frame['tanggal_menjadi_anggota'] = join_dates.fillna(join_dates.mode()[0])

In [25]:
train.isnull().sum()

tahun_kelahiran            0
pendidikan                 0
status_pernikahan          0
pendapatan                 0
jumlah_anak_balita         0
jumlah_anak_remaja         0
terakhir_belanja           0
belanja_buah               0
belanja_daging             0
belanja_ikan               0
belanja_kue                0
pembelian_diskon           0
pembelian_web              0
pembelian_toko             0
keluhan                    0
tanggal_menjadi_anggota    0
jumlah_promosi             0
dtype: int64

In [26]:
test.isnull().sum()

ID                         0
tahun_kelahiran            0
pendidikan                 0
status_pernikahan          0
pendapatan                 0
jumlah_anak_balita         0
jumlah_anak_remaja         0
terakhir_belanja           0
belanja_buah               0
belanja_daging             0
belanja_ikan               0
belanja_kue                0
pembelian_diskon           0
pembelian_web              0
pembelian_toko             0
keluhan                    0
tanggal_menjadi_anggota    0
dtype: int64

In [23]:
train.isnull().sum()

tahun_kelahiran               0
pendidikan                    0
status_pernikahan             0
pendapatan                    0
jumlah_anak_balita            0
jumlah_anak_remaja            0
terakhir_belanja              0
belanja_buah                  0
belanja_daging                0
belanja_ikan                  0
belanja_kue                   0
pembelian_diskon              0
pembelian_web                 0
pembelian_toko                0
keluhan                       0
tanggal_menjadi_anggota    2752
jumlah_promosi                0
dtype: int64

In [None]:
# Columns whose data type is to be changed: jumlah_anak_balita,
# jumlah_anak_remaja, terakhir_belanja, pembelian_diskon, pembelian_web,
# pembelian_toko and keluhan.
# Kept as a list of column NAMES. A set literal of Series objects cannot be
# built, because a pandas Series is unhashable and set members must be
# hashable (it raises TypeError).
column = [
    'jumlah_anak_balita',
    'jumlah_anak_remaja',
    'terakhir_belanja',
    'pembelian_diskon',
    'pembelian_web',
    'pembelian_toko',
    'keluhan',
]



In [None]:
train_label

In [None]:
train_features.head()

In [None]:
train_features.isnull().sum()

In [None]:
# Handle nulls in the string-valued training columns: each missing entry is
# replaced by the most frequent value of its own column.
for col_name in ('pendidikan', 'status_pernikahan', 'tanggal_menjadi_anggota'):
    most_frequent = train_features[col_name].mode()[0]
    train_features[col_name] = train_features[col_name].fillna(most_frequent)

In [None]:
# Handle nulls in the string-valued test columns: each missing entry is
# replaced by the most frequent value of its own column.
for col_name in ('pendidikan', 'status_pernikahan', 'tanggal_menjadi_anggota'):
    most_frequent = test[col_name].mode()[0]
    test[col_name] = test[col_name].fillna(most_frequent)

In [None]:
# Handle nulls in belanja_buah, belanja_daging, belanja_ikan and belanja_kue
# of the training features: a missing spending amount is replaced by 0.
# Assigned back to the frame rather than filled in place on a selected column,
# because the chained `df[col].fillna(..., inplace=True)` form works on a
# temporary object and is not guaranteed to modify the frame.
spending_cols = ['belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue']
train_features[spending_cols] = train_features[spending_cols].fillna(0)


In [None]:
train_features.isnull().sum()

In [None]:
# Handle nulls in belanja_buah, belanja_daging, belanja_ikan and belanja_kue
# of the test features: a missing spending amount is replaced by 0.
# Assigned back to the frame rather than filled in place on a selected column,
# because the chained `df[col].fillna(..., inplace=True)` form works on a
# temporary object and is not guaranteed to modify the frame.
spending_cols = ['belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue']
test[spending_cols] = test[spending_cols].fillna(0)


In [None]:
test.isnull().sum()

In [None]:
# Add a new training column: the sum of the four spending columns
# (fruit, meat, fish, cake), added left to right.
train_features['TotalAmount'] = (
    train_features['belanja_buah']
    .add(train_features['belanja_daging'])
    .add(train_features['belanja_ikan'])
    .add(train_features['belanja_kue'])
)

In [None]:
# Add a new test column: the sum of the four spending columns
# (fruit, meat, fish, cake), added left to right.
test['TotalAmount'] = (
    test['belanja_buah']
    .add(test['belanja_daging'])
    .add(test['belanja_ikan'])
    .add(test['belanja_kue'])
)

In [None]:
# Mean of the pendapatan column of the training features, printed for inspection.
mean_value = train_features.loc[:, 'pendapatan'].mean()
print(mean_value)

In [None]:
# Handle nulls in pendapatan, jumlah_anak_balita, jumlah_anak_remaja,
# terakhir_belanja, pembelian_diskon, pembelian_web and pembelian_toko of the
# training features: every column is filled with ITS OWN mean.
# Previously a mean was computed per column but never used; all seven columns
# were filled with `mean_value`, the pendapatan mean from an earlier cell.
mean_fill_cols = [
    'pendapatan',
    'jumlah_anak_balita',
    'jumlah_anak_remaja',
    'terakhir_belanja',
    'pembelian_diskon',
    'pembelian_web',
    'pembelian_toko',
]

# Series of per-column means, indexed by column name; fillna aligns it to the
# frame's columns so each column receives its own value.
column_means = train_features[mean_fill_cols].mean()
train_features[mean_fill_cols] = train_features[mean_fill_cols].fillna(column_means)

In [None]:
train_features.isnull().sum()

In [None]:
# Handle nulls in pendapatan, jumlah_anak_balita, jumlah_anak_remaja,
# terakhir_belanja, pembelian_diskon, pembelian_web and pembelian_toko of the
# test features: every column is filled with ITS OWN mean.
# Previously a mean was computed per column but never used; all seven columns
# were filled with `mean_value`, the pendapatan mean from an earlier cell.
mean_fill_cols = [
    'pendapatan',
    'jumlah_anak_balita',
    'jumlah_anak_remaja',
    'terakhir_belanja',
    'pembelian_diskon',
    'pembelian_web',
    'pembelian_toko',
]

# Series of per-column means, indexed by column name; fillna aligns it to the
# frame's columns so each column receives its own value.
column_means = test[mean_fill_cols].mean()
test[mean_fill_cols] = test[mean_fill_cols].fillna(column_means)

In [None]:
test.isnull().sum()

In [None]:
# Handle nulls in the keluhan column: a missing value is replaced by 0.
# Assigned back to the frame rather than filled in place on a selected column,
# because the chained `df[col].fillna(..., inplace=True)` form works on a
# temporary object and is not guaranteed to modify the frame.
train_features['keluhan'] = train_features['keluhan'].fillna(0)
test['keluhan'] = test['keluhan'].fillna(0)

In [None]:
train_features.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
# For every training column: print its name, then its distinct values,
# then one blank line as a separator.
for col in train_features.columns:
    print(col, train_features[col].unique(), sep='\n', end='\n\n')

In [None]:
train_features[train_features['pendidikan'] == '5']

In [None]:
train_features['pendidikan'].mode()

In [None]:
# Replace the stray category '5' in pendidikan with 'Sarjana'.
# Series.replace matches the whole cell value, the same way the `== '5'`
# inspection cell does; .str.replace would also rewrite a '5' that is only
# part of a longer value.
train_features['pendidikan'] = train_features['pendidikan'].replace('5', 'Sarjana')

In [None]:
train_features['status_pernikahan'].unique()

In [None]:
train_features[train_features['status_pernikahan'] == '5']

In [None]:
train_features['status_pernikahan'].mode()

In [None]:
# Replace the stray category '5' in status_pernikahan with 'Rencana Menikah'.
# Series.replace matches the whole cell value, the same way the `== '5'`
# inspection cell does; .str.replace would also rewrite a '5' that is only
# part of a longer value.
train_features['status_pernikahan'] = train_features['status_pernikahan'].replace('5', 'Rencana Menikah')

In [None]:
# For every training column: print its name, then its distinct values,
# then one blank line as a separator.
for col in train_features.columns:
    print(col, train_features[col].unique(), sep='\n', end='\n\n')

In [None]:
# For every test column: print its name, then its distinct values,
# then one blank line as a separator.
for col in test.columns:
    print(col, test[col].unique(), sep='\n', end='\n\n')

In [None]:
test['pendidikan'].mode()

In [None]:
# Replace the stray category '5' in pendidikan with 'Sarjana'.
# Series.replace matches the whole cell value, whereas .str.replace would
# also rewrite a '5' that is only part of a longer value.
test['pendidikan'] = test['pendidikan'].replace('5', 'Sarjana')

In [None]:
test['status_pernikahan'].mode()

In [None]:
# Replace the stray category '5' in status_pernikahan with 'Rencana Menikah'.
# Series.replace matches the whole cell value, whereas .str.replace would
# also rewrite a '5' that is only part of a longer value.
test['status_pernikahan'] = test['status_pernikahan'].replace('5', 'Rencana Menikah')

In [None]:
# For every test column: print its name, then its distinct values,
# then one blank line as a separator.
for col in test.columns:
    print(col, test[col].unique(), sep='\n', end='\n\n')

In [None]:
train_label.isnull().sum()

In [None]:
# Education levels in ascending order; the mapping assigns them the
# integers 1..5 in this order.
education_levels = ['SMP', 'SMA', 'Sarjana', 'Magister', 'Doktor']
mapping_pendidikan = {level: rank for rank, level in enumerate(education_levels, start=1)}

# Encode the 'pendidikan' column of the training features with the mapping.
train_features['pendidikan'] = train_features['pendidikan'].map(mapping_pendidikan)

In [None]:
train_features.head()

In [None]:
# Marital-status categories; the mapping assigns them the integers 1..5
# in the order listed.
marital_statuses = ['Sendiri', 'Rencana Menikah', 'Menikah', 'Cerai', 'Cerai Mati']
mapping_status = dict(zip(marital_statuses, range(1, 6)))

# Encode the 'status_pernikahan' column of the training features with the mapping.
train_features['status_pernikahan'] = train_features['status_pernikahan'].map(mapping_status)

In [None]:
train_features.head()

In [None]:
# Cast the join-date column of the training features to nanosecond datetimes.
train_features['tanggal_menjadi_anggota'] = train_features['tanggal_menjadi_anggota'].astype('datetime64[ns]')

In [None]:
train_features.head()

In [None]:
train_features.info()

In [None]:
# Education levels in ascending order; the mapping assigns them the
# integers 1..5 in this order.
education_levels = ['SMP', 'SMA', 'Sarjana', 'Magister', 'Doktor']
mapping_pendidikan = {level: rank for rank, level in enumerate(education_levels, start=1)}

# Encode the 'pendidikan' column of the test features with the mapping.
# NOTE(review): a value without an entry in the mapping (e.g. 'Unknown', if an
# earlier cell filled nulls with it) becomes NaN here - confirm that is intended.
test['pendidikan'] = test['pendidikan'].map(mapping_pendidikan)

In [None]:
# Marital-status categories; the mapping assigns them the integers 1..5
# in the order listed.
marital_statuses = ['Sendiri', 'Rencana Menikah', 'Menikah', 'Cerai', 'Cerai Mati']
mapping_status = dict(zip(marital_statuses, range(1, 6)))

# Encode the 'status_pernikahan' column of the test features with the mapping.
# NOTE(review): a value without an entry in the mapping (e.g. 'Unknown', if an
# earlier cell filled nulls with it) becomes NaN here - confirm that is intended.
test['status_pernikahan'] = test['status_pernikahan'].map(mapping_status)

In [None]:
# Cast the join-date column of the test features to nanosecond datetimes.
test['tanggal_menjadi_anggota'] = test['tanggal_menjadi_anggota'].astype('datetime64[ns]')

In [None]:
test.head()

In [None]:
# Remove the ID column from the test features.
# errors='ignore' makes the cell safe to re-run: once ID is gone, a second
# drop would otherwise raise KeyError.
test = test.drop(columns='ID', errors='ignore')

In [None]:
format.head()

In [None]:
test.head()

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Fit ONE encoder on the join dates of both frames, so that a given date gets
# the same integer code in train and in test. Calling fit_transform separately
# on each frame builds two unrelated code tables: the same date can then be
# encoded differently in the two frames.
all_join_dates = pd.concat(
    [train_features['tanggal_menjadi_anggota'], test['tanggal_menjadi_anggota']],
    ignore_index=True,
)
le.fit(all_join_dates)

train_features['tanggal_menjadi_anggota'] = le.transform(train_features['tanggal_menjadi_anggota'])
test['tanggal_menjadi_anggota'] = le.transform(test['tanggal_menjadi_anggota'])

In [None]:
train_features.head()

In [None]:
test.head()

In [None]:
train_features['tanggal_menjadi_anggota'].unique()

In [None]:
train_features.info()

In [None]:
test.info()

# Melihat data yang relevan dengan melihat korelasi setiap feature terhadap variabel target

In [None]:
# Melihat data yang relevan dengan melihat korelasi setiap feature terhadap variabel target

In [None]:
# Put the features and the target variable side by side in one dataframe,
# then display it.
df = pd.concat(objs=[train_features, train_label], axis='columns')
df

In [None]:
# Menghitung korelasi Pearson antara setiap fitur dan variabel target
df.corr()

In [None]:
# Compute the correlation matrix and draw it as an annotated heatmap.
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)

# Title the axes the heatmap was drawn on and render the figure.
heatmap_ax.set_title('Heatmap Korelasi')
plt.show()
     

# Data Processing

In [None]:
df.columns

In [None]:
# Name of the column to predict.
target = 'jumlah_promosi'

# Feature columns used for prediction, one per line.
features = [
    'tahun_kelahiran',
    'pendidikan',
    'status_pernikahan',
    'pendapatan',
    'jumlah_anak_balita',
    'jumlah_anak_remaja',
    'terakhir_belanja',
    'belanja_buah',
    'belanja_daging',
    'belanja_ikan',
    'belanja_kue',
    'pembelian_diskon',
    'pembelian_web',
    'pembelian_toko',
    'keluhan',
    'tanggal_menjadi_anggota',
]
     

In [None]:
# Split the training data into the selected feature columns and the label column.
y_train = train_label.loc[:, target]
X_train = train_features.loc[:, features]

In [None]:
# Select the test features: the same columns, in the same order, as X_train.
# Using the whole `test` frame would also pass the engineered TotalAmount
# column, which is not in `features`, so a model fitted on X_train would be
# given a different set of columns at prediction time.
X_test = test[features]

# Membangun model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score


def _macro_f1_rounded(y_true, y_pred):
    """Return the macro-averaged F1 after rounding predictions to integers.

    f1_score is a classification metric and rejects continuous predictions,
    which is what the regressors scored with this function produce. Rounding
    to the nearest integer turns their output into class labels; predictions
    that are already integer labels are left unchanged.

    Assumes the target holds integer class labels - TODO confirm.
    """
    return f1_score(y_true, np.rint(y_pred).astype(int), average='macro')


# Macro F-score evaluation function, usable as `scoring=` in cross_val_score.
macro_f1_scorer = make_scorer(_macro_f1_rounded)

## Linear Regression

In [None]:
linreg = LinearRegression()

In [None]:
# Evaluate the linear model with 10-fold cross-validation, scored by the macro F-score.
scores = cross_val_score(
    estimator=linreg,
    X=X_train,
    y=y_train,
    scoring=macro_f1_scorer,
    cv=10,
)

## Ridge Regression

In [None]:
ridge = Ridge()

In [None]:
# 10-fold cross-validated macro F-score of the ridge model.
# No sign flip: the scorer is built with make_scorer's default
# greater_is_better=True, so the values are already "higher is better";
# multiplying by -1 only suits negated error scorers.
scores = cross_val_score(ridge, X_train, y_train, cv=10, scoring=macro_f1_scorer)

## RandomForest Regressor

In [None]:
# Random forest regressor with a fixed seed so repeated runs give the same
# scores (same seed value as the RandomForestClassifier built later on).
rfr = RandomForestRegressor(random_state=42)

In [None]:
# 10-fold cross-validated macro F-score of the random forest regressor.
# No sign flip: the scorer is built with make_scorer's default
# greater_is_better=True, so the values are already "higher is better";
# multiplying by -1 only suits negated error scorers.
scores = cross_val_score(rfr, X_train, y_train, cv=10, scoring=macro_f1_scorer)
#scores.mean()

In [None]:
# Pilih salah satu algoritma klasifikasi, misalnya Random Forest
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Membuat objek model Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Melatih model menggunakan data pelatihan
rf_model.fit(X_train, y_train)

In [None]:
# Membuat prediksi menggunakan data pengujian
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Menghitung Macro F-Score
from sklearn.metrics import f1_score

In [None]:
format

In [None]:
# Estimate the macro F-score of the random forest with 5-fold cross-validation
# on the labelled training data (cross_val_score fits clones of rf_model, so
# the already-fitted model is left untouched).
# NOTE(review): the score used to be computed against format['jumlah_promosi'].
# submission_format.csv is presumably a submission template with placeholder
# values rather than true test labels - confirm. If so, that comparison does
# not measure model quality.
macro_fscore_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1_macro').mean()
print("Macro F-Score (Random Forest):", macro_fscore_rf)

In [None]:






# Langkah 6: Evaluasi Model dengan Macro F-Score





