In [10]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [12]:
# Load data
df = pd.read_csv('data/lead_scoring.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Prospect ID                                     9240 non-null   object 
 1   Lead Number                                     9240 non-null   int64  
 2   Lead Origin                                     9240 non-null   object 
 3   Lead Source                                     9204 non-null   object 
 4   Do Not Email                                    9240 non-null   object 
 5   Do Not Call                                     9240 non-null   object 
 6   Converted                                       9240 non-null   int64  
 7   TotalVisits                                     9103 non-null   float64
 8   Total Time Spent on Website                     9240 non-null   int64  
 9   Page Views Per Visit                     

## Handling Each Columns Values

In [13]:
## Untuk Kolom Prospect ID & Lead Number tidak ada masalah unique values dan non-null count sama yaitu `9240`

In [17]:
## untuk kolom Lead Origin memiliki value yang hanya 1 yaitu Quick Add Form, kita gabungkan dengan Lead Ads Form
print("\nBefore")
display(df['Lead Origin'].value_counts())

df['Lead Origin'] = np.where(df['Lead Origin'].str.contains("Add Form"), "Add Form", df['Lead Origin'])
print("\nAfter")
display(df['Lead Origin'].value_counts())


Before


Landing Page Submission    4886
API                        3580
Add Form                    719
Lead Import                  55
Name: Lead Origin, dtype: int64


After


Landing Page Submission    4886
API                        3580
Add Form                    719
Lead Import                  55
Name: Lead Origin, dtype: int64

In [21]:
## untuk kolom Lead Source, mari kita ganti Source yang kurang dari 1% populasi menjadi `Others`
# Tampilkan jumlah nilai untuk kolom 'Lead Source', dinormalisasi untuk menampilkan persentase
lead_source_percentage = df['Lead Source'].value_counts(normalize=True)
print("\nBefore")
display(lead_source_percentage)

# Lanjutkan dengan kode untuk mengganti Lead Source yang menyumbang kurang dari 1% populasi dengan 'Others'
threshold = 0.01  # 1% threshold
sources_to_replace = lead_source_percentage[lead_source_percentage < threshold].index

# Ganti Lead Source yang menyumbang kurang dari 1% populasi dengan 'Others'
df['Lead Source'] = df['Lead Source'].apply(lambda x: 'Others' if x in sources_to_replace else x)
print("\nAfter")
display(df['Lead Source'].value_counts(normalize=True))


Before


Google               0.311604
Direct Traffic       0.276293
Olark Chat           0.190678
Organic Search       0.125380
Reference            0.058018
Welingak Website     0.015428
Referral Sites       0.013581
Facebook             0.005976
bing                 0.000652
google               0.000543
Click2call           0.000435
Press_Release        0.000217
Social Media         0.000217
Live Chat            0.000217
youtubechannel       0.000109
testone              0.000109
Pay per Click Ads    0.000109
welearnblog_Home     0.000109
WeLearn              0.000109
blog                 0.000109
NC_EDM               0.000109
Name: Lead Source, dtype: float64


After


Google              0.311604
Direct Traffic      0.276293
Olark Chat          0.190678
Organic Search      0.125380
Reference           0.058018
Welingak Website    0.015428
Referral Sites      0.013581
Others              0.009018
Name: Lead Source, dtype: float64

In [22]:
df['Do Not Email'].value_counts() ## tidak ada yang perlu dihandling

No     8506
Yes     734
Name: Do Not Email, dtype: int64

In [23]:
df['Do Not Call'].value_counts() ## tidak ada yang perlu dihandling

No     9238
Yes       2
Name: Do Not Call, dtype: int64

In [24]:
df['Converted'].value_counts() ## tidak ada yang perlu dihandling

0    5679
1    3561
Name: Converted, dtype: int64

In [26]:
display(df['TotalVisits'].describe()) ## ada null values, mari kita isi dengan `0`
df['TotalVisits'].fillna(0, inplace=True)
display(df['TotalVisits'].describe()) 

count    9103.000000
mean        3.445238
std         4.854853
min         0.000000
25%         1.000000
50%         3.000000
75%         5.000000
max       251.000000
Name: TotalVisits, dtype: float64

count    9240.000000
mean        3.394156
std         4.836682
min         0.000000
25%         0.000000
50%         3.000000
75%         5.000000
max       251.000000
Name: TotalVisits, dtype: float64

In [None]:
display(df['Page Views Per Visit'].describe()) ## ada null values, mari kita isi dengan `0`
df['Page Views Per Visit'].fillna(0, inplace=True)
display(df['Page Views Per Visit'].describe()) 

In [32]:
## mari kita kelompokan aktivitas menjadi Good, Bad, dan Neutral
display(df['Last Activity'].value_counts())

# Definisikan kategori berdasarkan sudut pandang perusahaan (subjektif)
good_activities = ['Email Opened', 'Resubscribed to emails',  'Page Visited on Website', 'Form Submitted on Website', 'Email Link Clicked', 'Had a Phone Conversation', 'Approached upfront']
bad_activities = ['Email Bounced', 'Unreachable', 'Unsubscribed', 'Email Marked Spam']
neutral_activities = ['Email Received', 'SMS Sent', 'Olark Chat Conversation', 'Converted to Lead', 'View in browser link Clicked', 'Visited Booth in Tradeshow']

# Categorize 'Last Activity' menjadi Good, Bad, atau Neutral
df['Last Activity Category'] = df['Last Activity'].apply(lambda x: 'Good' if x in good_activities else ('Neutral' if x in neutral_activities else 'Bad')) # Jika `Last Activity` kosong, maka dianggap Bad

# Tampilkan
last_activity_category_counts = df['Last Activity Category'].value_counts()
display(last_activity_category_counts)

df = df.drop(columns=['Last Activity']) # kita coba drop saja

Email Opened                    3437
SMS Sent                        2745
Olark Chat Conversation          973
Page Visited on Website          640
Converted to Lead                428
Email Bounced                    326
Email Link Clicked               267
Form Submitted on Website        116
Unreachable                       93
Unsubscribed                      61
Had a Phone Conversation          30
Approached upfront                 9
View in browser link Clicked       6
Email Received                     2
Email Marked Spam                  2
Visited Booth in Tradeshow         1
Resubscribed to emails             1
Name: Last Activity, dtype: int64

Good       4500
Neutral    4155
Bad         585
Name: Last Activity Category, dtype: int64

In [None]:
# Pembersihan Data
# Menghapus atau mengimputasi nilai yang hilang
imputer = SimpleImputer(strategy='most_frequent')  # atau strategi lainnya seperti mean atau median
df['Column_Name'] = imputer.fit_transform(df[['Column_Name']])

# Menghapus duplikat
df = df.drop_duplicates()

# Transformasi Data
# Encoding variabel kategorikal
encoder = OneHotEncoder(sparse=False, drop='first')  # drop='first' untuk menghindari multicollinearity
encoded_columns = encoder.fit_transform(df[['Categorical_Column']])
df_encoded = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out())

# Standarisasi data numerik
scaler = StandardScaler()
df[['Numerical_Column']] = scaler.fit_transform(df[['Numerical_Column']])

# Menggabungkan kembali data yang telah di-encode ke dalam dataframe utama
df = df.join(df_encoded)

# Pemilihan Fitur
# Menggunakan SelectKBest untuk memilih K fitur teratas
selector = SelectKBest(score_func=f_classif, k=10)  # k adalah jumlah fitur yang ingin Anda pertahankan
X_new = selector.fit_transform(df.drop('Target_Column', axis=1), df['Target_Column'])

# Memilih fitur berdasarkan pengetahuan domain
selected_features = df[['Feature_1', 'Feature_2', 'Feature_3']]