In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# import sklearn libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans


In [2]:
data_path=os.path.join('lead_scoring.csv')

In [3]:
print(data_path)

lead_scoring.csv


In [4]:
data=pd.read_csv(data_path)

In [5]:
data.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Jakarta,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Jakarta,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Jakarta,02.Medium,01.High,15.0,18.0,No,No,Modified


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Prospect ID                                     9240 non-null   object 
 1   Lead Number                                     9240 non-null   int64  
 2   Lead Origin                                     9240 non-null   object 
 3   Lead Source                                     9204 non-null   object 
 4   Do Not Email                                    9240 non-null   object 
 5   Do Not Call                                     9240 non-null   object 
 6   Converted                                       9240 non-null   int64  
 7   TotalVisits                                     9103 non-null   float64
 8   Total Time Spent on Website                     9240 non-null   int64  
 9   Page Views Per Visit                     

# THE GOAL
### Finding out which customer would buy the product of Madugital based on their activity in the website


In [7]:
data.isna().mean()*100

Prospect ID                                        0.000000
Lead Number                                        0.000000
Lead Origin                                        0.000000
Lead Source                                        0.389610
Do Not Email                                       0.000000
Do Not Call                                        0.000000
Converted                                          0.000000
TotalVisits                                        1.482684
Total Time Spent on Website                        0.000000
Page Views Per Visit                               1.482684
Last Activity                                      1.114719
Country                                           26.634199
Specialization                                    15.562771
How did you hear about Madugital                  23.885281
What is your current occupation                   29.112554
What matters most to you in choosing a product    29.318182
Search                                  

In [8]:
data.describe(include=['O'])

Unnamed: 0,Prospect ID,Lead Origin,Lead Source,Do Not Email,Do Not Call,Last Activity,Country,Specialization,How did you hear about Madugital,What is your current occupation,...,Lead Quality,Update me on Supply Chain Content,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
count,9240,9240,9204,9240,9240,9137,6779,7802,7033,6550,...,4473,9240,9240,6531,7820,5022,5022,9240,9240,9240
unique,9240,5,21,2,2,17,38,19,10,6,...,5,1,1,6,7,3,3,1,2,16
top,7927b2df-8bba-4d29-b9a2-b6e0beafe620,Landing Page Submission,Google,No,No,Email Opened,Indonesia,Select,Select,Unemployed,...,Might be,No,No,Select,Jakarta,02.Medium,02.Medium,No,No,Modified
freq,1,4886,2868,8506,9238,3437,6492,1942,5043,5600,...,1560,9240,9240,4146,3222,3839,2788,9240,6352,3407


In [9]:
clean_data = data.drop(columns=['Last Activity','Specialization','Lead Origin','Lead Source','How did you hear about Madugital','What is your current occupation',
                                'What matters most to you in choosing a product','Through Recommendations','Tags',
                                'Lead Quality','Update me on Supply Chain Content','Get updates on DM Content','Lead Profile',
                                'City','I agree to pay the amount through cheque','A free copy of Mastering The Interview',
                                'Last Notable Activity','Search','Magazine','Newspaper Article','Madugital Telegram',
                                'Newspaper','Digital Advertisement'], axis=1)


I dropped these columns as they are not relevant to the goal

In [10]:
clean_data.shape, data.shape

((9240, 14), (9240, 37))

In [11]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 14 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Prospect ID                              9240 non-null   object 
 1   Lead Number                              9240 non-null   int64  
 2   Do Not Email                             9240 non-null   object 
 3   Do Not Call                              9240 non-null   object 
 4   Converted                                9240 non-null   int64  
 5   TotalVisits                              9103 non-null   float64
 6   Total Time Spent on Website              9240 non-null   int64  
 7   Page Views Per Visit                     9103 non-null   float64
 8   Country                                  6779 non-null   object 
 9   Receive More Updates About Our Products  9240 non-null   object 
 10  Asymmetrique Activity Index              5022 no

In [12]:
#object describe
clean_data.describe(include=['O'])

Unnamed: 0,Prospect ID,Do Not Email,Do Not Call,Country,Receive More Updates About Our Products,Asymmetrique Activity Index,Asymmetrique Profile Index
count,9240,9240,9240,6779,9240,5022,5022
unique,9240,2,2,38,1,3,3
top,7927b2df-8bba-4d29-b9a2-b6e0beafe620,No,No,Indonesia,No,02.Medium,02.Medium
freq,1,8506,9238,6492,9240,3839,2788


In [13]:
clean_data.describe()

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9103.0,9240.0,9103.0,5022.0,5022.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.854853,548.021466,2.161418,1.386694,1.811395
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,15.0
50%,615479.0,0.0,3.0,248.0,2.0,14.0,16.0
75%,637387.25,1.0,5.0,936.0,3.0,15.0,18.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


In [14]:
clean_data.loc[clean_data['Prospect ID'].duplicated()]

Unnamed: 0,Prospect ID,Lead Number,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Country,Receive More Updates About Our Products,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score


In [15]:
clean_data.loc[clean_data['Lead Number'].duplicated()]

Unnamed: 0,Prospect ID,Lead Number,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Country,Receive More Updates About Our Products,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score


In [16]:
clean_data['TotalVisits']=clean_data['TotalVisits'].fillna(clean_data['TotalVisits'].mean())
clean_data['Asymmetrique Activity Score']=clean_data['Asymmetrique Activity Score'].fillna(clean_data['Asymmetrique Activity Score'].mean())
clean_data['Asymmetrique Profile Score']=clean_data['Asymmetrique Profile Score'].fillna(clean_data['Asymmetrique Profile Score'].mean())

clean_data['Asymmetrique Activity Index']=clean_data['Asymmetrique Activity Index'].fillna(method='ffill')
clean_data['Asymmetrique Profile Index']=clean_data['Asymmetrique Profile Index'].fillna(method='ffill')
clean_data['Country']=clean_data['Country'].fillna(method='ffill')
clean_data['Page Views Per Visit']=clean_data['Page Views Per Visit'].fillna(clean_data['Page Views Per Visit'].mean())

In [17]:
clean_data.sample(20, random_state=400)

Unnamed: 0,Prospect ID,Lead Number,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Country,Receive More Updates About Our Products,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score
7201,bb8e9fc5-3f58-4fef-8483-10ad1d489490,594161,Yes,No,0,2.0,712,2.0,Indonesia,No,02.Medium,02.Medium,14.306252,16.344883
6752,a8e5b4e5-7e56-4957-87ac-77cb0dbe936e,598042,No,No,0,0.0,0,0.0,Indonesia,No,02.Medium,01.High,14.306252,16.344883
6576,a8ec6419-b33f-427d-af9f-e5e7a99bfba4,599351,No,No,1,4.0,98,2.0,Indonesia,No,03.Low,02.Medium,14.306252,16.344883
4834,12283296-3c05-4926-ae9b-762e4798d9f6,613650,No,No,1,2.0,139,2.0,Indonesia,No,03.Low,02.Medium,14.306252,16.344883
5660,e5e93bbc-c96c-40d7-b697-5303398630c5,606572,No,No,0,2.0,43,2.0,Indonesia,No,01.High,01.High,14.306252,16.344883
174,70c71ff4-5848-46da-b501-2e9867c46c06,658675,No,No,1,0.0,0,0.0,Indonesia,No,02.Medium,01.High,14.0,20.0
8117,7e7ca2c8-7640-4e22-b8d5-bcacfc79c18f,587883,No,No,1,0.0,0,0.0,Indonesia,No,02.Medium,01.High,14.0,19.0
7854,3c2b9a24-2ba8-480c-8b47-6c87fe42198d,589539,No,No,0,0.0,0,0.0,Indonesia,No,01.High,02.Medium,17.0,15.0
3418,d8080e23-8964-4130-8bb7-98c6a03bcd10,627523,No,No,0,6.0,322,3.0,Indonesia,No,01.High,02.Medium,14.306252,16.344883
6956,49b21341-9440-4c31-8278-aa9a2fb0315e,596283,Yes,No,0,1.0,157,1.0,Indonesia,No,02.Medium,02.Medium,13.0,15.0


In [18]:
clean_data = clean_data.dropna()

In [19]:
clean_data.isnull().sum()

Prospect ID                                0
Lead Number                                0
Do Not Email                               0
Do Not Call                                0
Converted                                  0
TotalVisits                                0
Total Time Spent on Website                0
Page Views Per Visit                       0
Country                                    0
Receive More Updates About Our Products    0
Asymmetrique Activity Index                0
Asymmetrique Profile Index                 0
Asymmetrique Activity Score                0
Asymmetrique Profile Score                 0
dtype: int64

In [20]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9239 entries, 1 to 9239
Data columns (total 14 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Prospect ID                              9239 non-null   object 
 1   Lead Number                              9239 non-null   int64  
 2   Do Not Email                             9239 non-null   object 
 3   Do Not Call                              9239 non-null   object 
 4   Converted                                9239 non-null   int64  
 5   TotalVisits                              9239 non-null   float64
 6   Total Time Spent on Website              9239 non-null   int64  
 7   Page Views Per Visit                     9239 non-null   float64
 8   Country                                  9239 non-null   object 
 9   Receive More Updates About Our Products  9239 non-null   object 
 10  Asymmetrique Activity Index              9239 no

# INSIGHT

In [21]:
Total_Visit_Country = clean_data.groupby('Country').sum()[['TotalVisits']]
Total_Conversion = clean_data.groupby('Country').sum()[['Converted']]

  Total_Visit_Country = clean_data.groupby('Country').sum()[['TotalVisits']]
  Total_Conversion = clean_data.groupby('Country').sum()[['Converted']]


In [22]:
clean_data[clean_data['Country']=='unknown']

Unnamed: 0,Prospect ID,Lead Number,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Country,Receive More Updates About Our Products,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score
6018,8850e4e1-0901-438f-b71d-1e2d15546263,603626,No,No,0,3.0,362,1.5,unknown,No,02.Medium,01.High,14.306252,16.344883
6717,e243fa83-6528-42dc-9cf3-07306ab5a920,598285,No,No,0,3.0,636,1.5,unknown,No,02.Medium,01.High,14.306252,16.344883
6805,57c557f0-f373-4468-affb-3ed5a03ea47c,597579,No,No,1,24.0,1527,24.0,unknown,No,02.Medium,01.High,14.306252,16.344883
7499,383d9485-5859-4e11-b9b7-31325357b4d3,592101,No,No,0,2.0,327,2.0,unknown,No,02.Medium,02.Medium,15.0,15.0
7995,43b25142-a55a-4f9d-8df4-7b54ffd4c61f,588569,No,No,0,4.0,190,4.0,unknown,No,02.Medium,02.Medium,15.0,15.0


In [23]:
clean_data.drop([6018,6717,6805,7499,7995],axis=0,inplace=True)

In [24]:
clean_data[clean_data['Page Views Per Visit']==55.]

Unnamed: 0,Prospect ID,Lead Number,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Country,Receive More Updates About Our Products,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score
2190,0e4c0711-6cb5-455d-8e0d-7f9f2cc4f895,638668,Yes,No,0,55.0,297,55.0,Indonesia,No,01.High,02.Medium,17.0,13.0


In [25]:
clean_data.drop([2190],axis=0,inplace=True)

In [26]:
#Total user converted by country
Total_Conversion

Unnamed: 0_level_0,Converted
Country,Unnamed: 1_level_1
Asia/Pacific Region,1
Australia,9
Bahrain,4
Bangladesh,1
Belgium,1
Canada,0
China,0
Denmark,1
France,4
Germany,2


In [27]:
#Total visit by country
Total_Visit_Country//1

Unnamed: 0_level_0,TotalVisits
Country,Unnamed: 1_level_1
Asia/Pacific Region,3.0
Australia,60.0
Bahrain,29.0
Bangladesh,8.0
Belgium,12.0
Canada,22.0
China,4.0
Denmark,4.0
France,40.0
Germany,18.0


In [28]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9233 entries, 1 to 9239
Data columns (total 14 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Prospect ID                              9233 non-null   object 
 1   Lead Number                              9233 non-null   int64  
 2   Do Not Email                             9233 non-null   object 
 3   Do Not Call                              9233 non-null   object 
 4   Converted                                9233 non-null   int64  
 5   TotalVisits                              9233 non-null   float64
 6   Total Time Spent on Website              9233 non-null   int64  
 7   Page Views Per Visit                     9233 non-null   float64
 8   Country                                  9233 non-null   object 
 9   Receive More Updates About Our Products  9233 non-null   object 
 10  Asymmetrique Activity Index              9233 no

In [29]:
clean_data['Receive More Updates About Our Products'].unique()

array(['No'], dtype=object)

In [30]:
pd.qcut(clean_data['TotalVisits'],4) #Binning

1          (3.0, 5.0]
2          (1.0, 3.0]
3       (-0.001, 1.0]
4          (1.0, 3.0]
5       (-0.001, 1.0]
            ...      
9235     (5.0, 251.0]
9236       (1.0, 3.0]
9237       (1.0, 3.0]
9238       (1.0, 3.0]
9239     (5.0, 251.0]
Name: TotalVisits, Length: 9233, dtype: category
Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 3.0] < (3.0, 5.0] < (5.0, 251.0]]

In [31]:
pd.qcut(clean_data['TotalVisits'],4).value_counts()

(1.0, 3.0]       2983
(-0.001, 1.0]    2583
(3.0, 5.0]       2039
(5.0, 251.0]     1628
Name: TotalVisits, dtype: int64

In [32]:
pd.qcut(clean_data['Page Views Per Visit'],3)

1          (1.5, 3.0]
2          (1.5, 3.0]
3       (-0.001, 1.5]
4       (-0.001, 1.5]
5       (-0.001, 1.5]
            ...      
9235       (1.5, 3.0]
9236       (1.5, 3.0]
9237       (1.5, 3.0]
9238       (1.5, 3.0]
9239       (1.5, 3.0]
Name: Page Views Per Visit, Length: 9233, dtype: category
Categories (3, interval[float64, right]): [(-0.001, 1.5] < (1.5, 3.0] < (3.0, 16.0]]

In [33]:
pd.qcut(clean_data['Page Views Per Visit'],4).value_counts()

(-0.001, 1.0]    2839
(1.0, 2.0]       2353
(3.0, 16.0]      2268
(2.0, 3.0]       1773
Name: Page Views Per Visit, dtype: int64

In [34]:
pd.qcut(clean_data['Total Time Spent on Website'],4).value_counts()

(-0.001, 12.0]     2319
(936.0, 2272.0]    2307
(248.0, 936.0]     2306
(12.0, 248.0]      2301
Name: Total Time Spent on Website, dtype: int64

In [35]:
clean_data.sample(10)

Unnamed: 0,Prospect ID,Lead Number,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Country,Receive More Updates About Our Products,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score
292,4beec5ec-bdea-4ecd-ab89-345060e38bc2,657316,No,No,0,13.0,194,13.0,Indonesia,No,02.Medium,01.High,15.0,18.0
3608,901846e9-c7ba-4f95-a1d4-3cc4fa353177,625894,No,No,1,5.0,1303,5.0,Indonesia,No,01.High,02.Medium,14.306252,16.344883
8587,23bb6ab5-d81e-4f24-b26c-86d0d8805180,584241,No,No,1,2.0,1163,2.0,Indonesia,No,02.Medium,02.Medium,14.0,15.0
5591,d7bcd677-29e1-4490-9d3a-40d5d8fadd80,607089,No,No,1,2.0,1726,2.0,Indonesia,No,02.Medium,01.High,14.306252,16.344883
5229,21cfac2d-957e-4613-b432-72dee7729047,609677,No,No,1,3.0,1158,1.5,Indonesia,No,01.High,02.Medium,14.306252,16.344883
2943,1ad9269b-8d72-4730-aea4-46d92e0b8d69,631496,No,No,1,3.0,644,3.0,Indonesia,No,02.Medium,02.Medium,14.306252,16.344883
2393,c0c6af0d-fef2-46b3-a902-28905d3b82df,636765,No,No,1,3.0,818,3.0,Indonesia,No,02.Medium,02.Medium,14.0,16.0
8626,d233e1ff-68c7-4435-8d85-85b94c02cbed,583921,No,No,0,2.0,122,2.0,Indonesia,No,02.Medium,01.High,13.0,20.0
7365,a44b8850-3b46-41a3-8d90-416683ea99a2,592940,No,No,0,0.0,0,0.0,Indonesia,No,01.High,02.Medium,16.0,15.0
4113,60617e22-d80f-4e99-a34b-6ba5fdd4ecb4,620507,No,No,1,3.0,321,3.0,United Arab Emirates,No,02.Medium,02.Medium,15.0,16.0


In [36]:
clean_data['Asymmetrique Profile Index'].unique()

array(['02.Medium', '01.High', '03.Low'], dtype=object)

In [37]:
clean_data['Potential Buyer']=np.where(clean_data['Converted']==1,'Yes','No')

In [38]:
clean_data['Profile']=np.where(clean_data['Asymmetrique Profile Index']=="01.High",'High','Medium')

clean_data['Profile']=np.where(clean_data['Asymmetrique Profile Index']=="03.Low",'Low',clean_data['Profile'])

In [39]:
clean_data['Activity']=np.where(clean_data['Asymmetrique Activity Index']=="01.High",'High','Medium')

clean_data['Activity']=np.where(clean_data['Asymmetrique Activity Index']=="03.Low",'Low',clean_data['Activity'])

In [41]:
clean_data.head()

Unnamed: 0,Prospect ID,Lead Number,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Country,Receive More Updates About Our Products,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,Potential Buyer,Profile,Activity
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,No,No,0,5.0,674,2.5,Indonesia,No,02.Medium,02.Medium,15.0,15.0,No,Medium,Medium
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,No,No,1,2.0,1532,2.0,Indonesia,No,02.Medium,01.High,14.0,20.0,Yes,High,Medium
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,No,No,0,1.0,305,1.0,Indonesia,No,02.Medium,01.High,13.0,17.0,No,High,Medium
4,3256f628-e534-4826-9d63-4a8b88782852,660681,No,No,1,2.0,1428,1.0,Indonesia,No,02.Medium,01.High,15.0,18.0,Yes,High,Medium
5,2058ef08-2858-443e-a01f-a9237db2f5ce,660680,No,No,0,0.0,0,0.0,Indonesia,No,01.High,02.Medium,17.0,15.0,No,Medium,High


In [40]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9233 entries, 1 to 9239
Data columns (total 17 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Prospect ID                              9233 non-null   object 
 1   Lead Number                              9233 non-null   int64  
 2   Do Not Email                             9233 non-null   object 
 3   Do Not Call                              9233 non-null   object 
 4   Converted                                9233 non-null   int64  
 5   TotalVisits                              9233 non-null   float64
 6   Total Time Spent on Website              9233 non-null   int64  
 7   Page Views Per Visit                     9233 non-null   float64
 8   Country                                  9233 non-null   object 
 9   Receive More Updates About Our Products  9233 non-null   object 
 10  Asymmetrique Activity Index              9233 no

In [41]:
clean_data['Page Views Per Visit'].unique()//1

array([ 2.,  2.,  1.,  0.,  4.,  8.,  2., 11.,  5.,  6.,  3.,  1.,  1.,
        3.,  7.,  2., 13.,  8.,  5.,  1.,  2.,  4.,  3., 16., 12.,  1.,
        1.,  6.,  4., 14.,  3., 10.,  1.,  1.,  2., 15.,  2.,  3.,  1.,
        9.,  2.,  4.,  1.,  3.,  5.,  2.,  2.,  2.,  2.,  2.,  2.,  3.,
        1.,  5.,  3.,  1.,  2.,  2.,  5.,  6.,  3.,  2.,  1.,  3.,  1.,
        1.,  1.,  5.,  4.,  1.,  1.,  2.,  1.,  1.,  3.,  7.,  1.,  2.,
        1.,  2.,  1.,  2.,  1., 12.,  3.,  2.,  6.,  1.,  8.,  4.,  3.,
        8.,  1.,  1.,  3.,  6.,  1.,  2.,  2.,  2.,  3.,  4.,  1.,  3.,
        1., 14.,  3.,  1.,  3.,  2.,  1., 11.,  2.])

In [42]:
clean_data['Receive More Updates About Our Products'].unique()

array(['No'], dtype=object)

In [43]:
clean_data['TotalVisits']=clean_data['TotalVisits'].clip(1,100)

In [44]:
clean_data['TotalVisits'].describe()

count    9233.000000
mean        3.652550
std         3.802272
min         1.000000
25%         1.000000
50%         3.000000
75%         5.000000
max       100.000000
Name: TotalVisits, dtype: float64

In [45]:
clean_data['TotalVisits'].unique()

array([  5.        ,   2.        ,   1.        ,   4.        ,
         8.        ,  11.        ,   6.        ,   3.        ,
         7.        ,  13.        ,  17.        ,   3.44523783,
         9.        ,  12.        ,  10.        ,  16.        ,
        14.        ,  21.        ,  15.        ,  22.        ,
        19.        ,  18.        ,  20.        ,  43.        ,
        30.        ,  23.        , 100.        ,  25.        ,
        27.        ,  29.        ,  24.        ,  28.        ,
        26.        ,  74.        ,  41.        ,  54.        ,
        32.        ,  42.        ])

In [46]:
Q1 = clean_data['TotalVisits'].quantile(0.25)
Q3 = clean_data['TotalVisits'].quantile(0.75)
IQR = Q3 - Q1
Lwishker = Q1 - 1.5 * IQR
Uwishker = Q3 + 1.5 * IQR
clean_data['TotalVisits'] = clean_data['TotalVisits'].clip(Lwishker, Uwishker)

In [47]:
clean_data['TotalVisits'].describe()

count    9233.000000
mean        3.459764
std         2.612907
min         1.000000
25%         1.000000
50%         3.000000
75%         5.000000
max        11.000000
Name: TotalVisits, dtype: float64

In [48]:
clean_data['Total Time Spent on Website'].describe()

count    9233.000000
mean      487.706379
std       548.080333
min         0.000000
25%        12.000000
50%       248.000000
75%       936.000000
max      2272.000000
Name: Total Time Spent on Website, dtype: float64

In [49]:
Q1 = clean_data['Total Time Spent on Website'].quantile(0.25)
Q3 = clean_data['Total Time Spent on Website'].quantile(0.75)
IQR = Q3 - Q1
Lwishker = Q1 - 1.5 * IQR
Uwishker = Q3 + 1.5 * IQR
clean_data['Total Time Spent on Website'] = clean_data['Total Time Spent on Website'].clip(Lwishker, Uwishker)

In [50]:
clean_data['Total Time Spent on Website'].describe()

count    9233.000000
mean      487.706379
std       548.080333
min         0.000000
25%        12.000000
50%       248.000000
75%       936.000000
max      2272.000000
Name: Total Time Spent on Website, dtype: float64

In [51]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [52]:
scaler1 = StandardScaler()
Visit_scaled = scaler1.fit_transform(clean_data[['TotalVisits']])

In [53]:
scaler2 = MinMaxScaler()
Time_scaled = scaler2.fit_transform(clean_data[['Total Time Spent on Website']])

In [54]:
clean_data['Visit_scaled'] = Visit_scaled
clean_data['Time_scaled'] = Time_scaled*100

In [55]:
clean_data[['TotalVisits','Visit_scaled','Total Time Spent on Website','Time_scaled']].sample(10, random_state=100)

Unnamed: 0,TotalVisits,Visit_scaled,Total Time Spent on Website,Time_scaled
4869,2.0,-0.558704,1321,58.142606
8687,5.0,0.589504,1445,63.600352
3719,2.0,-0.558704,144,6.338028
3511,8.0,1.737713,1015,44.674296
2508,5.0,0.589504,112,4.929577
7811,2.0,-0.558704,949,41.769366
436,3.0,-0.175968,46,2.024648
7653,11.0,2.885922,908,39.964789
4065,5.0,0.589504,96,4.225352
2307,11.0,2.885922,333,14.65669


In [56]:
clean_data[['TotalVisits','Visit_scaled','Total Time Spent on Website','Time_scaled']].describe()

Unnamed: 0,TotalVisits,Visit_scaled,Total Time Spent on Website,Time_scaled
count,9233.0,9233.0,9233.0,9233.0
mean,3.459764,-3.770886e-17,487.706379,21.46595
std,2.612907,1.000054,548.080333,24.123254
min,1.0,-0.9414407,0.0,0.0
25%,1.0,-0.9414407,12.0,0.528169
50%,3.0,-0.1759682,248.0,10.915493
75%,5.0,0.5895043,936.0,41.197183
max,11.0,2.885922,2272.0,100.0


In [57]:
clean_data[['Time_scaled']].describe()

Unnamed: 0,Time_scaled
count,9233.0
mean,21.46595
std,24.123254
min,0.0
25%,0.528169
50%,10.915493
75%,41.197183
max,100.0


In [58]:
clean_data[['Visit_scaled']].describe()

Unnamed: 0,Visit_scaled
count,9233.0
mean,-3.770886e-17
std,1.000054
min,-0.9414407
25%,-0.9414407
50%,-0.1759682
75%,0.5895043
max,2.885922


In [59]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9233 entries, 1 to 9239
Data columns (total 19 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Prospect ID                              9233 non-null   object 
 1   Lead Number                              9233 non-null   int64  
 2   Do Not Email                             9233 non-null   object 
 3   Do Not Call                              9233 non-null   object 
 4   Converted                                9233 non-null   int64  
 5   TotalVisits                              9233 non-null   float64
 6   Total Time Spent on Website              9233 non-null   int64  
 7   Page Views Per Visit                     9233 non-null   float64
 8   Country                                  9233 non-null   object 
 9   Receive More Updates About Our Products  9233 non-null   object 
 10  Asymmetrique Activity Index              9233 no

In [60]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

In [61]:
col = sorted(clean_data['Profile'].unique().tolist()) + sorted(clean_data['Activity'].unique().tolist() + sorted(clean_data['Potential Buyer'].unique().tolist()))

In [62]:
enc_df = pd.DataFrame(enc.fit_transform(clean_data[['Profile', 'Activity','Potential Buyer']]).toarray(), 
                      columns=col) 

In [63]:
clean_df = clean_data[['Profile', 'Activity', 'Potential Buyer']].join(enc_df)
clean_df

Unnamed: 0,Profile,Activity,Potential Buyer,High,Low,Medium,High.1,Low.1,Medium.1,No,Yes
1,Medium,Medium,No,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,High,Medium,Yes,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,High,Medium,No,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,High,Medium,Yes,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
5,Medium,High,No,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
9235,High,Medium,Yes,,,,,,,,
9236,High,Medium,No,,,,,,,,
9237,High,Medium,No,,,,,,,,
9238,Medium,Medium,Yes,,,,,,,,


In [64]:
index_data = pd.get_dummies(clean_data[['Profile', 'Activity','Potential Buyer']], drop_first=False) # 
final_data = clean_data.join(index_data)
final_data.sample(10)

Unnamed: 0,Prospect ID,Lead Number,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Country,Receive More Updates About Our Products,...,Visit_scaled,Time_scaled,Profile_High,Profile_Low,Profile_Medium,Activity_High,Activity_Low,Activity_Medium,Potential Buyer_No,Potential Buyer_Yes
7596,851c5aa8-13a4-4a85-a6d7-8e89aefeb44d,591460,No,No,1,1.0,0,0.0,Indonesia,No,...,-0.941441,0.0,1,0,0,0,0,1,0,1
2830,a7e76161-32fc-44ba-b383-f0a3cd3e7100,632565,No,No,0,9.0,890,9.0,Indonesia,No,...,2.120449,39.172535,1,0,0,0,0,1,1,0
4595,b3c4aec9-c55f-4d2a-80f3-c3590b1db232,615719,No,No,0,4.0,1411,4.0,Indonesia,No,...,0.206768,62.103873,1,0,0,0,0,1,1,0
6087,1520480f-4226-4fe3-9136-8aa2bc9cdf05,603146,Yes,No,0,1.0,0,0.0,Indonesia,No,...,-0.941441,0.0,0,0,1,1,0,0,1,0
7056,3b528599-f303-400f-990e-4a7f3f36d9a2,595555,No,No,0,1.0,0,0.0,Indonesia,No,...,-0.941441,0.0,0,0,1,0,0,1,1,0
6366,33abf642-6398-4d44-b55f-a3ccf31b771d,601072,No,No,0,1.0,0,0.0,Indonesia,No,...,-0.941441,0.0,0,0,1,1,0,0,1,0
8974,4fda6275-558f-4b45-b47e-d817bb97b32e,581538,No,No,0,8.0,178,4.0,Indonesia,No,...,1.737713,7.834507,1,0,0,0,0,1,1,0
5297,6bed6102-c23e-4d28-a959-0006e1b35ff2,609170,No,No,0,1.0,0,0.0,Indonesia,No,...,-0.941441,0.0,0,0,1,0,0,1,1,0
5766,8d39c225-2315-40d5-b3e2-b915cfccade1,605487,No,No,0,4.0,22,4.0,Indonesia,No,...,0.206768,0.96831,1,0,0,0,0,1,1,0
1240,c91ca540-7a7a-4333-bf3d-5fdc0bd224d3,646738,No,No,0,1.0,0,0.0,Indonesia,No,...,-0.941441,0.0,0,0,1,1,0,0,1,0


In [65]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9233 entries, 1 to 9239
Data columns (total 27 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Prospect ID                              9233 non-null   object 
 1   Lead Number                              9233 non-null   int64  
 2   Do Not Email                             9233 non-null   object 
 3   Do Not Call                              9233 non-null   object 
 4   Converted                                9233 non-null   int64  
 5   TotalVisits                              9233 non-null   float64
 6   Total Time Spent on Website              9233 non-null   int64  
 7   Page Views Per Visit                     9233 non-null   float64
 8   Country                                  9233 non-null   object 
 9   Receive More Updates About Our Products  9233 non-null   object 
 10  Asymmetrique Activity Index              9233 no

In [66]:
model_data = final_data.drop(['Prospect ID','Lead Number','Do Not Email','Do Not Call',
                              'Country','Receive More Updates About Our Products',
                              'Asymmetrique Activity Index','Asymmetrique Profile Index',
                              'Potential Buyer','Profile','Activity'], axis=1)

In [67]:
model_data.shape, final_data.shape

((9233, 16), (9233, 27))

In [68]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(model_data, test_size=0.3, random_state=2021)

In [69]:
train.shape, test.shape

((6463, 16), (2770, 16))

In [70]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6463 entries, 5368 to 6204
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Converted                    6463 non-null   int64  
 1   TotalVisits                  6463 non-null   float64
 2   Total Time Spent on Website  6463 non-null   int64  
 3   Page Views Per Visit         6463 non-null   float64
 4   Asymmetrique Activity Score  6463 non-null   float64
 5   Asymmetrique Profile Score   6463 non-null   float64
 6   Visit_scaled                 6463 non-null   float64
 7   Time_scaled                  6463 non-null   float64
 8   Profile_High                 6463 non-null   uint8  
 9   Profile_Low                  6463 non-null   uint8  
 10  Profile_Medium               6463 non-null   uint8  
 11  Activity_High                6463 non-null   uint8  
 12  Activity_Low                 6463 non-null   uint8  
 13  Activity_Medium

# MODELING

In [71]:
from sklearn.linear_model import LinearRegression

In [85]:
x_train = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
                      'Asymmetrique Profile Score'],1)
y_train = train['Converted']
x_test = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
                      'Asymmetrique Profile Score'],1)
y_test = train['Converted']

  x_train = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
  x_test = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',


In [86]:
lr = LinearRegression(normalize=True)
lr.fit(x_train,y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [88]:
lr.predict(x_test[2:3])

array([1.])

In [89]:
dt.predict(x_test[2:3])

array([1], dtype=int64)

In [75]:
from sklearn.tree import DecisionTreeClassifier

x_train = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
                      'Asymmetrique Profile Score'],1)
y_train = train['Converted']
x_test = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
                      'Asymmetrique Profile Score'],1)
y_test = train['Converted']

dt = DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=2021)
dt.fit(x_train, y_train)

  x_train = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
  x_test = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',


In [76]:
from sklearn.cluster import KMeans

x_train = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
                      'Asymmetrique Profile Score'],1)
x_test = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
                      'Asymmetrique Profile Score'],1)

kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(x_train)

  x_train = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',
  x_test = train.drop(['Converted','TotalVisits','Total Time Spent on Website','Asymmetrique Activity Score',


# MODELING EVALUATION

In [77]:
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, silhouette_score

In [90]:
y_pred = lr.predict(x_test)
mean_squared_error(y_test, y_pred)

2.3488728615756285e-31

In [82]:
y_pred = dt.predict(x_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

pred = (acc*100, prec*100, recall*100)
pred

(100.0, 100.0, 100.0)

In [80]:
silhouette_score(x_train, kmeans.labels_)*100

73.92620697892383