In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Load Leads.csv (path is relative to the working directory) and preview the first rows.
df = pd.read_csv("Leads.csv")
df.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [3]:
# Check the shape of the data frame as (rows, columns).

df.shape

(9240, 37)

In [4]:
# Inspect each column's dtype and non-null count with info().

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [32]:
# Percentage of missing values per column, rounded to 3 decimals.
# The denominator is len(df) rather than a hard-coded row count, so the
# figures stay correct if rows are dropped or the input file changes.

(df.isnull().sum() * 100 / len(df)).round(3)

Prospect ID                                       0.000
Lead Number                                       0.000
Lead Origin                                       0.000
Lead Source                                       0.390
Do Not Email                                      0.000
Do Not Call                                       0.000
Converted                                         0.000
TotalVisits                                       1.483
Total Time Spent on Website                       0.000
Page Views Per Visit                              1.483
Last Activity                                     1.115
Country                                          26.634
Specialization                                   15.563
How did you hear about X Education               23.885
What is your current occupation                  29.113
What matters most to you in choosing a course    29.318
Search                                            0.000
Magazine                                        

#### Some of the columns clearly have more than 30% of their data missing

In [54]:
# Check how often the value 'Select' appears in each column; it can be
# considered a null value.
# For every column, compute the share of rows equal to 'Select' (rounded to
# 2 decimals). The share uses the frame's actual length instead of a
# hard-coded row count; float() keeps the entries plain Python floats.
cols = list(df.columns)
selectList = [round(float((df[col] == 'Select').mean()), 2) for col in cols]

# Pair every column with its share. zip() covers all columns, including the
# last one, which the previous range(0, len(selectList)-1) loop skipped.
[(col, share) for col, share in zip(cols, selectList) if share > 0]

[('Specialization', 0.21),
 ('How did you hear about X Education', 0.55),
 ('Lead Profile', 0.45),
 ('City', 0.24)]

#### We can see that four columns contain the value `Select`

In [61]:
# Relative frequency of each category among rows where the index is present.
df['Asymmetrique Activity Index'].dropna().value_counts(normalize=True)

02.Medium    0.764436
01.High      0.163481
03.Low       0.072083
Name: Asymmetrique Activity Index, dtype: float64

In [63]:
# value_counts() excludes NaN by default, so these proportions are over non-null rows only.
df['Asymmetrique Activity Index'].value_counts(normalize=True)

02.Medium    0.764436
01.High      0.163481
03.Low       0.072083
Name: Asymmetrique Activity Index, dtype: float64

In [64]:
# Summary of the column: non-null count, number of distinct values, most frequent value and its frequency.
df['Asymmetrique Activity Index'].describe()

count          5022
unique            3
top       02.Medium
freq           3839
Name: Asymmetrique Activity Index, dtype: object

In [90]:
# Preview the category labels with the leading ordinal prefix ("01.", "02.", ...)
# removed. The prefix is matched by pattern rather than sliced off by position,
# so labels that are already clean are shown unchanged instead of losing their
# first three characters. Missing values are dropped before the preview.
df['Asymmetrique Activity Index'].dropna().str.replace(r'^\d+\.', '', regex=True).unique().tolist()

['Medium', 'High', 'Low']

In [106]:
# List the distinct values of the profile index; unique() includes NaN when present.
df['Asymmetrique Profile Index'].unique()

array(['02.Medium', '01.High', '03.Low', nan], dtype=object)

In [104]:
 # Remove the leading ordinal prefix ("01.", "02.", ...) from the labels.
 # An anchored regex is used instead of slicing x[3:], so the cell is safe to
 # re-run: labels that are already clean stay unchanged rather than being
 # truncated again. Missing values remain NaN.
 df['Asymmetrique Activity Index'] = df['Asymmetrique Activity Index'].str.replace(r'^\d+\.', '', regex=True)

In [107]:
 # Remove the leading ordinal prefix ("01.", "02.", ...) from the labels.
 # An anchored regex is used instead of slicing x[3:], so the cell is safe to
 # re-run: labels that are already clean stay unchanged rather than being
 # truncated again. Missing values remain NaN.
 df['Asymmetrique Profile Index'] = df['Asymmetrique Profile Index'].str.replace(r'^\d+\.', '', regex=True)

In [108]:
# Preview the frame to confirm the index columns no longer carry the numeric prefix.
df.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,Medium,Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,Medium,Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,Medium,High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,Medium,High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,Medium,High,15.0,18.0,No,No,Modified


In [112]:
# Relative frequency of each profile-index category (NaN is excluded by value_counts()).
df['Asymmetrique Profile Index'].value_counts(normalize=True)

Medium    0.555157
High      0.438670
Low       0.006173
Name: Asymmetrique Profile Index, dtype: float64

In [113]:
# Relative frequency of each activity-index category (NaN is excluded by value_counts()).
df['Asymmetrique Activity Index'].value_counts(normalize=True)

Medium    0.764436
High      0.163481
Low       0.072083
Name: Asymmetrique Activity Index, dtype: float64

In [114]:
# Asymmetrique Activity Index is important for the model, so despite its very
# high share of missing values we keep the column and fill the gaps with
# "Medium".
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[...]: that chained in-place form raises a
# warning in recent pandas versions and does not update df under Copy-on-Write.
df['Asymmetrique Activity Index'] = df['Asymmetrique Activity Index'].fillna("Medium")

In [115]:
# Relative frequency of each category after filling the missing values with "Medium".
df['Asymmetrique Activity Index'].value_counts(normalize=True)

Medium    0.871970
High      0.088853
Low       0.039177
Name: Asymmetrique Activity Index, dtype: float64

In [116]:
# Count the missing values left in the column.
df['Asymmetrique Activity Index'].isnull().sum()

0

In [117]:
# we need to take care of 'Asymmetrique Profile Index' missing values