In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

In [2]:
# Lift the column display limit so wide frames are shown without truncation
pd.options.display.max_columns = None

In [3]:
# Read the leads data from the folds CSV, then display its (rows, columns)
df = pd.read_csv(filepath_or_buffer="../input/train_folds.csv")
df.shape

(9240, 38)

In [4]:
# "Select" entries are treated as equivalent to missing values in this dataset,
# so convert every occurrence to NaN across the whole frame.
df = df.replace(to_replace="Select", value=np.nan)

In [5]:
# Discard the columns that are irrelevant to the analysis, then show the new shape
df = df.drop(columns=['Prospect ID', 'Lead Number'])
df.shape

(9240, 36)

In [6]:
# Drop the columns with more than 25% missing values.
# `threshold` is the minimum fraction of non-null entries a column needs in
# order to be kept (0.75 non-null <=> at most 25% missing).
threshold = 0.75
# DataFrame.dropna documents `thresh` as an integer count of non-NA values, so
# round the fractional row count up rather than passing a float. Rounding up
# keeps exactly the same columns as the float comparison `count >= thresh`.
min_non_null = int(np.ceil(threshold * df.shape[0]))
df = df.dropna(thresh=min_non_null, axis=1)
df.shape

(9240, 23)

In [7]:
# Remove the columns that contain only one distinct non-null value
distinct_counts = df.nunique()  # NaN is excluded from the count by default
single_value_columns = distinct_counts[distinct_counts == 1].index.tolist()
df = df.drop(columns=single_value_columns)
df.shape

(9240, 18)

In [8]:
# Number of missing entries remaining in each column
df.isnull().sum()

Lead Origin                                 0
Lead Source                                36
Do Not Email                                0
Do Not Call                                 0
Converted                                   0
TotalVisits                               137
Total Time Spent on Website                 0
Page Views Per Visit                      137
Last Activity                             103
Search                                      0
Newspaper Article                           0
X Education Forums                          0
Newspaper                                   0
Digital Advertisement                       0
Through Recommendations                     0
A free copy of Mastering The Interview      0
Last Notable Activity                       0
kfold                                       0
dtype: int64

In [9]:
# Rows whose `kfold` equals `run_fold` become `test`; all other rows become `train`
run_fold = 0

in_run_fold = df["kfold"] == run_fold
test = df.loc[in_run_fold]
train = df.loc[~in_run_fold]

print(train.shape, test.shape)

(7392, 18) (1848, 18)


In [10]:
# Impute missing values with each column's most frequent value.
# The imputer is fitted on the training rows only and then re-used, unchanged,
# on the test rows.
mode_imputer = SimpleImputer(strategy='most_frequent')

# SimpleImputer returns a bare NumPy array (object dtype for mixed-type input),
# which drops the column labels and turns numeric columns into generic objects.
# Rebuild DataFrames and cast every column back to the dtype it had before
# imputation. The right-hand sides are fully evaluated before `train`/`test`
# are rebound, so `.columns` / `.dtypes` still refer to the pre-imputation frames.
train = pd.DataFrame(
    mode_imputer.fit_transform(train), columns=train.columns
).astype(train.dtypes.to_dict())
test = pd.DataFrame(
    mode_imputer.transform(test), columns=test.columns
).astype(test.dtypes.to_dict())

In [11]:
# Wrap the imputed train/test data in DataFrames labelled with df's column names
train, test = (pd.DataFrame(part, columns=df.columns) for part in (train, test))

In [12]:
# Sanity check: total number of missing cells left in train and in test
print(*(part.isna().sum().sum() for part in (train, test)))

0 0
