In [47]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# show every column when a DataFrame is displayed (no column truncation);
# use the documented top-level pd.set_option rather than the pd.pandas alias
pd.set_option('display.max_columns', None)

import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas' chained-assignment warnings; consider narrowing the filter.
warnings.simplefilter(action='ignore')

In [48]:
# load the raw dataset from the CSV file in the working directory
data = pd.read_csv('data.csv')
# report (rows, columns), then preview the first five rows
print(data.shape)
data.head()

(8000, 6)


Unnamed: 0.1,Unnamed: 0,mql_id,first_contact_date,landing_page_id,origin,convert
0,0,dac32acd4db4c29c230538b72f8dd87d,2018-02-01,88740e65d5d6b056e0cda098e1ea6313,social,0.0
1,1,8c18d1de7f67e60dbd64e3c07d7e9d5d,2017-10-20,007f9098284a86ee80ddeb25d53e0af8,paid_search,0.0
2,2,b4bc852d233dfefc5131f593b538befa,2018-03-22,a7982125ff7aa3b2054c6e44f9d28522,organic_search,0.0
3,3,6be030b81c75970747525b843c1ef4f8,2018-01-22,d45d558f0daeecf3cccdffe3c59684aa,email,0.0
4,4,5420aad7fec3549a85876ba1c529bd84,2018-02-21,b48ec5f3b04e9068441002a19df93c6c,organic_search,1.0


Unnamed: 0.1,Unnamed: 0,mql_id,first_contact_date,landing_page_id,origin,convert
0,0,dac32acd4db4c29c230538b72f8dd87d,2018-02-01,88740e65d5d6b056e0cda098e1ea6313,social,0.0
1,1,8c18d1de7f67e60dbd64e3c07d7e9d5d,2017-10-20,007f9098284a86ee80ddeb25d53e0af8,paid_search,0.0
2,2,b4bc852d233dfefc5131f593b538befa,2018-03-22,a7982125ff7aa3b2054c6e44f9d28522,organic_search,0.0
3,3,6be030b81c75970747525b843c1ef4f8,2018-01-22,d45d558f0daeecf3cccdffe3c59684aa,email,0.0
4,4,5420aad7fec3549a85876ba1c529bd84,2018-02-21,b48ec5f3b04e9068441002a19df93c6c,organic_search,1.0


## Separate Dataset into Train and Test

Separating the dataset involves randomness, which we control by setting the seed. This makes the split reproducible between our research code and our development code during deployment.

In [49]:
# Hold out 10% of the rows for testing. The fixed random_state is the seed
# that makes the split repeatable.
# Note: the full frame (including 'convert') is passed as X, so both
# X_train and X_test still carry the target column after the split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['convert'],
    test_size=0.1,
    random_state=0,
)

X_train.shape, X_test.shape

((7200, 6), (800, 6))

## Handling missing values : Categorical Variables

We can replace missing values with a new label, "Missing".

In [50]:
# collect the object-dtype (categorical) columns that have at least one
# missing value in the train set

vars_with_na = [
    col for col in data.columns
    if X_train[col].dtypes == 'O' and X_train[col].isnull().any()
]

# fraction (not percentage) of missing values per variable in the train set
X_train[vars_with_na].isnull().mean()

origin    0.007639
dtype: float64

In [51]:
# give missing categorical values their own explicit label, "Missing",
# in both the train and the test frame

for frame in (X_train, X_test):
    frame[vars_with_na] = frame[vars_with_na].fillna('Missing')

In [52]:
# confirm the imputed train columns no longer contain nulls
# (every count shown should be 0)
X_train[vars_with_na].isnull().sum()

origin    0
dtype: int64

In [53]:
# list any imputed column that still has nulls in the test set
# (an empty list means the imputation covered everything)
[col for col in vars_with_na if X_test[col].isnull().any()]

[]

In [54]:
# hand-picked list of the categorical columns that get encoded below
# (the list is hard-coded, not derived from the column dtypes)

cat_vars = ['landing_page_id', 'origin']
cat_vars

['landing_page_id', 'origin']

### Encoding Categorical Variables

In [55]:
def replace_categories(train, test, var, target, unseen_value=None):
    """Ordinal-encode a categorical column by the mean of the target.

    The categories of ``var`` are ranked by the mean of ``target`` computed
    on ``train`` only (lowest mean -> 0, next -> 1, ...). The resulting
    category -> integer mapping is then applied to ``var`` in both frames.
    Both frames are modified in place and nothing is returned.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame the ordering is learned from; must contain ``var`` and
        ``target``.
    test : pandas.DataFrame
        Frame the learned mapping is also applied to; must contain ``var``.
    var : str
        Name of the categorical column to encode.
    target : str
        Name of the column whose per-category mean defines the ordering.
    unseen_value : scalar, optional
        Replacement for values of ``var`` in ``test`` that have no code
        learned from ``train`` (e.g. categories that never occur in
        ``train``). With the default ``None`` those values are left as NaN,
        which is the original behaviour.
    """
    # rank the categories from lowest to highest mean target (train only,
    # so no information from the test set leaks into the encoding)
    ordered_labels = train.groupby(var)[target].mean().sort_values().index

    # category -> integer rank, starting at 0
    ordinal_label = {label: rank for rank, label in enumerate(ordered_labels)}

    # replace the category values by their integer ranks; a value without an
    # entry in the mapping becomes NaN
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)

    # optionally give un-mapped test values an explicit code instead of NaN
    if unseen_value is not None:
        test[var] = test[var].fillna(unseen_value)

In [56]:
# encode each categorical column in place, ordering its categories by the
# mean of 'convert'; the ordering is computed from X_train and applied to
# both X_train and X_test
for var in cat_vars:
    replace_categories(X_train, X_test, var, 'convert')

In [57]:
# inspect the train set after encoding: landing_page_id and origin now
# hold integer codes instead of strings
X_train

Unnamed: 0.1,Unnamed: 0,mql_id,first_contact_date,landing_page_id,origin,convert
5059,5059,32364276cb2f62e1e492f15ca557159c,2017-08-01,410,9,0.0
5359,5359,67a381bd43fbf14a0a122b8ae1bb271a,2018-03-07,404,6,0.0
7286,7286,e38b50b909f7a76965c768c3b64cd9e7,2017-10-26,249,7,0.0
7893,7893,02d3da8fbcddd08b35d3a740f584d899,2018-05-09,388,8,0.0
3998,3998,6058d6b7d1f04803d15d1896d3256c92,2018-03-01,410,7,0.0
...,...,...,...,...,...,...
4931,4931,070659cee3540cd84a4ca2eabd2a694c,2018-01-08,376,7,0.0
3264,3264,e7bf1017621f65d4e7858af08b345bed,2018-02-12,411,7,0.0
1653,1653,51ac520c783d88964a793e455dae3506,2018-01-15,410,9,0.0
2607,2607,709a255d7ae6859551c9cb810d091a7b,2018-01-21,177,7,0.0


In [58]:
# names of the train-set columns that still contain nulls
# (an empty list means there are none)
X_train.columns[X_train.isnull().any()].tolist()

[]

In [79]:
# (rows, columns) of the test set at this point in the notebook
X_test.shape

(800, 6)

In [80]:
# names of the test-set columns that contain nulls; values that had no
# entry in the train-derived encoding were mapped to NaN
X_test.columns[X_test.isnull().any()].tolist()

['landing_page_id']

In [87]:
# Drop the test rows that still contain NaN (values the train-derived
# encoding could not map).
# NOTE(review): this shrinks the test set; encoding unseen categories with a
# dedicated code would keep those rows.
X_test = X_test.dropna()
# keep the separately held labels aligned with the rows that remain
y_test = y_test.loc[X_test.index]

In [88]:
# (rows, columns) of the test set after dropping the rows with nulls
X_test.shape

(785, 6)

In [89]:
# persist the engineered train and test sets, leaving out the index
for frame, path in ((X_train, 'xtrain.csv'), (X_test, 'xtest.csv')):
    frame.to_csv(path, index=False)