In [480]:
import pandas as pd

In [481]:
# Data Preparation

# Location of the course lead-scoring CSV that the rest of the notebook works on.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

df = pd.read_csv(DATA_URL)

In [482]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [483]:
# Count the missing values in each column before any imputation.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [484]:
# Partition the column names by dtype: object columns are treated as
# categorical, every other column as numerical.
cat = list(df.select_dtypes(include='object').columns)
num = list(df.select_dtypes(exclude='object').columns)

# Impute missing values: the 'NA' placeholder for categorical columns,
# 0.0 for numerical columns.
df[cat] = df[cat].fillna('NA')
df[num] = df[num].fillna(0.0)

# Confirm that no missing values remain.
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [485]:

# Inspect the first 'industry' values after imputation.
df['industry'].head()

0            NA
1        retail
2    healthcare
3        retail
4     education
Name: industry, dtype: object

In [486]:
# List the distinct 'industry' labels (capped at the first ten).
df['industry'].unique()[:10]

array(['NA', 'retail', 'healthcare', 'education', 'manufacturing',
       'technology', 'other', 'finance'], dtype=object)

In [487]:
# Most frequent observation: the mode of the 'industry' column.

df['industry'].mode()

# Result: the most frequent industry is 'retail'.

0    retail
Name: industry, dtype: object

In [488]:
# Pairwise correlation matrix over the numerical columns.
cor_matrix = df[num].corr()

# Feature pairs whose correlation we want to compare.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

for first, second in pairs:
    coefficient = cor_matrix.loc[first, second]
    print(f"{first} & {second}: {coefficient}")
# Biggest Correlation: annual_income & interaction_count

interaction_count & lead_score: 0.009888182496913131
number_of_courses_viewed & lead_score: -0.004878998354681276
number_of_courses_viewed & interaction_count: -0.023565222882888037
annual_income & interaction_count: 0.02703647240481443


In [489]:
# Split data into train, validate and test using scikit learn

from sklearn.model_selection import train_test_split

In [490]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [491]:
# Split the remaining rows again: 25% of them become the validation set.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [492]:
# Row counts of the train / validation / test partitions.
tuple(len(part) for part in (df_train, df_val, df_test))

(876, 293, 293)

In [493]:
# Renumber every partition from 0, discarding the index inherited from the split.
df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_train, df_val, df_test)
)

In [494]:
# Pull the 'converted' target out of each partition as an array.
y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values

In [495]:
# Remove the target column in place so it cannot end up among the model inputs.
for frame in (df_train, df_val, df_test):
    del frame['converted']

In [496]:
# Mutual information between the target and each categorical feature,
# computed on the training partition only.
from sklearn.metrics import mutual_info_score

for column in cat:
    score = mutual_info_score(y_train, df_train[column])
    print(column, score)
# lead_source has the biggest mutual information score

lead_source 0.024803322681970594
industry 0.006160554077226654
employment_status 0.016344665458739693
location 0.0014525301016858547


In [497]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

# One-hot encoding: each training row becomes a dict, and the vectorizer
# is fitted on those dicts.
from sklearn.feature_extraction import DictVectorizer

feature_columns = categorical + numerical
dicts = df_train[feature_columns].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dicts)
x_train


array([[9.5543e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [5.4924e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [7.7352e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       ...,
       [7.3702e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        1.0000e+00],
       [9.3341e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]], shape=(876, 31))

In [498]:
from sklearn.linear_model import LogisticRegression

# Fit the baseline logistic regression on the encoded training matrix.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [499]:
# Encode the validation rows with the vectorizer fitted on the training data.
feature_columns = categorical + numerical
dicts_val = df_val[feature_columns].to_dict(orient='records')
x_val = dv.transform(dicts_val)

# Probability of the positive class, thresholded at 0.5 for a hard decision.
y_pred = model.predict_proba(x_val)[:, 1]
churn_decision = y_pred >= 0.5

# Share of validation rows whose decision matches the true label.
accuracy_without_rounding = (churn_decision == y_val).mean()
print("accuracy without rounding", accuracy_without_rounding)
print("accuracy", round(accuracy_without_rounding, 2))

# Accuracy is 0.7

accuracy without rounding 0.6996587030716723
accuracy 0.7


In [500]:
# Train model without industry feature
categorical = ['lead_source', 'employment_status', 'location']

# one-hot encoding
from sklearn.feature_extraction import DictVectorizer

feature_columns = categorical + numerical
dicts = df_train[feature_columns].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dicts)

# Same hyper-parameters as the baseline so only the feature set differs.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(x_train, y_train)

# Score the reduced model on the validation partition.
dicts_val = df_val[feature_columns].to_dict(orient='records')
x_val = dv.transform(dicts_val)
y_pred = model.predict_proba(x_val)[:, 1]
churn_decision = y_pred >= 0.5
accuracy_reduced = (churn_decision == y_val).mean()
print("original accuracy", accuracy_without_rounding)
print("accuracy after removing feature 'industry", accuracy_reduced)

original accuracy 0.6996587030716723
accuracy after removing feature 'industry 0.6996587030716723


In [501]:
# Train model without employment_status feature
# NOTE: this cell rebinds categorical, dicts, dv, x_train, model, x_val and
# y_pred, so later cells see the reduced-feature versions.
categorical = ['lead_source', 'industry', 'location']

# one-hot encoding
from sklearn.feature_extraction import DictVectorizer

dicts = df_train[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dicts)

# Same hyper-parameters as the baseline so only the feature set differs.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(x_train, y_train)

# Score the reduced model on the validation partition.
dicts_val = df_val[categorical + numerical].to_dict(orient='records')
x_val = dv.transform(dicts_val)
y_pred = model.predict_proba(x_val)[:, 1]
churn_decision = (y_pred >= 0.5)
print("original accuracy", accuracy_without_rounding)
# The label names the feature that was actually removed in this cell.
print("accuracy after removing feature 'employment_status'", (churn_decision == y_val).mean())

original accuracy 0.6996587030716723
accuracy after removing feature 'industry 0.7030716723549488


In [502]:
# Train model without lead_source feature
# NOTE: this cell rebinds categorical, dicts, dv, x_train, model, x_val and
# y_pred, so later cells see the reduced-feature versions.
categorical = ['industry', 'employment_status', 'location']

# one-hot encoding
from sklearn.feature_extraction import DictVectorizer

dicts = df_train[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dicts)

# Same hyper-parameters as the baseline so only the feature set differs.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(x_train, y_train)

# Score the reduced model on the validation partition.
dicts_val = df_val[categorical + numerical].to_dict(orient='records')
x_val = dv.transform(dicts_val)
y_pred = model.predict_proba(x_val)[:, 1]
churn_decision = (y_pred >= 0.5)
print("original accuracy", accuracy_without_rounding)
# The label names the feature that was actually removed in this cell.
print("accuracy after removing feature 'lead_source'", (churn_decision == y_val).mean())

# Conclusion: of the three features removed, 'industry' changes the
# validation accuracy the least.

original accuracy 0.6996587030716723
accuracy after removing feature 'industry 0.7030716723549488


In [503]:
# Parameter Tuning

# Candidate regularization values and the validation accuracy obtained for
# each of them (same order as Cs).
Cs = [0.01, 0.1, 1, 10, 100]
c_to_acc = []

categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

# Encode train and validation once; every candidate C reuses the same matrices.
feature_columns = categorical + numerical
dicts = df_train[feature_columns].to_dict(orient='records')
dicts_val = df_val[feature_columns].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dicts)
x_val = dv.transform(dicts_val)

for C in Cs:
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    )
    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_val)[:, 1]
    churn_decision = y_pred >= 0.5
    accuracy = (churn_decision == y_val).mean()
    c_to_acc.append(accuracy)

  

In [504]:
# Show the validation accuracies collected for the candidate C values.
print(c_to_acc) 

[np.float64(0.6996587030716723), np.float64(0.6996587030716723), np.float64(0.6996587030716723), np.float64(0.6996587030716723), np.float64(0.6996587030716723)]


In [505]:
# Accuracy is the same for every C, so let's go with the smallest value, 0.01