In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [177]:
# Course lead-scoring dataset, read directly from its public CSV location.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(DATA_URL)

In [178]:
# Show each column's dtype; the feature lists defined in the next cell are
# grouped along the object / numeric divide visible here.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [179]:
# Feature groups used throughout the notebook.
numerical = [
    "number_of_courses_viewed",
    "annual_income",
    "interaction_count",
    "lead_score",
]
categorial = ['industry', 'employment_status', 'location', 'lead_source']

# Replace missing values: 0.0 for numeric features, the literal 'NA' label for
# categorical ones (so "missing" becomes its own category).
for columns, fill_value in ((numerical, 0.0), (categorial, 'NA')):
    df[columns] = df[columns].fillna(fill_value)

# Sanity check: the 'NA' bucket should now show up among the industries.
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [180]:
# Pairwise correlation matrix restricted to the numeric feature columns.
df[numerical].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [181]:
from sklearn.model_selection import train_test_split
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the full data) for validation. Both splits use the same seed.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Move the target out of each feature frame so it cannot be used as a feature.
# `pop` returns the column and removes it from the frame in one step.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

In [182]:
from sklearn.metrics import mutual_info_score
# Guard: the target must never be scored against itself.
if "converted" in categorial:
    categorial.remove("converted")

# Mutual information between the training target and each categorical feature.
for catg in categorial:
    score = mutual_info_score(y_train, df_train[catg])
    print(f"for {catg}: {round(score,2)}")

for industry: 0.01
for employment_status: 0.01
for location: 0.0
for lead_source: 0.04


In [183]:
from sklearn.feature_extraction import DictVectorizer
def prepare_x(input_df, fit_df=None):
    """Encode a DataFrame into a dense feature matrix with ``DictVectorizer``.

    Each row is converted to a ``{column: value}`` dict and passed through a
    ``DictVectorizer(sparse=False)``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Rows to encode.
    fit_df : pandas.DataFrame, optional
        Frame the vectorizer is fitted on. Defaults to ``input_df`` itself,
        which is the original behaviour. Pass the *training* frame when
        encoding validation or test rows: a vectorizer fitted on a different
        frame is not guaranteed to produce the same columns as the matrix the
        model was trained on.

    Returns
    -------
    numpy.ndarray
        Dense matrix with one row per row of ``input_df`` and one column per
        feature learned from the fitting frame.
    """
    dicts = input_df.to_dict(orient="records")
    # Learn the feature mapping from `fit_df` when given, otherwise from the
    # rows being encoded.
    fit_dicts = dicts if fit_df is None else fit_df.to_dict(orient="records")

    dv = DictVectorizer(sparse=False)
    dv.fit(fit_dicts)
    return dv.transform(dicts)

from sklearn.linear_model import LogisticRegression
# Encode the training rows, then fit the baseline classifier on all features.
X_train = prepare_x(df_train)
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [156]:
# Encode the validation rows with a vectorizer fitted on the TRAINING frame.
# The model was trained on a matrix whose columns were learned from df_train;
# fitting a separate vectorizer on df_val would let the validation matrix end
# up with different columns whenever a category is absent from one split.
dv = DictVectorizer(sparse=False)
dv.fit(df_train.to_dict(orient="records"))
X_val = dv.transform(df_val.to_dict(orient="records"))

# Probability of the positive class (converted == 1) for each validation row.
y_pred = model.predict_proba(X_val)[:, 1]

def calculate_accuracy(y_pred, y):
    """Return the share of predictions that match the true labels.

    Probabilities are turned into hard 0/1 predictions with a 0.5 threshold
    (``>= 0.5`` counts as positive) and compared element-wise with ``y``.

    Parameters
    ----------
    y_pred : array-like of float
        Predicted probabilities of the positive class.
    y : array-like of int
        True binary labels, positionally aligned with ``y_pred``.

    Returns
    -------
    numpy.float64
        Fraction of positions where the thresholded prediction equals the label.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``y`` do not have the same length.
    """
    # Work on plain arrays so lists are accepted and no index alignment occurs.
    probs = np.asarray(y_pred)
    actual = np.asarray(y)
    if probs.shape[0] != actual.shape[0]:
        raise ValueError(
            f"y_pred and y must have the same length, got {probs.shape[0]} and {actual.shape[0]}"
        )

    # Compare against the `y` argument; the previous version read the global
    # `y_val` here, silently ignoring whatever labels the caller passed in.
    prediction = (probs >= 0.5).astype(int)
    return (prediction == actual).mean()

# Validation accuracy of the fitted model, rounded to two decimals for display.
accuracy = calculate_accuracy(y_pred, y_val)
round(accuracy,2)

np.float64(0.7)

In [186]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Baseline: fit on ALL features inside this cell and score on validation.
# Previously `acc` was computed from whatever `y_pred` an earlier cell had left
# in the kernel, so re-running this cell compared against the last loop
# iteration instead of the full-feature model.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train.to_dict(orient="records"))
X_val = dv.transform(df_val.to_dict(orient="records"))  # same columns as X_train
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
acc = calculate_accuracy(y_pred, y_val)

# Leave-one-feature-out: retrain without each feature and report how far the
# validation accuracy moves from the baseline.
for feat in ['industry', 'lead_source', 'employment_status']:
    df_train_dropped = df_train.drop(columns=[feat])
    df_val_dropped = df_val.drop(columns=[feat])

    # One vectorizer per feature set, fitted on the training rows only.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(df_train_dropped.to_dict(orient="records"))
    X_val = dv.transform(df_val_dropped.to_dict(orient="records"))

    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    feat_acc = calculate_accuracy(y_pred, y_val)
    print(feat, abs(feat_acc - acc))

industry 0.0034129692832765013
lead_source 0.0
employment_status 0.0068259385665528916


In [None]:
# Candidate values for C, the regularization parameter passed to LogisticRegression.
C_list = [0.01, 0.1, 1, 10, 100]

# The feature matrices do not depend on C, so build them once. The vectorizer
# is fitted on the training rows only and reused for validation so both
# matrices share the same columns.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train.to_dict(orient="records"))
X_val = dv.transform(df_val.to_dict(orient="records"))

for C in C_list:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]
    accuracy = calculate_accuracy(y_pred, y_val)
    print(C, round(accuracy,3))