In [1]:
!which python

/opt/anaconda3/envs/ml_/bin/python


In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

%matplotlib inline

In [3]:
df = pd.read_csv('../data/course_lead_scoring.csv')

In [4]:
df.head(3)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1


In [19]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [5]:
df.converted.unique()

array([1, 0])

#### 0: Fix missing values issues

In [7]:
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [15]:
df.columns.to_list()

['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score',
 'converted']

In [16]:
categorical = [
    'lead_source',
     'industry',
     'employment_status',
     'location'
]

In [17]:
numerical = [
    'number_of_courses_viewed',
     'annual_income',
    'interaction_count',
     'lead_score',
]

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [29]:
df.dtypes[df.dtypes == 'object'].index.to_list()

['lead_source', 'industry', 'employment_status', 'location']

In [32]:
for c in df.columns:
    if c in categorical:
        df[c] = df[c].fillna('NA')
    else:
        df[c] = df[c].fillna(0.0)

In [33]:
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [34]:
df.head(3)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1


In [35]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

#### Q1:

#### Mode of industry

In [36]:
df.industry.mode()

0    retail
Name: industry, dtype: object

In [37]:
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

#### Q2:

In [42]:
# Compute correlation matrix
corr_df = df[numerical].corr()

In [43]:
corr_df

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [44]:
corr_df.abs()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,0.023565,0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,0.023565,0.027036,1.0,0.009888
lead_score,0.004879,0.01561,0.009888,1.0


In [45]:
# Set diagonal to 0 to ignore self-correlation
np.fill_diagonal(corr_df.values, 0)

In [46]:
corr_df

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,0.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,0.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,0.0,0.009888
lead_score,-0.004879,0.01561,0.009888,0.0


In [47]:
corr_df.values

array([[ 0.        ,  0.00977029, -0.02356522, -0.004879  ],
       [ 0.00977029,  0.        ,  0.02703647,  0.01560955],
       [-0.02356522,  0.02703647,  0.        ,  0.00988818],
       [-0.004879  ,  0.01560955,  0.00988818,  0.        ]])

In [48]:
# Find the index (i,j) of the maximum correlation value
max_corr = corr_df.unstack().idxmax()
max_value = corr_df.unstack().max()

print(f"Highest correlation is between: {max_corr[0]} and {max_corr[1]} = {max_value:.3f}")

Highest correlation is between: annual_income and interaction_count = 0.027


In [57]:
corr_df.unstack()

number_of_courses_viewed  number_of_courses_viewed    0.000000
                          annual_income               0.009770
                          interaction_count          -0.023565
                          lead_score                 -0.004879
annual_income             number_of_courses_viewed    0.009770
                          annual_income               0.000000
                          interaction_count           0.027036
                          lead_score                  0.015610
interaction_count         number_of_courses_viewed   -0.023565
                          annual_income               0.027036
                          interaction_count           0.000000
                          lead_score                  0.009888
lead_score                number_of_courses_viewed   -0.004879
                          annual_income               0.015610
                          interaction_count           0.009888
                          lead_score                  0

In [50]:
corr_df.unstack().idxmax()

('annual_income', 'interaction_count')

#### Let's split dataset into train, val and test

In [58]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [59]:
len(df_full_train), len(df_test)

(1169, 293)

In [60]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [62]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [64]:
# Reset index to make dfs look cleaner
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [65]:
# Get the target values
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [66]:
# Delete the target column
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [67]:
df_test.head(3)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,social_media,manufacturing,2,56070.0,self_employed,middle_east,2,0.23
1,,other,1,78409.0,,australia,4,0.79
2,referral,manufacturing,2,66206.0,employed,australia,3,0.3


#### Q3: 

In [68]:
# Mututal INformation betwwen converted and other categorical columns to see which is more important

In [75]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [76]:
mi = df_full_train[categorical].apply(mutual_info_converted_score)

In [77]:
mi.sort_values(ascending=False) #Showing importance from highest

lead_source          0.025665
employment_status    0.013258
industry             0.011685
location             0.002253
dtype: float64

In [78]:
# lead_source

#### Q4:

In [107]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [82]:
train_dicts[:2]

[{'lead_source': 'paid_ads',
  'industry': 'retail',
  'employment_status': 'student',
  'location': 'middle_east',
  'number_of_courses_viewed': 0,
  'annual_income': 58472.0,
  'interaction_count': 5,
  'lead_score': 0.03},
 {'lead_source': 'organic_search',
  'industry': 'manufacturing',
  'employment_status': 'student',
  'location': 'middle_east',
  'number_of_courses_viewed': 3,
  'annual_income': 71738.0,
  'interaction_count': 6,
  'lead_score': 0.77}]

In [134]:
val_dicts[:2]

[{'lead_source': 'paid_ads',
  'industry': 'healthcare',
  'employment_status': 'unemployed',
  'location': 'europe',
  'number_of_courses_viewed': 3,
  'annual_income': 52220.0,
  'interaction_count': 1,
  'lead_score': 0.07},
 {'lead_source': 'organic_search',
  'industry': 'technology',
  'employment_status': 'unemployed',
  'location': 'middle_east',
  'number_of_courses_viewed': 3,
  'annual_income': 59656.0,
  'interaction_count': 4,
  'lead_score': 0.65}]

In [88]:
dv = DictVectorizer(sparse=False) #Simply return a numpy array and not a sparse matrix

In [212]:
X_train = dv.fit_transform(train_dicts)

In [213]:
X_train

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [214]:
y_train[:10]

array([0, 1, 1, 1, 1, 1, 0, 1, 0, 0])

In [215]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [216]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [217]:
# Let's create X_val

X_val = dv.transform(val_dicts)

In [230]:
# Using validation df

y_preds = model.predict_proba(X_train)

In [231]:
y_preds[:10]

array([[0.42085657, 0.57914343],
       [0.12716509, 0.87283491],
       [0.41183894, 0.58816106],
       [0.48768789, 0.51231211],
       [0.36333525, 0.63666475],
       [0.24478932, 0.75521068],
       [0.32339268, 0.67660732],
       [0.19536109, 0.80463891],
       [0.68068765, 0.31931235],
       [0.47663295, 0.52336705]])

In [232]:
y_preds = y_preds[:,1]

In [233]:
converted_decision = (y_preds >= 0.5)

In [234]:
# Let's use mean to check match and accuracy
_acc = (y_train == converted_decision).mean()

In [235]:
_acc

np.float64(0.7385844748858448)

In [236]:
np.round(_acc, 2).item()

0.74

#### Q5:

In [152]:
feature_names = df.columns != 'converted'

In [158]:
features = list(df.columns[feature_names])

In [159]:
features

['lead_source',
 'industry',
 'number_of_courses_viewed',
 'annual_income',
 'employment_status',
 'location',
 'interaction_count',
 'lead_score']

In [162]:
y_val[:10]

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0])

In [249]:
def multple_accuracies():
    accuracies = []
    diffs = []
    for f in features:
        result = [x for x in features if x != f]
        train_dicts = df_train[result].to_dict(orient='records')
        val_dicts = df_val[result].to_dict(orient='records')
    
        dv = DictVectorizer(sparse=False)
        X_train = dv.fit_transform(train_dicts)
        X_val = dv.transform(val_dicts)

        model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42) #Same num of features for each loop
        model.fit(X_train, y_train)
        
        y_preds = model.predict_proba(X_train) #### X_val
        y_preds = y_preds[:,1]
        converted_decision = (y_preds >= 0.5)

        acc = (y_train == converted_decision).mean() ###y_val
        # acc = np.round(acc, 2).item()
        diff = np.abs(_acc - acc)
        diffs.append(diff)
        
        accuracies.append((f, acc, diff)) # Feature that was removed and accuracy without it
        

    return accuracies, diffs

In [250]:
accuracies, diffs = multple_accuracies()

In [255]:
# This means removing 'annual_income' actually improved model accuracy and removing 
# 'industry' doesnt really do much to the model, so it is quite insignificant

In [None]:
#Note: We use train df throughout for train and test for accuracy...

In [252]:
np.min(diffs)

np.float64(0.0022831050228310223)

In [254]:
np.max(diffs)

np.float64(0.13242009132420085)

In [253]:
accuracies

[('lead_source',
  np.float64(0.7420091324200914),
  np.float64(0.003424657534246589)),
 ('industry',
  np.float64(0.7408675799086758),
  np.float64(0.0022831050228310223)),
 ('number_of_courses_viewed',
  np.float64(0.6232876712328768),
  np.float64(0.11529680365296802)),
 ('annual_income',
  np.float64(0.8710045662100456),
  np.float64(0.13242009132420085)),
 ('employment_status',
  np.float64(0.7351598173515982),
  np.float64(0.003424657534246589)),
 ('location',
  np.float64(0.7420091324200914),
  np.float64(0.003424657534246589)),
 ('interaction_count',
  np.float64(0.6255707762557078),
  np.float64(0.113013698630137)),
 ('lead_score',
  np.float64(0.7420091324200914),
  np.float64(0.003424657534246589))]

#### Q6: