In [240]:

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression,RidgeClassifier 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
df.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1281.0,1462.0,1462.0,1462.0
mean,2.031464,59886.273224,2.976744,0.506108,0.619015
std,1.449717,15070.140389,1.681564,0.288465,0.485795
min,0.0,13929.0,0.0,0.0,0.0
25%,1.0,49698.0,2.0,0.2625,0.0
50%,2.0,60148.0,3.0,0.51,1.0
75%,3.0,69639.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


In [5]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [6]:
for col in df.columns:
    print(col)
    print(df[col].nunique(), df[col].unique())
    print(df[col].isnull().sum())
    print()

lead_source
5 ['paid_ads' 'social_media' 'events' 'referral' 'organic_search' nan]
128

industry
7 [nan 'retail' 'healthcare' 'education' 'manufacturing' 'technology'
 'other' 'finance']
134

number_of_courses_viewed
10 [1 5 2 3 0 4 6 8 7 9]
0

annual_income
1267 [79450. 46992. 78796. ... 45688. 71016. 92855.]
181

employment_status
4 ['unemployed' 'employed' nan 'self_employed' 'student']
100

location
7 ['south_america' 'australia' 'europe' 'africa' 'middle_east' nan
 'north_america' 'asia']
63

interaction_count
12 [ 4  1  3  6  2  0  5  7  9  8 10 11]
0

lead_score
101 [0.94 0.8  0.69 0.87 0.62 0.83 0.57 0.86 0.43 0.92 0.97 0.71 0.75 0.64
 0.74 0.51 0.33 0.2  0.6  0.49 0.55 0.68 0.63 0.82 0.48 0.54 0.46 0.26
 0.9  0.79 0.21 0.52 0.81 0.06 0.1  0.02 0.15 0.27 0.08 0.76 0.88 0.11
 0.32 0.91 0.42 0.95 0.85 0.34 0.78 0.13 0.23 0.98 0.7  0.18 0.19 0.72
 0.12 0.37 1.   0.38 0.61 0.04 0.5  0.35 0.24 0.3  0.58 0.96 0.22 0.
 0.44 0.05 0.39 0.41 0.99 0.45 0.89 0.25 0.4  0.56 0.01 0.17 0.53 0

In [7]:
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
Name: count, dtype: int64

In [8]:
df.industry.isnull().sum()

134

In [9]:
df.industry.fillna("NA")

0                  NA
1              retail
2          healthcare
3              retail
4           education
            ...      
1457    manufacturing
1458       technology
1459       technology
1460               NA
1461          finance
Name: industry, Length: 1462, dtype: object

In [10]:
fdf = df.fillna({'industry': 'NA',
            'employment_status': 'NA',
            'location': 'NA',
            'lead_source': 'NA',
            'annual_income': 0.0,
            })

In [11]:
fdf.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [12]:
fdf.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [113]:
fdf.converted.value_counts(normalize=True)

converted
1    0.619015
0    0.380985
Name: proportion, dtype: float64

In [116]:
global_converted_rate = fdf.converted.mean()
round(global_converted_rate, 2)

0.62

In [122]:

df_full_train, df_test = train_test_split(fdf, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [123]:
# reset index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
# Separate target variable
y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values
# Remove target variable from feature set
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [124]:
df_full_train.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [125]:
numerical = ['number_of_courses_viewed', 'annual_income',
       'interaction_count', 'lead_score']

In [143]:
categorical = list(fdf.dtypes[df.dtypes == 'object'].index)

In [127]:
df_full_train[categorical].nunique()

lead_source          6
industry             8
employment_status    5
location             8
dtype: int64

In [131]:
df_group = df_full_train.groupby('industry').converted.agg(['mean','count'])
df_group['diff'] = df_group['mean'] - global_converted_rate
df_group['risk'] = df_group['mean'] / global_converted_rate
df_group

Unnamed: 0_level_0,mean,count,diff,risk
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.557522,113,-0.061493,0.90066
education,0.79021,143,0.171195,1.27656
finance,0.556886,167,-0.062129,0.899633
healthcare,0.593333,150,-0.025682,0.958512
manufacturing,0.621429,140,0.002414,1.003899
other,0.612903,155,-0.006112,0.990127
retail,0.566265,166,-0.05275,0.914784
technology,0.562963,135,-0.056052,0.90945


In [136]:
from IPython.display import display

In [138]:
for c in categorical:
    df_group = df_full_train.groupby(c).converted.agg(['mean','count'])
    df_group['diff'] = df_group['mean'] - global_converted_rate
    df_group['risk'] = df_group['mean'] / global_converted_rate
    display(df_group)
    print()
    print()

Unnamed: 0_level_0,mean,count,diff,risk
lead_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.669725,109,0.05071,1.08192
events,0.592965,199,-0.02605,0.957917
organic_search,0.615721,229,-0.003295,0.994678
paid_ads,0.428571,210,-0.190444,0.692344
referral,0.786408,206,0.167393,1.270418
social_media,0.583333,216,-0.035682,0.942357






Unnamed: 0_level_0,mean,count,diff,risk
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.557522,113,-0.061493,0.90066
education,0.79021,143,0.171195,1.27656
finance,0.556886,167,-0.062129,0.899633
healthcare,0.593333,150,-0.025682,0.958512
manufacturing,0.621429,140,0.002414,1.003899
other,0.612903,155,-0.006112,0.990127
retail,0.566265,166,-0.05275,0.914784
technology,0.562963,135,-0.056052,0.90945






Unnamed: 0_level_0,mean,count,diff,risk
employment_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.571429,84,-0.047586,0.923125
employed,0.671937,253,0.052922,1.085493
self_employed,0.635739,291,0.016724,1.027017
student,0.659722,288,0.040707,1.065761
unemployed,0.462451,253,-0.156564,0.747075






Unnamed: 0_level_0,mean,count,diff,risk
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.68,50,0.060985,1.098519
africa,0.592857,140,-0.026158,0.957743
asia,0.588608,158,-0.030407,0.950878
australia,0.612903,155,-0.006112,0.990127
europe,0.641176,170,0.022161,1.035801
middle_east,0.644172,163,0.025157,1.04064
north_america,0.589888,178,-0.029127,0.952946
south_america,0.554839,155,-0.064176,0.896325






In [187]:
fdf[['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [181]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [182]:
mi = df_full_train[categorical].apply(mutual_info_converted_score)
mi.sort_values(ascending=False)

lead_source          0.025665
employment_status    0.013258
industry             0.011685
location             0.002253
dtype: float64

In [183]:
df_full_train[numerical].corrwith(df_full_train.converted).sort_values(ascending=False)

number_of_courses_viewed    0.442068
interaction_count           0.378482
lead_score                  0.225641
annual_income               0.029612
dtype: float64

In [184]:
# One-hot encoding
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
test_dict = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(test_dict)

In [185]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [204]:
X_train_num = df_train[numerical].values

scaler = StandardScaler()
#scaler = MinMaxScaler()

X_train_num = scaler.fit_transform(X_train_num)

In [218]:
ohe = OneHotEncoder( sparse_output=False)
X_train_cat = ohe.fit_transform(df_train[categorical].values)

In [219]:
df_train[categorical]

Unnamed: 0,lead_source,industry,employment_status,location
0,paid_ads,retail,student,middle_east
1,organic_search,manufacturing,student,middle_east
2,paid_ads,technology,employed,north_america
3,,technology,employed,europe
4,organic_search,retail,student,australia
...,...,...,...,...
871,organic_search,other,employed,australia
872,social_media,retail,employed,north_america
873,,education,employed,asia
874,social_media,manufacturing,self_employed,europe


In [225]:
ohe.get_feature_names_out(categorical)

array(['lead_source_NA', 'lead_source_events',
       'lead_source_organic_search', 'lead_source_paid_ads',
       'lead_source_referral', 'lead_source_social_media', 'industry_NA',
       'industry_education', 'industry_finance', 'industry_healthcare',
       'industry_manufacturing', 'industry_other', 'industry_retail',
       'industry_technology', 'employment_status_NA',
       'employment_status_employed', 'employment_status_self_employed',
       'employment_status_student', 'employment_status_unemployed',
       'location_NA', 'location_africa', 'location_asia',
       'location_australia', 'location_europe', 'location_middle_east',
       'location_north_america', 'location_south_america'], dtype=object)

In [226]:
X_train = np.column_stack([X_train_num, X_train_cat])

In [227]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train, y_train)



In [228]:
X_val_num = df_val[numerical].values
X_val_num = scaler.transform(X_val_num)

X_val_cat = ohe.transform(df_val[categorical].values)

X_val = np.column_stack([X_val_num, X_val_cat])

In [233]:
y_pred = model.predict_proba(X_val)[:, 1]
accuracy_baseline = accuracy_score(y_val, y_pred >= 0.5)
accuracy_baseline

0.8532423208191127

In [236]:
def evaluate_without_feature(feature_to_exclude):
    # Get the feature names from the DictVectorizer
    feature_names = dv.get_feature_names_out()
    
    # Find indices of features to keep (all except the ones to exclude)
    exclude_indices = [i for i, feat in enumerate(feature_names) if feature_to_exclude in feat]
    keep_indices = [i for i in range(len(feature_names)) if i not in exclude_indices]
    
    # Create new datasets without the feature
    X_train_reduced = X_train[:, keep_indices]
    X_val_reduced = X_val[:, keep_indices]
    
    # Train the model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_reduced, y_train)
    
    # Evaluate on validation set
    y_val_pred = model.predict(X_val_reduced)
    accuracy = accuracy_score(y_val, y_val_pred)
    
    return accuracy

In [237]:
# Q5 Which of following feature has the smallest difference?
# Evaluate the model without each feature
features_to_test = ['industry', 'employment_status', 'lead_score']
results = {}

for feature in features_to_test:
    accuracy_without_feature = evaluate_without_feature(feature)
    difference = accuracy_baseline - accuracy_without_feature
    results[feature] = {
        'accuracy': accuracy_without_feature,
        'difference': difference
    }
    print(f"Without {feature}: Accuracy = {accuracy_without_feature:.6f}, Difference = {difference:.6f}")

# Find the feature with the smallest difference
min_feature = min(results, key=lambda x: abs(results[x]['difference']))
print(f"\nFeature with smallest impact: {min_feature}, Difference: {results[min_feature]['difference']:.6f}")

Without industry: Accuracy = 0.825939, Difference = 0.027304
Without employment_status: Accuracy = 0.730375, Difference = 0.122867
Without lead_score: Accuracy = 0.853242, Difference = 0.000000

Feature with smallest impact: lead_score, Difference: 0.000000


In [242]:
# Q6: Train regularized logistic regression with different C values

# C values to test
c_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store results
c_results = {}

# Train and evaluate models for each C value
for c in c_values:
    # Train model with current C value
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate on validation set
    y_val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    
    # Store result (rounded to 3 decimal places)
    c_results[c] = round(accuracy, 3)
    
    print(f"C = {c}, Validation Accuracy = {c_results[c]}")

# Find the best C value (if multiple have the same accuracy, take the smallest)
best_accuracy = max(c_results.values())
best_c_values = [c for c, acc in c_results.items() if acc == best_accuracy]
best_c = min(best_c_values)

print(f"\nBest C value: {best_c} with validation accuracy: {c_results[best_c]}")

C = 0.01, Validation Accuracy = 0.84
C = 0.1, Validation Accuracy = 0.857
C = 1, Validation Accuracy = 0.853
C = 10, Validation Accuracy = 0.853
C = 100, Validation Accuracy = 0.853

Best C value: 0.1 with validation accuracy: 0.857
