In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-13 20:18:59--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.6’


2025-10-13 20:19:00 (39.8 MB/s) - ‘course_lead_scoring.csv.6’ saved [80876/80876]



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression

In [3]:
# Column groups used by the rest of the notebook.
categorical_cols = ['lead_source', 'industry', 'employment_status', 'location']
numerical_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Per-column replacement for missing values: the label 'NA' for categorical
# columns and 0.0 for numerical ones. Columns outside both groups are untouched.
fill_values = {
    **dict.fromkeys(categorical_cols, 'NA'),
    **dict.fromkeys(numerical_cols, 0.0),
}

df = pd.read_csv('course_lead_scoring.csv').fillna(value=fill_values)


### Question 1

In [4]:
# Most frequent value(s) of the industry column (missing values were filled with 'NA' earlier).
df['industry'].mode()

0    retail
Name: industry, dtype: object

### Question 2

In [5]:
# Pairwise correlation matrix of the numerical features.
corr = df[numerical_cols].corr()

# Mask the upper triangle and the diagonal so that each pair of features is
# kept only once and the trivial self-correlations are left out.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Long format: one row per remaining (feature, feature) pair.
corr_eff = corr.mask(mask).stack().reset_index()
corr_eff.columns = ['numerical1', 'numerical2', 'corr']

# Strongest positive correlation first.
corr_eff_sorted = corr_eff.sort_values(by='corr', ascending=False)

# `drop` takes a boolean; a string such as 'True' would only work by truthiness.
print(corr_eff_sorted.reset_index(drop=True))


          numerical1                numerical2      corr
0  interaction_count             annual_income  0.027036
1         lead_score             annual_income  0.015610
2         lead_score         interaction_count  0.009888
3      annual_income  number_of_courses_viewed  0.009770
4         lead_score  number_of_courses_viewed -0.004879
5  interaction_count  number_of_courses_viewed -0.023565


### Split the data

In [6]:
	
# 80/20 split into full-train and test, then 75/25 of full-train into train
# and validation (i.e. 60/20/20 of the original rows overall).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give every partition a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() returns the target column and removes it from the feature frame in one step.
# df_full_train is deliberately left untouched and keeps its 'converted' column.
y_train, y_val, y_test = (
    part.pop('converted') for part in (df_train, df_val, df_test)
)

### Question 3

In [7]:
from sklearn.metrics import mutual_info_score 

In [8]:
def mutual_info_churn_score(series):
    """Return the mutual information between ``series`` and ``df_full_train.converted``.

    Despite "churn" in the name, the target used here is the ``converted``
    column of the notebook-level ``df_full_train`` frame.

    Parameters
    ----------
    series : pandas.Series
        A column to score against the target; presumably a categorical column
        of ``df_full_train`` with the same row order -- confirm against callers.

    Returns
    -------
    float
        The value returned by ``sklearn.metrics.mutual_info_score``.
    """
    target = df_full_train.converted
    return mutual_info_score(series, target)

In [9]:
# Score every categorical column against the target (one value per column).
mi = df_full_train[categorical_cols].apply(mutual_info_churn_score)

# Highest score first, rounded to two decimals for display.
mi.sort_values(ascending=False).round(2)

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

### Question 4

In [10]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row of the train and validation frames into a {column: value} dict,
# the input format DictVectorizer expects.
train_dics, val_dics = (
    frame[categorical_cols + numerical_cols].to_dict(orient='records')
    for frame in (df_train, df_val)
)

# The vectorizer is fitted on the training rows only and then reused, unchanged,
# to encode the validation rows.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dics)
X_val = dv.transform(val_dics)

In [11]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42).fit(X_train, y_train)

# Last expression of the cell: displays the fitted model's parameters.
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [12]:
# predict_proba returns one column per class; keep the second column
# (presumably the probability of converted == 1 -- confirm via model.classes_).
val_probabilities = model.predict_proba(X_val)
y_pred = val_probabilities[:, 1]

In [13]:
# Boolean prediction: True where the predicted probability reaches the 0.5 threshold.
converted_decision = y_pred >= 0.5

In [14]:
# Share of validation rows whose thresholded prediction equals the label,
# rounded to two decimals.
accuracy = round((y_val == converted_decision).mean(), 2)
accuracy

np.float64(0.7)

### Question 5

In [37]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Features fed to the leave-one-out comparison, in the order they are reported.
features = [
    # categorical
    'lead_source',
    'industry',
    'location',
    'employment_status',
    # numerical
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

def check_feature_effect(features):
    """Print how validation accuracy changes when each feature is left out.

    A logistic regression is first trained on all of ``features`` to obtain a
    baseline validation accuracy. Then, for every feature in turn, the model is
    retrained without that feature and ``baseline - accuracy`` is printed
    (a positive value means accuracy dropped when the feature was removed).

    Reads the notebook-level ``df_train``, ``df_val``, ``y_train`` and ``y_val``.

    Parameters
    ----------
    features : list of str
        Column names to select from ``df_train`` and ``df_val``.

    Returns
    -------
    None
        Results are only printed, one line per excluded feature.
    """

    def validation_accuracy(columns):
        # Fresh vectorizer and model for this column subset: fit on the
        # training rows, score on the validation rows at a 0.5 threshold.
        vectorizer = DictVectorizer(sparse=False)
        X_tr = vectorizer.fit_transform(df_train[columns].to_dict(orient='records'))
        X_va = vectorizer.transform(df_val[columns].to_dict(orient='records'))

        clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
        clf.fit(X_tr, y_train)

        decision = clf.predict_proba(X_va)[:, 1] >= 0.5
        return np.mean(decision == y_val)

    baseline_accuracy = validation_accuracy(features)

    for feature in features:
        reduced = [f for f in features if f != feature]
        accuracy = validation_accuracy(reduced)
        print(f"Exclude '{feature}': accuracy diff = {baseline_accuracy - accuracy:.3f}")


check_feature_effect(features)

Exclude 'lead_source': accuracy diff = -0.003
Exclude 'industry': accuracy diff = 0.000
Exclude 'location': accuracy diff = -0.010
Exclude 'employment_status': accuracy diff = 0.003
Exclude 'number_of_courses_viewed': accuracy diff = 0.143
Exclude 'annual_income': accuracy diff = -0.154
Exclude 'interaction_count': accuracy diff = 0.143
Exclude 'lead_score': accuracy diff = -0.007


### Question 6

In [38]:
# Values of LogisticRegression's C parameter to compare in the cells that follow.
c_vals = [
    0.01,
    0.1,
    1,
    10,
    100,
]

In [39]:
# Target values for the full-train portion (train + validation rows together).
y_full_train = df_full_train['converted'].values

# Row dicts restricted to the model features, so 'converted' is not used as an input.
dicts_full_train = df_full_train[categorical_cols + numerical_cols].to_dict(orient='records')

# A new vectorizer fitted on the full-train rows; this rebinds the notebook-level `dv`.
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

In [40]:
# The encoded test matrix does not depend on C, so build it once before the loop.
dicts_test = df_test[categorical_cols + numerical_cols].to_dict(orient='records')
X_test = dv.transform(dicts_test)

# NOTE(review): the C values are compared on the test split here; if this is
# meant as hyperparameter selection, the validation split would be the usual
# choice -- confirm the intent.
for c in c_vals:
    # Pass the loop variable `c`. The earlier `C=C` referred to a name that no
    # visible cell assigns, so it either raised NameError on a fresh kernel or
    # silently reused one stale value for every iteration.
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_full_train, y_full_train)

    y_pred = model.predict_proba(X_test)[:, 1]
    converted_decision = (y_pred >= 0.5)
    accuracy = (y_test == converted_decision).mean()
    print(f'C={c}: accuracy={round(accuracy,3)}')

C=0.01: accuracy=0.737
C=0.1: accuracy=0.737
C=1: accuracy=0.737
C=10: accuracy=0.737
C=100: accuracy=0.737
