In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score

In [56]:
# Course lead-scoring dataset, read straight from its raw GitHub URL.
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(data_url)

# Report (rows, columns), then preview the first records.
print(df.shape)
df.head()

(1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [57]:
# Summarise every column's dtype and non-null count to spot incomplete columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


#### Data Preparation

In [58]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [59]:
# Feature groups used throughout the notebook.
numerical = ['interaction_count', 'lead_score', 'number_of_courses_viewed', 'annual_income']
categorical = ['industry', 'location', 'lead_source', 'employment_status']

# Impute each group with a single column-wise assignment:
#   - gaps in categorical columns become the literal label 'NA'
#   - gaps in numerical columns become 0.0
df[categorical] = df[categorical].fillna('NA')
df[numerical] = df[numerical].fillna(0.0)

In [60]:
# Re-count the missing entries per column to confirm none are left.
remaining_missing = df.isna().sum()
print(remaining_missing)

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


### Question 1 - Most frequent industry (mode)

In [61]:
# Frequency table of the industry labels, kept for the cross-check below.
industry_counts = df['industry'].value_counts()

# mode() returns the most frequent label(s); keep the first entry.
industry_mode = df['industry'].mode().iloc[0]
print(f"Mode: {industry_mode}")

# Cross-check the reported mode against the full frequency table.
print(industry_counts)

Mode: retail
industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64


### Question 2 - Correlation

In [62]:
# Pairwise correlation matrix of the numerical features.
numerical = ['interaction_count', 'lead_score', 'number_of_courses_viewed', 'annual_income']

corr_num = df[numerical].corr()
print(corr_num)

# Feature pairs reported individually, as (row label, column label).
pairs_of_interest = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]
for row_feature, col_feature in pairs_of_interest:
    coefficient = corr_num.loc[row_feature, col_feature]
    print(f"{row_feature} & {col_feature}: {coefficient:.3f}")

                          interaction_count  lead_score  \
interaction_count                  1.000000    0.009888   
lead_score                         0.009888    1.000000   
number_of_courses_viewed          -0.023565   -0.004879   
annual_income                      0.027036    0.015610   

                          number_of_courses_viewed  annual_income  
interaction_count                        -0.023565       0.027036  
lead_score                               -0.004879       0.015610  
number_of_courses_viewed                  1.000000       0.009770  
annual_income                             0.009770       1.000000  
interaction_count & lead_score: 0.010
number_of_courses_viewed & lead_score: -0.005
number_of_courses_viewed & interaction_count: -0.024
annual_income & interaction_count: 0.027


#### Data Split 60%/20%/20%

In [63]:
# Hold out 20% of the rows for testing; the remaining 80% is split again below.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# A quarter of that 80% is 20% of the whole (0.25 * 0.8 = 0.2),
# which leaves 60% of the rows for training.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Report the size of each part and its share of the full dataset.
total_rows = len(df)
for label, subset in (("Train", df_train), ("Validation", df_val), ("Test", df_test)):
    print(f"{label} size: {len(subset)} ({len(subset)/total_rows*100:.1f}%)")

# pop() returns the target column and removes it from the feature frame
# in one step, so 'converted' cannot be used as a feature later.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

Train size: 876 (59.9%)
Validation size: 293 (20.0%)
Test size: 293 (20.0%)


### Question 3 - Mutual Information

In [64]:
# Categorical features to score against the target.
categorical = ['industry', 'location', 'lead_source', 'employment_status']

# Mutual information between each categorical training column and the
# training target, rounded to two decimals.
mi_scores = {
    cat: round(mutual_info_score(df_train[cat], y_train), 2)
    for cat in categorical
}

for cat, score in mi_scores.items():
    print(f"{cat}: {score}")

industry: 0.01
location: 0.0
lead_source: 0.04
employment_status: 0.01


### Question 4 - Logistic Regression + One Hot Encoding

In [65]:
# Columns fed to the model: categorical first, then numerical.
feature_columns = categorical + numerical

# One dict per row, which is the input format handed to DictVectorizer below.
train_dict = df_train[feature_columns].to_dict(orient='records')
val_dict = df_val[feature_columns].to_dict(orient='records')

# Fit the one-hot encoder on the training rows only, then reuse it for validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Logistic regression with C=1.0 and a fixed seed.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Hard class predictions for the validation rows.
y_pred = model.predict(X_val)

# Accuracy is the share of validation rows predicted correctly.
accuracy = np.mean(y_pred == y_val)
accuracy_rounded = round(accuracy, 2)
float(accuracy_rounded)

0.7

### Question 5 - Feature Elimination

In [66]:
# Reference score: validation accuracy obtained in Question 4.
baseline_accuracy = accuracy

# Features to leave out, one at a time.
features_test = ['industry', 'employment_status', 'lead_score']


def _accuracy_without(dropped):
    """Retrain the model without one feature and return its validation accuracy.

    Parameters
    ----------
    dropped : str
        Name of the column to exclude from ``categorical + numerical``.

    Returns
    -------
    Share of validation rows predicted correctly by a model trained on
    the remaining columns.
    """
    kept = [name for name in (categorical + numerical) if name != dropped]

    # A fresh encoder per run, because the set of columns changes each time.
    encoder = DictVectorizer(sparse=False)
    X_train_kept = encoder.fit_transform(df_train[kept].to_dict(orient='records'))
    X_val_kept = encoder.transform(df_val[kept].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(X_train_kept, y_train)

    return (clf.predict(X_val_kept) == y_val).mean()


# Baseline accuracy minus the accuracy reached without each feature.
differences = {
    feature: baseline_accuracy - _accuracy_without(feature)
    for feature in features_test
}

# Print the signed difference for every feature tested.
for feature, diff in differences.items():
    print(f"{feature:25s}: {diff:+.4f}")

industry                 : +0.0000
employment_status        : +0.0034
lead_score               : -0.0068


### Question 6 - Regularized Logistic Regression

In [68]:
# Candidate values for LogisticRegression's C parameter.
C_values = [0.01, 0.1, 1, 10, 100]

# Accuracies are compared after rounding to this many decimals, so that
# differences below that precision count as ties.
ACCURACY_DECIMALS = 3

results = {}
for C in C_values:
    # Same solver settings as the Question 4 model; only C varies.
    model_c = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_c.fit(X_train, y_train)

    # Validation accuracy is the share of rows predicted correctly.
    y_pred_c = model_c.predict(X_val)
    acc = (y_pred_c == y_val).mean()
    results[C] = round(acc, ACCURACY_DECIMALS)

    # Print the exact value stored for the comparison, so the displayed
    # "rounded" figure is the one that decides the winner.
    print(f"C={C:6.2f} → Accuracy={acc:.4f} (rounded: {results[C]})")

# Highest rounded accuracy; ties go to the smallest C.
best_accuracy = max(results.values())
best_C = min(c for c, score in results.items() if score == best_accuracy)

# Report the outcome, which is the purpose of this cell.
print(f"Best C: {best_C} (accuracy: {best_accuracy})")

C=  0.01 → Accuracy=0.6997 (rounded: 0.7)
C=  0.10 → Accuracy=0.6997 (rounded: 0.7)
C=  1.00 → Accuracy=0.6997 (rounded: 0.7)
C= 10.00 → Accuracy=0.6997 (rounded: 0.7)
C=100.00 → Accuracy=0.6997 (rounded: 0.7)
