In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [19]:
# Data Preparation
TARGET = 'converted'

X = df.drop(columns=[TARGET], axis=1)
y = df[TARGET]

features_categorical = X.select_dtypes(include=['object']).columns.to_list()
features_numeric = [col for col in X if col not in features_categorical]

# imputer_categorical = SimpleImputer(strategy='constant', fill_value='NA')
# imputer_numeric = SimpleImputer(strategy='constant', fill_value=0.0)

X[features_categorical] = X[features_categorical].fillna('NA')
X[features_numeric] = X[features_numeric].fillna(0.0)

In [20]:
# Q1: mode value of industry
df.industry.value_counts()

# retail

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
Name: count, dtype: int64

In [22]:
# Q2: What are the two features that have the biggest correlation?
X[features_numeric].corr()

# highest: annual_income x interaction_count

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [23]:
# Split data
SEED = 42
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, train_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, train_size=0.25, random_state=SEED)

In [None]:
# Q3: which categorical feature has the biggest mutual information score
encoder_categorical = OrdinalEncoder()
X_cat_encoded = encoder_categorical.fit_transform(X_train[features_categorical])
mi = mutual_info_classif(X_cat_encoded, y_train, discrete_features=True)
mi_scores = dict(zip(features_categorical, [round(v, 2) for v in mi]))
print(mi_scores)

# industry

{'lead_source': np.float64(0.07), 'industry': np.float64(0.09), 'employment_status': np.float64(0.01), 'location': np.float64(0.06)}


In [32]:
# Q4
# Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

preprocess = ColumnTransformer([
    ('categorical', OneHotEncoder(), features_categorical)
], remainder='passthrough')

pipe = Pipeline([
    ('preprocess', preprocess),
    ('model', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(round(accuracy, 2))

0.75


In [None]:
# Q5
# Let's find the least useful feature using the feature elimination technique.
# Train a model using the same features and parameters as in Q4 (without rounding).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

feature_dict = {}
for feature in X.columns:
    X_train_feature = X_train.drop(columns=[feature])
    X_val_feature = X_val.drop(columns=[feature])
    features_categorical_feature = [col for col in features_categorical if col != feature]

    preprocess = ColumnTransformer([
        ('categorical', OneHotEncoder(), features_categorical_feature)
    ], remainder='passthrough')

    pipe = Pipeline([
        ('preprocess', preprocess),
        ('model', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])

    pipe.fit(X_train_feature, y_train)
    y_pred = pipe.predict(X_val_feature)
    feature_accuracy = accuracy_score(y_val, y_pred)
    accuracy_diff = accuracy - feature_accuracy
    feature_dict[feature] = accuracy_diff

for k,v in feature_dict.items():
    print(f'{k}: {v}')

# lead_score / lead_score / location

lead_source: 0.0
industry: 0.004566210045662045
number_of_courses_viewed: 0.0639269406392694
annual_income: -0.06849315068493156
employment_status: 0.004566210045662045
location: 0.0
interaction_count: 0.004566210045662045
lead_score: 0.0


In [None]:
# Q6
# Now let's train a regularized logistic regression.
# Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
# Train models using all the features as in Q4.
# Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=SEED)

c_vals = [0.01, 0.1, 1, 10, 100]
acc_scores = {}
for c in c_vals:
    pipe.set_params(model=LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=SEED))
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    acc_scores[c] = acc

for k,v in acc_scores.items():
    print(f'{k}: {v}')

# 10

0.01: 0.6986301369863014
0.1: 0.7442922374429224
1: 0.7488584474885844
10: 0.7534246575342466
100: 0.7534246575342466
