In [4]:
# ML Zoomcamp Module 3 Homework - Tristan Kilper

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [2]:
# Dataset

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")

In [4]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# Data Preparation

In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
df[['lead_source','industry','employment_status','location']] = df[['lead_source','industry','employment_status','location']].fillna('NA')

In [8]:
df[['annual_income']] = df[['annual_income']].fillna(0.0)

In [9]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [10]:
# Q1

In [11]:
df[['industry']].value_counts()

industry     
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [12]:
# Q2

In [13]:
correlation_matrix = df[['lead_score','interaction_count','number_of_courses_viewed','annual_income']].corr()
correlation_matrix

Unnamed: 0,lead_score,interaction_count,number_of_courses_viewed,annual_income
lead_score,1.0,0.009888,-0.004879,0.01561
interaction_count,0.009888,1.0,-0.023565,0.027036
number_of_courses_viewed,-0.004879,-0.023565,1.0,0.00977
annual_income,0.01561,0.027036,0.00977,1.0


In [14]:
# Split the data

In [15]:
from sklearn.model_selection import train_test_split

In [88]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

In [73]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [17]:
# Q3

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
mis = mutual_info_score(df_full_train.converted, df_full_train.industry)
round(mis, 2)

0.01

In [20]:
mis = mutual_info_score(df_full_train.converted, df_full_train.location)
round(mis, 2)

0.0

In [21]:
mis = mutual_info_score(df_full_train.converted, df_full_train.lead_source)
round(mis, 2)

0.03

In [22]:
mis = mutual_info_score(df_full_train.converted, df_full_train.employment_status)
round(mis, 2)

0.01

In [23]:
# Q4

In [24]:
from sklearn.feature_extraction import DictVectorizer

In [74]:
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

In [75]:
dv = DictVectorizer(sparse=False)

In [76]:
X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)

In [77]:
from sklearn.linear_model import LogisticRegression

In [78]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'liblinear'


In [95]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.63785336, 0.8132004 , 0.51896655, 0.47091749, 0.59122001,
       0.42785561, 0.86157989, 0.84175994, 0.83288585, 0.61107442,
       0.55508993, 0.79017496, 0.68357281, 0.78217476, 0.51833041,
       0.92038349, 0.51843093, 0.41400424, 0.32179776, 0.84459395,
       0.80695169, 0.74937652, 0.4306413 , 0.66773061, 0.44466428,
       0.74808991, 0.90589607, 0.34090656, 0.41738574, 0.96836958,
       0.91705804, 0.3782163 , 0.65463231, 0.90006858, 0.74396924,
       0.63354061, 0.82472668, 0.82694326, 0.65551402, 0.32830565,
       0.78300491, 0.34989916, 0.963914  , 0.64959683, 0.54092075,
       0.55727835, 0.81782093, 0.73723895, 0.72415099, 0.6870885 ,
       0.49137171, 0.84228422, 0.58407574, 0.92914276, 0.64424511,
       0.62307229, 0.6293298 , 0.27960934, 0.49257772, 0.56473308,
       0.3779644 , 0.6136367 , 0.38741484, 0.59768185, 0.8625757 ,
       0.75422888, 0.8919823 , 0.72495259, 0.95271333, 0.88703102,
       0.77864286, 0.33357231, 0.62850943, 0.51501878, 0.63835

In [96]:
converted_decision = (y_pred >= 0.5)

In [90]:
acc_con = (y_val == converted_decision).mean()

In [91]:
(y_val == converted_decision).mean().round(2)

np.float64(0.7)

In [33]:
# Q5

In [35]:
df_train.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score'],
      dtype='str')

In [36]:
no_ind = ['lead_source', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score']
no_emp_stat = ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'location', 'interaction_count', 'lead_score']
no_lead_score = ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count']

In [92]:
train_dicts = df_train[no_ind].to_dict(orient='records')
val_dicts = df_val[no_ind].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
converted_decision = (y_pred >= 0.5)
acc_test_1 = (y_val == converted_decision).mean()
abs(acc_con - acc_test_1)

np.float64(0.0)

In [94]:
train_dicts = df_train[no_emp_stat].to_dict(orient='records')
val_dicts = df_val[no_emp_stat].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
converted_decision = (y_pred >= 0.5)
acc_test_2 = (y_val == converted_decision).mean()
abs(acc_con - acc_test_2)

np.float64(0.0034129692832763903)

In [93]:
train_dicts = df_train[no_lead_score].to_dict(orient='records')
val_dicts = df_val[no_lead_score].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
converted_decision = (y_pred >= 0.5)
acc_test_3 = (y_val == converted_decision).mean()
abs(acc_con - acc_test_3)

np.float64(0.0068259385665528916)

In [56]:
# Q6

In [98]:
cs = [0.01, 0.1, 1, 10, 100]
accs = []
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)
for c in cs:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    converted_decision = (y_pred >= 0.5)
    accs.append((y_val == converted_decision).mean())
accs

[np.float64(0.6996587030716723),
 np.float64(0.6996587030716723),
 np.float64(0.6996587030716723),
 np.float64(0.6996587030716723),
 np.float64(0.6996587030716723)]