In [2]:
from typing import Optional

import numpy as np
import pandas as pd
import seaborn as sns

# Load the lead-scoring dataset directly from its public GitHub location.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv",
)

# Display the loaded frame as the cell output.
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [9]:
# Count the missing values in every column before any imputation is applied.
sumOfMissingValues = df.isnull().sum()
# A bare assignment renders nothing; end the cell with the Series so the
# per-column counts are actually displayed.
sumOfMissingValues

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [19]:
# Keep the per-column dtypes in a Series (index = column name); later cells
# filter this Series to separate categorical from numerical columns.
dfTypes = df.dtypes
dfTypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [35]:
# Split the dtype table by kind: object-typed columns are treated as
# categorical, 64-bit float/int columns as numerical.
isCategorical = dfTypes == object
isNumerical = (dfTypes == 'float64') | (dfTypes == 'int64')
categoricalTypes = dfTypes[isCategorical]
numericalTypes = dfTypes[isNumerical]

# Impute missing values column by column: 'NA' for categorical columns,
# 0.0 for numerical ones.
for columnNames, filler in ((categoricalTypes.index, 'NA'), (numericalTypes.index, 0.0)):
    for column in columnNames:
        df[column] = df[column].fillna(filler)

# Re-count the missing values to confirm nothing is left unfilled.
sumOfMissingValues = df.isnull().sum()
sumOfMissingValues

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [37]:
#q1
# Most frequent value of every column; the question is about `industry`.
# NOTE(review): if the imputation cell ran before this one, the 'NA' filler
# also counts as a candidate value for the mode — confirm the run order.
df.mode()
##answer: retail

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,organic_search,retail,1,0.0,self_employed,north_america,3,0.6,1


In [44]:
#q2
# What are the two features that have the biggest correlation?

# interaction_count and lead_score // 0.00988818249691311
# number_of_courses_viewed and lead_score // -0.004878998354681267
# number_of_courses_viewed and interaction_count // -0.023565222882887937
# annual_income and interaction_count // 0.027036472404814355

# Pairwise correlation between every two distinct numerical columns, stored as
# {column: {other_column: correlation}}; the diagonal (a column with itself)
# is left out.
correlationMatrix = {
    first: {
        second: df[first].corr(df[second])
        for second in numericalTypes.index
        if second != first
    }
    for first in numericalTypes.index
}

correlationMatrix
#answer:  annual_income and interaction_count // 0.027036472404814355 is the biggest out of the pairs above
    
    

{'number_of_courses_viewed': {'annual_income': np.float64(0.009770285756444645),
  'interaction_count': np.float64(-0.023565222882887937),
  'lead_score': np.float64(-0.004878998354681267),
  'converted': np.float64(0.43591365802117915)},
 'annual_income': {'number_of_courses_viewed': np.float64(0.009770285756444645),
  'interaction_count': np.float64(0.027036472404814355),
  'lead_score': np.float64(0.015609546050138912),
  'converted': np.float64(0.05313144169625197)},
 'interaction_count': {'number_of_courses_viewed': np.float64(-0.02356522288288794),
  'annual_income': np.float64(0.027036472404814355),
  'lead_score': np.float64(0.00988818249691311),
  'converted': np.float64(0.3745725177994037)},
 'lead_score': {'number_of_courses_viewed': np.float64(-0.004878998354681266),
  'annual_income': np.float64(0.015609546050138912),
  'interaction_count': np.float64(0.00988818249691311),
  'converted': np.float64(0.1936734975869029)},
 'converted': {'number_of_courses_viewed': np.float64

In [56]:
#q3
# Split the data
# Split your data in train/val/test sets with 60%/20%/20% distribution.
# Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
# Make sure that the target value y is not in your dataframe.

# Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).
# Which of these variables has the biggest mutual information score?

# industry
# location
# lead_source
# employment_status

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

# Separate the target from the features so `converted` never reaches the model inputs.
target = 'converted'
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% as the test set, then take 25% of the remaining 80% as
# validation, which gives the 60/20/20 train/val/test split.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

# Mutual information between each categorical feature and the target,
# computed on the training split only and rounded to 2 decimals.
mutualInfoDict = {
    feature: round(mutual_info_score(X_train[feature], y_train), 2)
    for feature in categoricalTypes.index
}

mutualInfoDict
#answer: lead_source, 0.04



{'lead_source': 0.04,
 'industry': 0.01,
 'employment_status': 0.01,
 'location': 0.0}

In [125]:
#q4
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Feature set: every categorical column plus every numerical column except
# the target `converted`.
numericalTypes = numericalTypes[numericalTypes.index != 'converted']
cols = list(categoricalTypes.index) + list(numericalTypes.index)

# Turn the training rows into record dicts and vectorize them into a dense
# matrix (sparse=False). The vectorizer is fitted on the training split only.
vec = DictVectorizer(sparse=False)
train_dict = X_train[cols].to_dict(orient='records')
X_train_vec = vec.fit_transform(train_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_vec, y_train)

# Reuse the fitted vectorizer for validation so both matrices share columns.
X_val_vec = vec.transform(X_val[cols].to_dict(orient='records'))
y_pred = model.predict_proba(X_val_vec)[:,1]

# Threshold the positive-class probability at 0.5; the accuracy is the share
# of validation rows whose thresholded prediction equals the true label.
converted = y_pred > 0.5
(y_val == converted).mean()
##choosing closest answer: 0.74

np.float64(0.6996587030716723)

In [142]:
#q5
def train_without_column(numericalTypes, categoricalTypes, X_train, y_train, X_val, y_val, drop: Optional[str] = None):
    """Train the logistic-regression model with one feature left out and score it.

    Args:
        numericalTypes: Series whose index holds the numerical column names;
            an entry named 'converted' is ignored.
        categoricalTypes: Series whose index holds the categorical column names.
        X_train: training features.
        y_train: training target.
        X_val: validation features.
        y_val: validation target.
        drop: name of the feature to exclude. With ``None`` (the default) no
            column is excluded, which gives the baseline model.

    Returns:
        Validation accuracy: the fraction of validation rows whose predicted
        probability, thresholded at 0.5, equals ``y_val``.
    """
    numericalTypes = numericalTypes[numericalTypes.index != 'converted']
    cols = [
        col
        for col in list(categoricalTypes.index) + list(numericalTypes.index)
        if col != drop
    ]

    # Fit the vectorizer on the training rows only, then reuse it for validation.
    vec = DictVectorizer(sparse=False)
    X_train_vec = vec.fit_transform(X_train[cols].to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_vec, y_train)

    X_val_vec = vec.transform(X_val[cols].to_dict(orient='records'))
    y_pred = model.predict_proba(X_val_vec)[:,1]

    converted = y_pred > 0.5
    return (y_val == converted).mean()

# Baseline accuracy with every feature kept. It is computed by the same
# pipeline instead of being pasted from an earlier cell's output, so it cannot
# go stale when the data or the split changes.
accuracyOriginal = train_without_column(numericalTypes, categoricalTypes, X_train, y_train, X_val, y_val)

dropColumns = ['industry', 'employment_status', 'lead_score']
accuracyDiffAnalyses = {}
for dropColumn in dropColumns:
    acc_drop_col = train_without_column(numericalTypes, categoricalTypes, X_train, y_train, X_val, y_val, dropColumn)
    accuracyDiffAnalyses[dropColumn] = acc_drop_col - accuracyOriginal

# The feature whose removal changes accuracy the least (smallest absolute difference).
min_key = min(accuracyDiffAnalyses, key=lambda k: abs(accuracyDiffAnalyses[k]))
print(min_key, accuracyDiffAnalyses[min_key])

industry 0.0


In [149]:
#q6

# Candidate values for the C parameter of LogisticRegression; one model is
# trained and scored per value further down in this cell.
listC = [0.01, 0.1, 1, 10, 100]
def train_regularized(numericalTypes, categoricalTypes, X_train, y_train, X_val, y_val, C: float):
    """Train the logistic-regression model with a given ``C`` and score it.

    Args:
        numericalTypes: Series whose index holds the numerical column names;
            an entry named 'converted' is ignored.
        categoricalTypes: Series whose index holds the categorical column names.
        X_train: training features.
        y_train: training target.
        X_val: validation features.
        y_val: validation target.
        C: value passed to ``LogisticRegression`` as its ``C`` parameter.

    Returns:
        Validation accuracy: the fraction of validation rows whose predicted
        probability, thresholded at 0.5, equals ``y_val``.
    """
    numericalTypes = numericalTypes[numericalTypes.index != 'converted']
    cols = list(categoricalTypes.index) + list(numericalTypes.index)

    # Fit the vectorizer on the training rows only, then reuse it for validation.
    vec = DictVectorizer(sparse=False)
    X_train_vec = vec.fit_transform(X_train[cols].to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_vec, y_train)

    X_val_vec = vec.transform(X_val[cols].to_dict(orient='records'))
    y_pred = model.predict_proba(X_val_vec)[:,1]

    converted = y_pred > 0.5
    return (y_val == converted).mean()
    
# Validation accuracy for each candidate C, rounded to 3 decimals and keyed by C.
accuracyCDict = {
    C: round(
        train_regularized(numericalTypes, categoricalTypes, X_train, y_train, X_val, y_val, C),
        3,
    )
    for C in listC
}

print(accuracyCDict)

#answer: 0.01

{0.01: np.float64(0.7), 0.1: np.float64(0.7), 1: np.float64(0.7), 10: np.float64(0.7), 100: np.float64(0.7)}
