In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

In [3]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)

In [4]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [5]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
for col in df.select_dtypes(['object']).columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [8]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [9]:
categorical_cols = df.select_dtypes(include='object').columns
categorical_cols

Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')

In [10]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('converted') 
numerical_cols

Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
       'lead_score'],
      dtype='object')

In [11]:
df[categorical_cols] = df[categorical_cols].fillna('NA')
df[categorical_cols].head()

Unnamed: 0,lead_source,industry,employment_status,location
0,paid_ads,,unemployed,south_america
1,social_media,retail,employed,south_america
2,events,healthcare,unemployed,australia
3,paid_ads,retail,,australia
4,referral,education,self_employed,europe


In [12]:
df[numerical_cols] = df[numerical_cols].fillna(0)
df[numerical_cols].head()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94
1,1,46992.0,1,0.8
2,5,78796.0,3,0.69
3,2,83843.0,1,0.87
4,3,85012.0,3,0.62


In [13]:
industry_modes = df['industry'].mode().iloc[0]
industry_modes

'retail'

### Mode for industry is retail

In [14]:
correlation = df[numerical_cols.tolist() + ['converted']].corr()
correlation['converted'].sort_values(ascending=False)

converted                   1.000000
number_of_courses_viewed    0.435914
interaction_count           0.374573
lead_score                  0.193673
annual_income               0.053131
Name: converted, dtype: float64

In [15]:
df['interaction_count']

0       4
1       1
2       3
3       1
4       3
       ..
1457    4
1458    2
1459    3
1460    0
1461    3
Name: interaction_count, Length: 1462, dtype: int64

In [16]:
corr1 = df['interaction_count'].corr(df['lead_score'])
corr1

np.float64(0.009888182496913084)

In [17]:
corr2 = df['number_of_courses_viewed'].corr(df['lead_score'])
corr2

np.float64(-0.004878998354681256)

In [18]:
corr3 = df['number_of_courses_viewed'].corr(df['interaction_count'])
corr3

np.float64(-0.023565222882888044)

In [19]:
corr4 = df['annual_income'].corr(df['interaction_count'])
corr4

np.float64(0.027036472404814348)

In [20]:
max_corr = max(abs(corr1), abs(corr2), abs(corr3), abs(corr4))
max_corr

np.float64(0.027036472404814348)

### Biggest correlation between numerical column pairs is between annual income and interaction count

In [21]:
df_train, df_test_val = train_test_split(df, test_size=0.4, random_state=42 ,stratify=df['converted'])
df_test, df_val = train_test_split(df_test_val, test_size=0.5, random_state=42, stratify=df_test_val['converted'])

In [22]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [23]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,organic_search,healthcare,0,69942.0,student,europe,2,0.50,0
1,referral,retail,2,0.0,student,australia,4,0.14,1
2,social_media,retail,2,79978.0,student,australia,3,0.14,1
3,social_media,technology,0,51283.0,,middle_east,2,0.57,0
4,events,education,2,77612.0,student,south_america,2,0.69,1
...,...,...,...,...,...,...,...,...,...
872,paid_ads,manufacturing,0,62384.0,student,australia,4,0.82,0
873,organic_search,technology,3,69299.0,unemployed,asia,4,0.39,0
874,social_media,other,1,45257.0,employed,south_america,3,0.47,1
875,social_media,technology,0,59878.0,unemployed,europe,4,0.85,0


In [24]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [25]:
X_train= df_train.drop(columns=['converted'])
X_val= df_val.drop(columns=['converted'])
X_test = df_test.drop(columns=['converted'])

In [26]:
categorical_cols = X_train.select_dtypes(include='object').columns
categorical_cols

Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')

In [27]:
X_train_encoded = X_train[categorical_cols].copy()
for col in categorical_cols:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train_encoded[col].astype(str))


In [28]:
X_train_encoded

Unnamed: 0,lead_source,industry,employment_status,location
0,2,3,3,4
1,4,6,3,3
2,5,6,3,3
3,5,7,0,5
4,1,1,3,7
...,...,...,...,...
872,3,4,3,3
873,2,7,4,2
874,5,5,1,7
875,5,7,4,4


In [29]:
mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features=True, random_state=42)

mi_scores_rounded = {col: round(score, 2) for col, score in zip(categorical_cols, mi_scores)}
print("Mutual Information Scores:", mi_scores_rounded)

Mutual Information Scores: {'lead_source': np.float64(0.03), 'industry': np.float64(0.01), 'employment_status': np.float64(0.01), 'location': np.float64(0.0)}


In [30]:
max_mi_variable = max(mi_scores_rounded, key=mi_scores_rounded.get)
max_mi_variable

'lead_source'

### max mi score = Lead_source

In [31]:
categorical_cols = ['industry', 'location', 'lead_source', 'employment_status']

numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()


In [32]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_cols + numerical_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_cols + numerical_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [33]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)


In [34]:
accuracy = accuracy_score(y_val, y_pred)
accuracy = round(accuracy, 2)
accuracy

0.73

### Accuracy = 0.73

In [35]:
features_to_test = ['industry', 'employment_status', 'lead_score']
feature_names = dv.get_feature_names_out()
differences = {}

for feature in features_to_test:
    
    cols = [i for i, f in enumerate(feature_names) if f.startswith(feature + "=") or f == feature]
    
    X_train_red = np.delete(X_train, cols, axis=1)
    X_val_red = np.delete(X_val, cols, axis=1)
    
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_red, y_train)
    acc = accuracy_score(y_val, model.predict(X_val_red))
    
    differences[feature] = accuracy - acc


least_useful = min(differences, key=differences.get)

print("Accuracy differences:", differences)
print("Least useful feature:", least_useful)

Accuracy differences: {'industry': 0.0030375426621159773, 'employment_status': 0.0030375426621159773, 'lead_score': -0.003788395904436914}
Least useful feature: lead_score


### Least useful score = lead_score

In [36]:
C_values = [0.01, 0.1, 1, 10, 100]

In [37]:

results = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    
    results[C] = round(acc, 4)

for C, acc in results.items():
    print(f"C = {C}: Validation Accuracy = {acc}")

C = 0.01: Validation Accuracy = 0.7338
C = 0.1: Validation Accuracy = 0.7304
C = 1: Validation Accuracy = 0.7304
C = 10: Validation Accuracy = 0.7304
C = 100: Validation Accuracy = 0.7304


In [38]:
best_C = max(results, key=results.get)
best_C

0.01

### Best accuracy for c value = 0.01