## Gathered Notebook

This notebook was generated by the Gather Extension. The intent is that it contains only the code and cells required to produce the same results as the cell originally selected for gathering. Please note that the Python analysis is quite conservative, so if it is unsure whether a line of code is necessary for execution, it will err on the side of including it.

**Please let us know if you are satisfied with what was gathered [here](https://aka.ms/gatherfeedback).**

Thanks

In [107]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)

import warnings
warnings.filterwarnings('ignore')

In [108]:
lead_score_df = pd.read_csv('../Leads.csv')

In [109]:
!pwd

/home/vk/Documents/00_my_tech_stuffs/upgrad_ds_main_program/Course-5-Machine-Learning/10-Lead-Scoring-Case-Study/others


In [110]:
def classify_feature_dtype(df, cols):
    d_categories = {'int_cat': [], "float_ts":[] }
    for col in cols:
        if (len(df[col].unique()) < 20):
            d_categories['int_cat'].append(col)
        else:
            if not isinstance(df[col][df[col].notna()].unique()[0], str):
                d_categories['float_ts'].append(col)
            else:
                d_categories['int_cat'].append(col)
    return d_categories
# Print all statistical information for a given set of columns
def check_cols_null_pct(df):
    df_non_na = df.count() / len(df)  # Ratio of non null values
    df_na_pct = (1 - df_non_na) * 100 # Find the Percentage of null values
    return df_na_pct.sort_values(ascending=False) # Sort the resulting values in descending order
# Generates charts based on the data type of the cols, as part of the univariate analysis 
# it takes dataframe, columns, train data 0,1, and feature type as args.
def univariate_plots(df, cols, target=None, ftype=None, l_dict = None):
    for col in cols:
        #generate plots and graphs for category type. (generates piechart, countplot, boxplot / if training data is provided it generates bar chart instead)
        if ftype == "categorical":
            fig, axs = plt.subplots(1, 3, figsize=(20, 6))
 
            col_idx = 0
            axs[col_idx].pie(x=df[col].value_counts().head(12), labels=df[col].value_counts().head(12).index.str[:10], autopct="%1.1f%%", 
                    radius=1, textprops={"fontsize": 10, "color": "Black"}, startangle=90, rotatelabels=False, )
            axs[col_idx].set_title("PieChart of {0}".format(col), y=1); plt.xticks(rotation=45); plt.ylabel("Percentage")
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            
            col_idx += 1
            sns.countplot(data=df, y=col, order=df[col].value_counts().head(15).index, palette="viridis",  ax=axs[col_idx])
            if (l_dict is not None) and (l_dict.get(col) is not None):
                axs[col_idx].legend([ f'{k} - {v}' for k,v in l_dict[col].items()])
            axs[col_idx].set_title("Countplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col); plt.ylabel("Count")
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            col_idx += 1
            ax = sns.barplot(data=df, x=df[col].str[:10], y=target, order=df[col].value_counts().index.str[:10], palette="viridis",  ax=axs[col_idx], errwidth=0)
            for i in ax.containers:
                ax.bar_label(i,)
            axs[col_idx].set_title('Barplot against target'); plt.xticks(rotation=90); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            plt.suptitle("Univariate analysis of {0}".format(col), fontsize=12, y=0.95)
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            plt.show();
            plt.clf()
        #generate plots and graphs for numerical types. (generates boxplot, histplot, kdeplot, scatterplot)
        elif ftype == "non_categorical":        
            fig, axs = plt.subplots(1, 4, figsize=(20, 6))
            
            col_idx = 0
            
            sns.boxplot(data=df, y=col, palette="viridis", flierprops=dict(marker="o", markersize=6, markerfacecolor="red", markeredgecolor="black"),
                        medianprops=dict(linestyle="-", linewidth=3, color="#FF9900"), whiskerprops=dict(linestyle="-", linewidth=2, color="black"),
                        capprops=dict(linestyle="-", linewidth=2, color="black"), ax=axs[col_idx])
            axs[col_idx].set_title("Boxplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            
            col_idx += 1
            axs[col_idx].hist(data=df, x=col, label=col)
            axs[col_idx].set_title("Histogram of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            
            col_idx += 1
            sns.kdeplot(df[col], shade=True, ax=axs[col_idx])
            axs[col_idx].set_title("KDE plot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            
            col_idx += 1
            sns.scatterplot(df[col], ax=axs[col_idx])
            axs[col_idx].set_title("Scatterplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            plt.suptitle("Univariate analysis of {0}".format(col), fontsize=12, y=0.95)
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            plt.show()
            plt.clf()
# Perform Outlier analysis on the given dataframe.
# Find Lower threshold, Upper threshold and IQR values. 
# Return the Result as a dataframe. 
# find_outlier = True argument: restricts the output df to outlier columns. whereas find_outlier = False: returns results for all columns
def get_extremeval_threshld(df, find_outlier=False):
    outlier_df = pd.DataFrame(columns=[i for i in df.columns if find_outlier == True], data=None)
    
    for col in df.columns:
        thirdq, firstq = df[col].quantile(0.75), df[col].quantile(0.25)
        iqr = 1.5 * (thirdq - firstq)
        extvalhigh, extvallow = iqr + thirdq, firstq - iqr
        
        if find_outlier == True:
            dfout = df.loc[(df[col] > extvalhigh) | (df[col] < extvallow)]
            dfout = dfout.assign(name=col, thresh_low=extvallow, thresh_high=extvalhigh)
        else:
            dfout = pd.DataFrame([[col, extvallow, extvalhigh]], columns=['name', 'thresh_low', 'thresh_high'])
            
        outlier_df = pd.concat([outlier_df, dfout])
    # outlier_df = outlier_df.reset_index(drop=True)
    outlier_df = outlier_df.set_index('name',drop=True)
    return outlier_df

In [111]:
lead_score_df = lead_score_df.drop(columns=['Prospect ID', 'I agree to pay the amount through cheque', 'Last Notable Activity'])

In [112]:
lead_score_df = lead_score_df.rename(columns={'Total Time Spent on Website':'ttime_on_site', 'Page Views Per Visit':'pg_view_pv', 'How did you hear about X Education':'info_abt_X_Edu', 'What is your current occupation':'curr_occupation',
    'What matters most to you in choosing a course':'reason_behind_course', 'Receive More Updates About Our Courses':'more_course_updates', 'Update me on Supply Chain Content':'supply_chain_info', 'Get updates on DM Content':'get_dm',
    'Asymmetrique Activity Index':'asym_activ_idx', 'Asymmetrique Profile Index':'asym_prof_idx', 'Asymmetrique Activity Score':'asym_activ_score', 'Asymmetrique Profile Score':'asym_prof_score',
    'A free copy of Mastering The Interview':'avail_free_copy'})

In [113]:
lead_score_df.columns = lead_score_df.columns.str.replace(pat=' ',repl='_', regex=True)
lead_score_df.columns = lead_score_df.columns.str.lower()

In [114]:
lead_score_df = lead_score_df.replace(to_replace=['select','Select'], value=np.nan)

In [115]:
from fast_ml.feature_selection import get_constant_features
constant_features = get_constant_features(lead_score_df)

In [116]:
lead_score_df = lead_score_df.drop(['magazine', 'more_course_updates', 'supply_chain_info', 'get_dm', 'x_education_forums', 
                                    'newspaper', 'do_not_call', 'newspaper_article', 'digital_advertisement', 'through_recommendations', 'search'], axis=1)

In [117]:
obj_cols = lead_score_df.select_dtypes(include='object').columns
lead_score_df[obj_cols] = lead_score_df[obj_cols].astype(dtype='category')

In [118]:
null_pct = check_cols_null_pct(lead_score_df)
lead_score_df = lead_score_df.drop(null_pct[null_pct > 40].index, axis=1)

In [119]:
for i in lead_score_df.select_dtypes(include='category'):
    temp = lead_score_df[i].value_counts(normalize=True, ascending=False) * 100
    if temp.iloc[0] > 50:
        lead_score_df[i] = lead_score_df[i].fillna(temp.index[0])

In [120]:
dtype_dict = classify_feature_dtype(lead_score_df, lead_score_df.columns )
# univariate_plots(lead_score_df, dtype_dict['float_ts'], ftype='non_categorical', target='converted')

In [121]:
cols = dtype_dict['int_cat'].copy()
cols.remove('converted')
# univariate_plots(lead_score_df, cols, ftype='categorical', target='converted')

In [122]:
# sns.pairplot(lead_score_df)

In [123]:
ex_val_df = get_extremeval_threshld(df=lead_score_df.select_dtypes(exclude=['category','object']) )

In [124]:
lower_cutoff = ex_val_df.loc['pg_view_pv','thresh_low']
lead_score_df['pg_view_pv'] = np.where((lead_score_df['pg_view_pv'] < lower_cutoff), lower_cutoff, lead_score_df['pg_view_pv'])
upper_cutoff = ex_val_df.loc['pg_view_pv','thresh_high']
lead_score_df['pg_view_pv'] = np.where((lead_score_df['pg_view_pv'] > upper_cutoff), upper_cutoff, lead_score_df['pg_view_pv'])

lower_cutoff = ex_val_df.loc['totalvisits','thresh_low']
lead_score_df['totalvisits'] = np.where((lead_score_df['totalvisits'] < lower_cutoff), lower_cutoff, lead_score_df['totalvisits'])
upper_cutoff = ex_val_df.loc['totalvisits','thresh_high']
lead_score_df['totalvisits'] = np.where((lead_score_df['totalvisits'] > upper_cutoff), upper_cutoff, lead_score_df['totalvisits'])

lower_cutoff = ex_val_df.loc['ttime_on_site','thresh_low']
lead_score_df['ttime_on_site'] = np.where((lead_score_df['ttime_on_site'] < lower_cutoff), lower_cutoff, lead_score_df['ttime_on_site'])
upper_cutoff = ex_val_df.loc['ttime_on_site','thresh_high']
lead_score_df['ttime_on_site'] = np.where((lead_score_df['ttime_on_site'] > upper_cutoff), upper_cutoff, lead_score_df['ttime_on_site'])

In [125]:
lead_score_df['totalvisits'] = lead_score_df['totalvisits'].replace(to_replace=np.nan, value=lead_score_df['totalvisits'].mean())
lead_score_df['pg_view_pv'] = lead_score_df['pg_view_pv'].replace(to_replace=np.nan, value=lead_score_df['pg_view_pv'].mean())

In [126]:
lead_score_df = lead_score_df.replace(to_replace=['Yes', 'No'], value=[1, 0])

In [127]:
lead_score_df.isna().sum().sort_values(ascending=False)

specialization          3380
tags                    3353
last_activity            103
lead_source               36
lead_number                0
lead_origin                0
do_not_email               0
converted                  0
totalvisits                0
ttime_on_site              0
pg_view_pv                 0
country                    0
curr_occupation            0
reason_behind_course       0
city                       0
avail_free_copy            0
dtype: int64

In [128]:
lead_score_df = lead_score_df.drop(['specialization','tags'], axis=1)

In [129]:
lead_score_df.isna().sum().sort_values(ascending=True)

lead_number               0
lead_origin               0
do_not_email              0
converted                 0
totalvisits               0
ttime_on_site             0
pg_view_pv                0
country                   0
curr_occupation           0
reason_behind_course      0
city                      0
avail_free_copy           0
lead_source              36
last_activity           103
dtype: int64

In [130]:
lead_score_df.last_activity.value_counts(normalize=True)*100

last_activity
Email Opened                    37.616285
SMS Sent                        30.042684
Olark Chat Conversation         10.649010
Page Visited on Website          7.004487
Converted to Lead                4.684251
Email Bounced                    3.567911
Email Link Clicked               2.922185
Form Submitted on Website        1.269563
Unreachable                      1.017840
Unsubscribed                     0.667615
Had a Phone Conversation         0.328335
Approached upfront               0.098501
View in browser link Clicked     0.065667
Email Received                   0.021889
Email Marked Spam                0.021889
Resubscribed to emails           0.010945
Visited Booth in Tradeshow       0.010945
Name: proportion, dtype: float64

In [131]:
lead_score_df.last_activity.mode()[0]

'Email Opened'

In [132]:
adf = lead_score_df.copy()

In [133]:
adf.last_activity = lead_score_df.last_activity.fillna(lead_score_df.last_activity.mode()[0])
adf.last_activity.value_counts(normalize=True) * 100

last_activity
Email Opened                    38.311688
SMS Sent                        29.707792
Olark Chat Conversation         10.530303
Page Visited on Website          6.926407
Converted to Lead                4.632035
Email Bounced                    3.528139
Email Link Clicked               2.889610
Form Submitted on Website        1.255411
Unreachable                      1.006494
Unsubscribed                     0.660173
Had a Phone Conversation         0.324675
Approached upfront               0.097403
View in browser link Clicked     0.064935
Email Received                   0.021645
Email Marked Spam                0.021645
Resubscribed to emails           0.010823
Visited Booth in Tradeshow       0.010823
Name: proportion, dtype: float64

In [134]:
adf.lead_source = lead_score_df.lead_source.fillna(lead_score_df.lead_source.mode()[0])
adf.lead_source.value_counts(normalize=True) * 100

lead_source
Google               31.428571
Direct Traffic       27.521645
Olark Chat           18.993506
Organic Search       12.489177
Reference             5.779221
Welingak Website      1.536797
Referral Sites        1.352814
Facebook              0.595238
bing                  0.064935
google                0.054113
Click2call            0.043290
Press_Release         0.021645
Social Media          0.021645
Live Chat             0.021645
WeLearn               0.010823
Pay per Click Ads     0.010823
NC_EDM                0.010823
blog                  0.010823
testone               0.010823
welearnblog_Home      0.010823
youtubechannel        0.010823
Name: proportion, dtype: float64

In [135]:
adf.dtypes

lead_number                int64
lead_origin             category
lead_source             category
do_not_email            category
converted                  int64
totalvisits              float64
ttime_on_site            float64
pg_view_pv               float64
last_activity           category
country                 category
curr_occupation         category
reason_behind_course    category
city                    category
avail_free_copy         category
dtype: object

In [136]:
lsdf = pd.get_dummies(lead_score_df, columns=lead_score_df.select_dtypes('category').columns, drop_first=True, dtype=float)

In [137]:
lsdf.head()

Unnamed: 0,lead_number,converted,totalvisits,ttime_on_site,pg_view_pv,lead_origin_Landing Page Submission,lead_origin_Lead Add Form,lead_origin_Lead Import,lead_origin_Quick Add Form,lead_source_Direct Traffic,...,curr_occupation_Unemployed,curr_occupation_Working Professional,reason_behind_course_Flexibility & Convenience,reason_behind_course_Other,city_Other Cities,city_Other Cities of Maharashtra,city_Other Metro Cities,city_Thane & Outskirts,city_Tier II Cities,avail_free_copy_1
0,660737,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,660728,0,5.0,674.0,2.5,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,660727,1,2.0,1532.0,2.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,660719,0,1.0,305.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,660681,1,2.0,1428.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# - 

## --

In [138]:
err

NameError: name 'err' is not defined

## --

In [67]:
X = lsdf.drop(['converted'], axis=1)
y = lsdf['converted']

In [68]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=100)

In [None]:
from sklearn.discriminant_analysis import StandardScaler

to_scale = ['totalvisits', 'ttime_on_site', 'pg_view_pv', 'lead_origin_Landing Page Submission', 'lead_origin_Lead Add Form', 'lead_origin_Lead Import',
       'lead_origin_Quick Add Form', 'lead_source_Direct Traffic', 'lead_source_Facebook', 'lead_source_Google', 'lead_source_Live Chat', 'lead_source_NC_EDM', 
       'lead_source_Olark Chat', 'lead_source_Organic Search', 'lead_source_Pay per Click Ads', 'lead_source_Press_Release', 'lead_source_Reference',
       'lead_source_Referral Sites', 'lead_source_Social Media', 'lead_source_WeLearn', 'lead_source_Welingak Website', 'lead_source_bing', 'lead_source_blog', 
       'lead_source_google', 'lead_source_testone', 'lead_source_welearnblog_Home', 'lead_source_youtubechannel', 'do_not_email_1', 'last_activity_Converted to Lead', 
       'last_activity_Email Bounced', 'last_activity_Email Link Clicked', 'last_activity_Email Marked Spam', 'last_activity_Email Opened', 'last_activity_Email Received',
       'last_activity_Form Submitted on Website', 'last_activity_Had a Phone Conversation', 'last_activity_Olark Chat Conversation', 'last_activity_Page Visited on Website',
       'last_activity_Resubscribed to emails', 'last_activity_SMS Sent', 'last_activity_Unreachable', 'last_activity_Unsubscribed', 'last_activity_View in browser link Clicked', 
       'last_activity_Visited Booth in Tradeshow', 'country_Australia', 'country_Bahrain', 'country_Bangladesh', 'country_Belgium','country_Canada', 'country_China', 
       'country_Denmark', 'country_France', 'country_Germany', 'country_Ghana', 'country_Hong Kong', 'country_India', 'country_Indonesia', 'country_Italy', 'country_Kenya',
       'country_Kuwait', 'country_Liberia', 'country_Malaysia', 'country_Netherlands', 'country_Nigeria', 'country_Oman', 'country_Philippines', 'country_Qatar', 
       'country_Russia', 'country_Saudi Arabia', 'country_Singapore', 'country_South Africa', 'country_Sri Lanka', 'country_Sweden', 'country_Switzerland', 'country_Tanzania', 
       'country_Uganda', 'country_United Arab Emirates', 'country_United Kingdom', 'country_United States', 'country_Vietnam', 'country_unknown', 'curr_occupation_Housewife', 
       'curr_occupation_Other', 'curr_occupation_Student', 'curr_occupation_Unemployed', 'curr_occupation_Working Professional', 'reason_behind_course_Flexibility & Convenience', 
       'reason_behind_course_Other', 'city_Other Cities', 'city_Other Cities of Maharashtra', 'city_Other Metro Cities', 'city_Thane & Outskirts', 'city_Tier II Cities', 
       'avail_free_copy_1']

scaler = StandardScaler()
X_train[to_scale] = scaler.fit_transform(X_train[to_scale],y_train)
X_train.head()

In [70]:
# We create custom functions for model veiling since iteration we reuse certain functions again and again
# Train and predict function trains the model and predicts on the same data and returns the model its probability and predicted values based on cutoff
# The matrix function returns confusion matrix and accuracy score
# The vif function returns the vif score for the features
import statsmodels.api as sm 
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, recall_score,precision_score, roc_auc_score, confusion_matrix, f1_score, roc_curve, precision_recall_curve


def logreg_train_pred_fn(fX_train, fy_train, fcol, fcutoff):
    fX_train_sm = sm.add_constant(fX_train[fcol])
    flogm = sm.GLM(fy_train, fX_train_sm, family = sm.families.Binomial())
    fres = flogm.fit()
    fy_train_pred = fres.predict(fX_train_sm)
    fy_train_pred = fy_train_pred.values.reshape(-1)
    fy_train_pred_final = pd.DataFrame({'Converted':fy_train.values, 'Conv_Prob':fy_train_pred})
    fy_train_pred_final['ID'] = fy_train.index
    fy_train_pred_final['predicted'] = fy_train_pred_final.Conv_Prob.map(lambda x: 1 if x > fcutoff else 0)
    return fres, fy_train_pred,fy_train_pred_final

def logreg_metrics_fn(fy_train_pred_final):
    fconfusion = confusion_matrix(fy_train_pred_final.Converted, fy_train_pred_final.predicted )
    faccuracy = accuracy_score(fy_train_pred_final.Converted, fy_train_pred_final.predicted)
    return fconfusion, faccuracy
    
def logreg_VIF_score_fn(fX_train, fcol):
    fvif = pd.DataFrame()
    fvif['Features'] = fX_train[fcol].columns
    fvif['VIF'] = [variance_inflation_factor(fX_train[fcol].values, i) for i in range(fX_train[fcol].shape[1])]
    fvif['VIF'] = round(fvif['VIF'], 2)
    fvif = fvif.sort_values(by = "VIF", ascending = False)
    return fvif

In [None]:
# Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

logreg = LogisticRegression()

rfe = RFE(estimator=logreg, n_features_to_select=15)             # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

col = X_train.columns[rfe.support_]
X_train.columns[~rfe.support_]

# Now we perform model iteration as many times as possible till we get an optimum result

In [None]:
cutoff = 0.5
res, y_train_pred,y_train_pred_final = logreg_train_pred_fn(X_train, y_train, col, cutoff)
confusion, accuracy = logreg_metrics_fn(y_train_pred_final)
vif = logreg_VIF_score_fn(X_train, col)

print('Model Summary:')          # Model Summary:
res.summary()
# print('\nY_Predicted Values:')   # Y_Predicted Values:
# y_train_pred
# print('\nY_Predicted Cutoff:')   # Y_Predicted Cutoff:
# y_train_pred_final
print('\nVIF Score:')            # VIF Score:
vif
# print('\nConfusion Matrix:')     # Confusion Matrix: 
# confusion
# print(f'\nAccuracy Score: {accuracy}\n')       # Accuracy Score:

In [None]:
col = col.drop('lead_source_Reference', 1)
col

In [None]:
cutoff = 0.5
res, y_train_pred,y_train_pred_final = logreg_train_pred_fn(X_train, y_train, col, cutoff)
confusion, accuracy = logreg_metrics_fn(y_train_pred_final)
vif = logreg_VIF_score_fn(X_train, col)

print('Model Summary:')          # Model Summary:
res.summary()
print('\nY_Predicted Values:')   # Y_Predicted Values:
y_train_pred
print('\nY_Predicted Cutoff:')   # Y_Predicted Cutoff:
y_train_pred_final
print('\nVIF Score:')            # VIF Score:
vif
print('\nConfusion Matrix:')     # Confusion Matrix: 
confusion
print(f'\nAccuracy Score: {accuracy}\n')       # Accuracy Score:

## --

In [None]:
err

## --

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('spec_tag_analysis.csv')
df

In [3]:
df = df.drop(['Unnamed: 0', 'lead_number'],axis=1)

In [None]:
df.isna().sum().sort_values(ascending=False)

In [5]:
dff = df[df['specialization'].isna()]

In [None]:
type(dff)

In [None]:
['size', 'width', 'color', 'tickdir', 'pad', 'labelsize', 'labelcolor', 'labelfontfamily', 'zorder', 'gridOn', 'tick1On', 'tick2On', 'label1On', 'label2On', 'length', 
 'direction', 'left', 'bottom', 'right', 'top', 'labelleft', 'labelbottom', 'labelright', 'labeltop', 'labelrotation', 'grid_agg_filter', 'grid_alpha', 'grid_animated',
 'grid_antialiased', 'grid_clip_box', 'grid_clip_on', 'grid_clip_path', 'grid_color', 'grid_dash_capstyle', 'grid_dash_joinstyle', 'grid_dashes', 'grid_data', 
 'grid_drawstyle', 'grid_figure', 'grid_fillstyle', 'grid_gapcolor', 'grid_gid', 'grid_in_layout', 'grid_label', 'grid_linestyle', 'grid_linewidth', 'grid_marker', 
 'grid_markeredgecolor','grid_markeredgewidth','grid_markerfacecolor','grid_markerfacecoloralt','grid_markersize','grid_markevery','grid_mouseover','grid_path_effects', 
 'grid_picker', 'grid_pickradius', 'grid_rasterized', 'grid_sketch_params', 'grid_snap', 'grid_solid_capstyle', 'grid_solid_joinstyle', 'grid_transform', 'grid_url', 
 'grid_visible', 'grid_xdata', 'grid_ydata', 'grid_zorder', 'grid_aa', 'grid_c', 'grid_ds', 'grid_ls', 'grid_lw', 'grid_mec', 'grid_mew', 'grid_mfc', 'grid_mfcalt', 
 'grid_ms']

In [None]:
dff.dtypes

In [None]:
# import warnings
# import matplotlib 
# matplotlib.rcParams['ytick.labelsize'] = 8
# matplotlib.rcParams['ytick.major.size'] = 2
# warnings.filterwarnings('ignore')
# idx = 1
# plt.figure(figsize=(25,6))
# for i in dff.columns.difference(['specialization']):
#     for j in dff.columns.difference(['specialization']):
#         if i != j :
#             ax = plt.subplot(1, 4, idx)
            
#             x_f = lead_score_df[x_col]
#             y_f = lead_score_df[y_col]
            
#             if isinstance(x_f,(object,'category')):
#                 x_f = x_f.astype('str',copy=True).str.slice(0,10)
#             if isinstance(y_f,(object,'category')):
#                 y_f = y_f.astype('str',copy=True).str.slice(0,10)
                
#             sns.boxplot(x = xd, y= yd, ax=ax, palette='tab10')
#             ax.tick_params(axis='x', rotation=90)
#             ax.yaxis.set_label_coords(-0.01, 0.5)
#             # label = f'Converter: {ax.yaxis.converter}\n '
#             # label += f'Locator: {ax.yaxis.get_major_locator()}\n'
#             # label += f'Formatter: {ax.yaxis.get_major_formatter()}\n'
#             # ax.set_xlabel(label)
#             idx += 1  
#             if idx >= 5:
#                 plt.show();
#                 plt.figure(figsize=(25,6))
#                 idx = 1

In [None]:
# idx = 1
# plt.figure(figsize=(25,6))
# for i in dff.columns.difference(['specialization']):
#     for j in dff.columns.difference(['specialization']):
#         if i != j :
            
#             ax = plt.subplot(1, 4, idx)
            
#             xd = dff[i]
#             yd = dff[j]
            
#             if isinstance(xd,(object,'category')):
#                 xd = xd.astype('str',copy=True).str.slice(0,10)
#             if isinstance(xd,(object,'category')):
#                 yd = yd.astype('str',copy=True).str.slice(0,10)
                
#             sns.barplot(x = xd, y= yd, ax=ax, palette='tab10')
#             ax.tick_params(axis='x', rotation=90)
#             ax.yaxis.set_label_coords(-0.01, 0.5)
#             # label = f'Converter: {ax.yaxis.converter}\n '
#             # label += f'Locator: {ax.yaxis.get_major_locator()}\n'
#             # label += f'Formatter: {ax.yaxis.get_major_formatter()}\n'
#             # ax.set_xlabel(label)
#             idx += 1  
#             if idx >= 5:
#                 plt.show();
#                 plt.figure(figsize=(25,6))
#                 idx = 1

In [None]:
# !conda install -c conda-forge ipympl -y 
# matplotlib.rcParams.keys()

In [None]:
dff['tags'].astype('str',copy=True).str.slice(0,10)

In [None]:
(isinstance(dff['tags'],(object)) == isinstance(dff['tags'],(object)))

## --

In [None]:
err

## --

# -

In [139]:
X = lsdf.drop(['lead_number', 'converted'], axis=1)
y = lsdf['converted'].copy()

In [140]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [141]:
X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8, test_size=0.2, random_state=100)

In [142]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train, y_train)

In [143]:
logregm = LogisticRegression()
res = logregm.fit(X_train_scaled,y_train)


In [144]:
coef_df = pd.DataFrame({'features':X.columns, 'coef':res.coef_.reshape(-1), 'acoef':abs(res.coef_.reshape(-1))})
coef_df.sort_values(by='acoef',ascending=False)
abs(res.coef_)

Unnamed: 0,features,coef,acoef
1,ttime_on_site,1.092445,1.092445
39,last_activity_SMS Sent,0.799536,0.799536
4,lead_origin_Lead Add Form,0.672699,0.672699
85,curr_occupation_Working Professional,0.632115,0.632115
7,lead_source_Direct Traffic,-0.390222,0.390222
...,...,...,...
74,country_Tanzania,0.000000,0.000000
24,lead_source_testone,0.000000,0.000000
73,country_Switzerland,0.000000,0.000000
60,country_Liberia,0.000000,0.000000


array([[0.36724692, 1.09244546, 0.30161197, 0.10494229, 0.67269925,
        0.02160534, 0.07037172, 0.39022222, 0.08762475, 0.28905715,
        0.08346572, 0.07298155, 0.12111512, 0.25836865, 0.06815201,
        0.09352831, 0.07371883, 0.11518925, 0.02709831, 0.07207528,
        0.29862403, 0.00241531, 0.07604712, 0.17839088, 0.        ,
        0.07316405, 0.01624745, 0.33485974, 0.13896607, 0.08804369,
        0.00944579, 0.09038475, 0.27544372, 0.08431129, 0.01582999,
        0.14689649, 0.30763572, 0.0614637 , 0.09849606, 0.79953611,
        0.06830651, 0.09874894, 0.0118595 , 0.05364069, 0.0374499 ,
        0.02529952, 0.06249375, 0.06433852, 0.14347048, 0.07883676,
        0.0384465 , 0.00680336, 0.01514969, 0.08937787, 0.00561704,
        0.03020724, 0.07230727, 0.        , 0.0633185 , 0.08545145,
        0.        , 0.05492566, 0.02404168, 0.13550514, 0.00465412,
        0.09041   , 0.2401341 , 0.04469596, 0.05128675, 0.01710416,
        0.00770129, 0.0478338 , 0.00465859, 0.  

In [145]:
y_train_pred = res.predict(X_train_scaled)

In [146]:
y_train_pred

array([0, 0, 0, ..., 0, 0, 0])

In [147]:
y_train_pred.reshape(-1)

array([0, 0, 0, ..., 0, 0, 0])

In [148]:
y_train_pred_final = pd.DataFrame({'Actual':y_train ,'Actual_Prob':y_train_pred })

In [149]:
y_train_pred_final

Unnamed: 0,Actual,Actual_Prob
7263,1,0
6468,0,0
7833,1,0
4461,0,1
8453,0,0
...,...,...
350,1,1
79,1,1
8039,1,0
6936,0,0


In [150]:
y_train_pred_final['predicted'] = y_train_pred_final.Actual_Prob.map(lambda x : 1 if x>0.5 else 0)

In [151]:
y_train_pred_final.reindex(labels=['Actual', 'Actual_Prob', 'predicted'],axis=1)

Unnamed: 0,Actual,Actual_Prob,predicted
7263,1,0,0
6468,0,0,0
7833,1,0,0
4461,0,1,1
8453,0,0,0
...,...,...,...
350,1,1,1
79,1,1,1
8039,1,0,0
6936,0,0,0


In [152]:
confusion_matrix(y_train,y_train_pred_final.predicted)

array([[4035,  528],
       [ 806, 2023]])

In [153]:
accuracy_score(y_train,y_train_pred_final.predicted)

0.8195346320346321

In [154]:
from sklearn.linear_model import LinearRegression


# res = LinearRegression().fit(X_train, y_train)
# res.score(X,y)

scores = cross_val_score(res, X, y, cv=5, scoring='accuracy')
np.sqrt(np.abs(scores))

array([0.88273483, 0.90572972, 0.89006056, 0.90273754, 0.88671044])

In [155]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=6)

In [156]:
X_train_scaled

array([[-1.12819081, -0.88494165, -1.27154449, ..., -0.29748863,
        -0.08892903, -0.67335094],
       [-0.42937356, -0.63220106, -0.14566937, ..., -0.29748863,
        -0.08892903,  1.48510967],
       [-0.42937356, -0.74025682, -0.14566937, ..., -0.29748863,
        -0.08892903,  1.48510967],
       ...,
       [-1.12819081, -0.88494165, -1.27154449, ..., -0.29748863,
        -0.08892903, -0.67335094],
       [-0.42937356, -0.44905571, -0.14566937, ..., -0.29748863,
        -0.08892903, -0.67335094],
       [-1.12819081, -0.88494165, -1.27154449, ..., -0.29748863,
        -0.08892903, -0.67335094]])

In [157]:
for k,(i,j) in enumerate(kf.split(X,y)):
   j.shape

(1540,)

(1540,)

(1540,)

(1540,)

(1540,)

(1540,)

In [163]:
X_train[X_train.index]

KeyError: "None of [Index([7263, 6468, 7833, 4461, 8453, 7094, 7932,  449, 5382, 3768,\n       ...\n       4376,  802, 5646, 7906, 4149,  350,   79, 8039, 6936, 5640],\n      dtype='int64', length=7392)] are in the [columns]"