## Loading the Dataset

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('Train_pjb2QcD.csv')
df.shape

(8844, 19)

In [3]:
df.head()

Unnamed: 0,ID,Office_PIN,Applicant_City_PIN,Applicant_Gender,Applicant_Marital_Status,Applicant_Occupation,Applicant_Qualification,Manager_Joining_Designation,Manager_Current_Designation,Manager_Grade,Manager_Status,Manager_Gender,Manager_Num_Application,Manager_Num_Coded,Manager_Business,Manager_Num_Products,Manager_Business2,Manager_Num_Products2,Business_Sourced
0,FIN1000001,842001,844120,M,M,Others,Graduate,Level 1,Level 2,3.0,Confirmation,M,2.0,1.0,335249.0,28.0,335249.0,28.0,0
1,FIN1000002,842001,844111,M,S,Others,Class XII,Level 1,Level 2,3.0,Confirmation,M,2.0,1.0,335249.0,28.0,335249.0,28.0,1
2,FIN1000003,800001,844101,M,M,Business,Class XII,Level 1,Level 1,2.0,Confirmation,M,0.0,0.0,357184.0,24.0,357184.0,24.0,0
3,FIN1000004,814112,814112,M,S,Salaried,Class XII,Level 1,Level 3,4.0,Confirmation,F,0.0,0.0,318356.0,22.0,318356.0,22.0,0
4,FIN1000005,814112,815351,M,M,Others,Class XII,Level 1,Level 1,2.0,Confirmation,M,2.0,1.0,230402.0,17.0,230402.0,17.0,0


In [4]:
df.columns

Index(['ID', 'Office_PIN', 'Applicant_City_PIN', 'Applicant_Gender',
       'Applicant_Marital_Status', 'Applicant_Occupation',
       'Applicant_Qualification', 'Manager_Joining_Designation',
       'Manager_Current_Designation', 'Manager_Grade', 'Manager_Status',
       'Manager_Gender', 'Manager_Num_Application', 'Manager_Num_Coded',
       'Manager_Business', 'Manager_Num_Products', 'Manager_Business2',
       'Manager_Num_Products2', 'Business_Sourced'],
      dtype='object')

## Imputing Missing Values

In [5]:
df.isnull().sum()

ID                                0
Office_PIN                        0
Applicant_City_PIN                0
Applicant_Gender                 53
Applicant_Marital_Status         59
Applicant_Occupation           1090
Applicant_Qualification          71
Manager_Joining_Designation       0
Manager_Current_Designation       0
Manager_Grade                     0
Manager_Status                    0
Manager_Gender                    0
Manager_Num_Application           0
Manager_Num_Coded                 0
Manager_Business                  0
Manager_Num_Products              0
Manager_Business2                 0
Manager_Num_Products2             0
Business_Sourced                  0
dtype: int64

In [6]:
df[['Applicant_Gender','Applicant_Marital_Status', 'Applicant_Occupation', 'Applicant_Qualification']].head()

Unnamed: 0,Applicant_Gender,Applicant_Marital_Status,Applicant_Occupation,Applicant_Qualification
0,M,M,Others,Graduate
1,M,S,Others,Class XII
2,M,M,Business,Class XII
3,M,S,Salaried,Class XII
4,M,M,Others,Class XII


**1. Missing Values in Applicant Gender**

In [7]:
#checking value_Counts
df['Applicant_Gender'].value_counts()

M    6656
F    2135
Name: Applicant_Gender, dtype: int64

In [8]:
#imputing missing with mode
df['Applicant_Gender'].fillna('M', inplace=True)

**2. Missing Values in Applicant Marital Status**

In [9]:
#checking value_Counts
df['Applicant_Marital_Status'].value_counts()

M    5733
S    3042
W       6
D       4
Name: Applicant_Marital_Status, dtype: int64

In [10]:
#imputing missing with mode

df['Applicant_Marital_Status'].fillna('M', inplace=True)

**3. Missing Values in Applicant Occupation**

In [11]:
#checking value_Counts
df['Applicant_Occupation'].value_counts()

Salaried         3546
Business         2157
Others           1809
Self Employed     146
Student            96
Name: Applicant_Occupation, dtype: int64

In [12]:
#imputing missing with mode

df['Applicant_Occupation'].fillna('Salaried', inplace=True)

**4. Missing Values in Applicant Qualification**

In [13]:
#checking value_Counts
df['Applicant_Qualification'].value_counts()

Class XII                                                           5426
Graduate                                                            2958
Class X                                                              195
Others                                                               116
Masters of Business Administration                                    71
Associate / Fellow of Institute of Chartered Accountans of India       3
Associate/Fellow of Acturial Society of India                          1
Associate/Fellow of Institute of Company Secretories of India          1
Associate/Fellow of Insurance Institute of India                       1
Professional Qualification in Marketing                                1
Name: Applicant_Qualification, dtype: int64

In [14]:
#imputing missing with mode

df['Applicant_Qualification'].fillna('Class XII', inplace=True)

In [15]:
df.isnull().sum()

ID                             0
Office_PIN                     0
Applicant_City_PIN             0
Applicant_Gender               0
Applicant_Marital_Status       0
Applicant_Occupation           0
Applicant_Qualification        0
Manager_Joining_Designation    0
Manager_Current_Designation    0
Manager_Grade                  0
Manager_Status                 0
Manager_Gender                 0
Manager_Num_Application        0
Manager_Num_Coded              0
Manager_Business               0
Manager_Num_Products           0
Manager_Business2              0
Manager_Num_Products2          0
Business_Sourced               0
dtype: int64

## Dealing with Categorical Variables

In [16]:
df.dtypes

ID                              object
Office_PIN                       int64
Applicant_City_PIN               int64
Applicant_Gender                object
Applicant_Marital_Status        object
Applicant_Occupation            object
Applicant_Qualification         object
Manager_Joining_Designation     object
Manager_Current_Designation     object
Manager_Grade                  float64
Manager_Status                  object
Manager_Gender                  object
Manager_Num_Application        float64
Manager_Num_Coded              float64
Manager_Business               float64
Manager_Num_Products           float64
Manager_Business2              float64
Manager_Num_Products2          float64
Business_Sourced                 int64
dtype: object

In [17]:
df.columns

Index(['ID', 'Office_PIN', 'Applicant_City_PIN', 'Applicant_Gender',
       'Applicant_Marital_Status', 'Applicant_Occupation',
       'Applicant_Qualification', 'Manager_Joining_Designation',
       'Manager_Current_Designation', 'Manager_Grade', 'Manager_Status',
       'Manager_Gender', 'Manager_Num_Application', 'Manager_Num_Coded',
       'Manager_Business', 'Manager_Num_Products', 'Manager_Business2',
       'Manager_Num_Products2', 'Business_Sourced'],
      dtype='object')

In [18]:
categorical_cols = ['Applicant_Gender','Applicant_Marital_Status','Applicant_Occupation','Applicant_Qualification',
                   'Manager_Joining_Designation', 'Manager_Current_Designation', 'Manager_Status', 'Manager_Gender']

for i in categorical_cols:
    print('*****', i, '*****')
    print(df[i].value_counts())
    print('')

***** Applicant_Gender *****
M    6709
F    2135
Name: Applicant_Gender, dtype: int64

***** Applicant_Marital_Status *****
M    5792
S    3042
W       6
D       4
Name: Applicant_Marital_Status, dtype: int64

***** Applicant_Occupation *****
Salaried         4636
Business         2157
Others           1809
Self Employed     146
Student            96
Name: Applicant_Occupation, dtype: int64

***** Applicant_Qualification *****
Class XII                                                           5497
Graduate                                                            2958
Class X                                                              195
Others                                                               116
Masters of Business Administration                                    71
Associate / Fellow of Institute of Chartered Accountans of India       3
Associate/Fellow of Acturial Society of India                          1
Associate/Fellow of Institute of Company Secretories of In

In [19]:
df = pd.get_dummies(df)

## Logistic regression

### Train Test split

In [20]:
x = df.drop(['Business_Sourced'],axis=1)
y = df['Business_Sourced']

In [21]:
from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y= train_test_split(x, y, test_size = 0.3, random_state=1)

### Linear regression Model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, auc
from sklearn.metrics import roc_auc_score

In [23]:
logreg = LogisticRegression()
logreg.fit(train_x, train_y)

LogisticRegression()

In [24]:
pred_train = logreg.predict_proba(train_x)
pred_valid = logreg.predict_proba(valid_x)

In [25]:
roc_auc_score(train_y, pred_train[:,1])

0.4723339792143738

In [26]:
roc_auc_score(valid_y, pred_valid[:,1])

0.4697625904030554

### Random Forest Classifier

In [27]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth= 4, random_state=2)

In [28]:
rf.fit(train_x, train_y)

RandomForestClassifier(max_depth=4, random_state=2)

In [29]:
pred_train_rf = rf.predict_proba(train_x)
pred_valid_rf = rf.predict_proba(valid_x)

In [30]:
roc_auc_score(train_y, pred_train_rf[:,1])

0.6219390869687003

In [31]:
roc_auc_score(valid_y, pred_valid_rf[:,1])

0.5707349946164473