In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn  as sns 
import plotly.express as px 
import warnings 
warnings.filterwarnings("ignore")

%matplotlib inline


In [3]:
df = pd.read_csv("C:/Users/HP/Datasets/Travel.csv")

In [4]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [5]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [6]:
## Check all categorical features for any disruptency 
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [7]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [8]:
df['TypeofContact'].value_counts()

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64

In [9]:
df['Gender'] = df['Gender'].replace('Fe Male','Female')
df['MaritalStatus'] = df['MaritalStatus'].replace('Single''Unmarried')

In [10]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [11]:
# check Missig Values 
### These are the features with nan valueabs
features_with_na = [features for features in df.columns if df[features].isnull().sum()>=1]

for features in features_with_na:
    print(features,np.round(df[features].isnull().mean()*100,5) ,'% missing values')

Age 4.62357 % missing values
TypeofContact 0.51146 % missing values
DurationOfPitch 5.13502 % missing values
NumberOfFollowups 0.92062 % missing values
PreferredPropertyStar 0.53191 % missing values
NumberOfTrips 2.86416 % missing values
NumberOfChildrenVisiting 1.35025 % missing values
MonthlyIncome 4.76678 % missing values


In [12]:
# Statistics on numerical columns (Null colist)
df[features_with_na].select_dtypes(exclude='object').describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


# Imputing Null Values 


 ## 1.Impute Median value for Age column
 ## 2.Impute Mode for Type of Contract
 ## 3.Impute Median for Duration of Pitch 
 ## 4.Impute Mode for NumberofFoloowup as it is Discrete feature
 ## 5.Impute Mode for PreferredPropertyStar
 ## 6.Impute Median for NumberofTrips 
 ## 7.Impute Mode for NumberofChildrenVisiting 
 ## 8.Impute Median for MonthlyIncome

In [13]:
# Age 
df.Age.fillna(df.Age.median(),inplace=True)

# TypeofContract 

df.TypeofContact.fillna(df.TypeofContact.mode()[0],inplace=True)

# DurationofPitch 
df.DurationOfPitch.fillna(df.DurationOfPitch.median(),inplace=True)

# Number of Followups 
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0],inplace=True)

# PrefferedPropertyStar

df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0],inplace=True)

## Number of Trips 

df.NumberOfTrips.fillna(0,inplace=True)


# Number of children Vistiting 
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0],inplace=True)

# Monthly Income 

df.MonthlyIncome.fillna(df.MonthlyIncome.median(),inplace=True)

## Feature Engineering

### Feature Extraction

In [14]:
df.drop('CustomerID',inplace=True,axis=1)

In [15]:
# create new columns for total people visiting 

df['TotalVisiting'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(['NumberOfChildrenVisiting','NumberOfPersonVisiting'],inplace=True,axis=1)

In [16]:
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalVisiting
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [17]:
## Get all numeric features 
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of numerical features :' , len(num_features))

Num of numerical features : 12


In [18]:
# Number of Categorical features 
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical features :' , len(cat_features))

Num of Categorical features : 6


In [19]:
# number of discrete features 
disc_features = [feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of dicrete features :' , len(disc_features))

Num of dicrete features : 9


In [20]:
## Continous features 

cont_features = [feature for feature in num_features if feature not in disc_features]
print('Num of continous features :' , len(cont_features))

Num of continous features : 3


### Train Test Split and Model Training

In [21]:
from  sklearn.model_selection import train_test_split 
X = df.drop('ProdTaken',axis=1)
Y = df['ProdTaken']

In [22]:
X.head() 

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalVisiting
0,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [23]:
Y.value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [24]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=42,test_size=0.2)
x_train.shape,x_test.shape

((3910, 17), (978, 17))

In [25]:
# Create Column Transformer with 3 types of Trandformers 
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude = 'object').columns 

In [26]:
from sklearn.preprocessing import OneHotEncoder , StandardScaler 
from sklearn.compose import ColumnTransformer 

num_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop = 'first')

preprocessor = ColumnTransformer( [("OneHotEncoder" , oh_transformer,cat_features) , ("StandardScaler",num_transformer,num_features)] )

In [27]:
x_train = preprocessor.fit_transform(x_train)


In [28]:
x_test = preprocessor.transform(x_test)

## Random Forest Classifier 

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,roc_auc_score,roc_curve

In [30]:
models = {
    "Random Forest" : RandomForestClassifier()
}
for i in range(len(list(models))):

    model = list(models.values())[i]
    model.fit(x_train,y_train)

    # Make predictions 
    y_train_pred = model.predict(x_train)
    y_test_predict = model.predict(x_test)



    model_train_accuracy = accuracy_score(y_train,y_train_pred)
    
    model_test_accuracy = accuracy_score(y_test,y_test_predict)
    
    model_train_precision = precision_score(y_train,y_train_pred)
    
    model_test_precision = precision_score(y_test,y_test_predict)
    
    model_train_recall = recall_score(y_train,y_train_pred)
    
    model_test_recall = recall_score(y_test,y_test_predict)
    
    model_train_rocau_score = roc_auc_score(y_train,y_train_pred)
    
    model_test_rocau_score = roc_auc_score(y_test,y_test_predict)

    print(f"Model is {list(models.keys())[i]}")

    print(f"Model Performance for Trainning set")

    print(f"accuracy : {model_train_accuracy} , recall: {model_train_recall} , precision: {model_train_precision} , racou : {model_train_rocau_score}")


    print("-------------------------------------------------------------------------------------------------------------------------------------------")

    print(f"Model Performance for Testing set")

    print(f"accuracy : {model_test_accuracy} , recall: {model_test_recall} , precision: {model_test_precision} , racou : {model_test_rocau_score}")









    


Model is Random Forest
Model Performance for Trainning set
accuracy : 1.0 , recall: 1.0 , precision: 1.0 , racou : 1.0
-------------------------------------------------------------------------------------------------------------------------------------------
Model Performance for Testing set
accuracy : 0.9253578732106339 , recall: 0.6492146596858639 , precision: 0.9538461538461539 , racou : 0.8207953857514453


In [36]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression 

models = {
    "logistic Regression" : LogisticRegression(),
    "Decision Tree" : DecisionTreeClassifier(),
    "Random Forest" : RandomForestClassifier()
}
for i in range(len(list(models))):

    model = list(models.values())[i]
    model.fit(x_train,y_train)

    # Make predictions 
    y_train_pred = model.predict(x_train)
    y_test_predict = model.predict(x_test)



    model_train_accuracy = accuracy_score(y_train,y_train_pred)
    
    model_test_accuracy = accuracy_score(y_test,y_test_predict)
    
    model_train_precision = precision_score(y_train,y_train_pred)
    
    model_test_precision = precision_score(y_test,y_test_predict)
    
    model_train_recall = recall_score(y_train,y_train_pred)
    
    model_test_recall = recall_score(y_test,y_test_predict)
    
    model_train_rocau_score = roc_auc_score(y_train,y_train_pred)
    
    model_test_rocau_score = roc_auc_score(y_test,y_test_predict)

    print(f"Model is {list(models.keys())[i]}")

    print(f"Model Performance for Trainning set")

    print(f"accuracy : {model_train_accuracy} , recall: {model_train_recall} , precision: {model_train_precision} , racou : {model_train_rocau_score}")


    print("-------------------------------------------------------------------------------------------------------------------------------------------")

    print(f"Model Performance for Testing set")

    print(f"accuracy : {model_test_accuracy} , recall: {model_test_recall} , precision: {model_test_precision} , racou : {model_test_rocau_score}")
    print("-------------------------------------------------------------------------------------------------------------------------------------------")










    


Model is logistic Regression
Model Performance for Trainning set
accuracy : 0.8457800511508952 , recall: 0.30727023319615915 , precision: 0.6956521739130435 , racou : 0.6382311555795319
-------------------------------------------------------------------------------------------------------------------------------------------
Model Performance for Testing set
accuracy : 0.83640081799591 , recall: 0.3036649214659686 , precision: 0.6823529411764706 , racou : 0.6346787123212942
-------------------------------------------------------------------------------------------------------------------------------------------
Model is Decision Tree
Model Performance for Trainning set
accuracy : 1.0 , recall: 1.0 , precision: 1.0 , racou : 1.0
-------------------------------------------------------------------------------------------------------------------------------------------
Model Performance for Testing set
accuracy : 0.9100204498977505 , recall: 0.743455497382199 , precision: 0.7845303867403315

In [42]:
# Hyper parameter tuning 

rf_params = { "max_depth" : [5, 8, 15, None, 10],
             "max_features" : [5, 7, "auto", 8], 
             "min_samples_split" : [2, 8, 15, 20],  
             "n_estimators": [100, 200, 500 ,1000]
    
}

In [43]:
## Models Lsit for hyper paramter tuning  
randomcv_models = [
    ("RF",RandomForestClassifier(),rf_params)
    
]

In [44]:
from sklearn.model_selection import RandomizedSearchCV 

model_parm = {}
for name,model,params in randomcv_models: 
    random = RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=100,cv=3,verbose=2,n_jobs=-1)
    random.fit(x_train,y_train)
    model_parm[name] = random.best_params_

for model_name in model_parm: 
    print(f"------------== BEst parameters for {model_name} --------=")
    print(model_parm[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
------------== BEst parameters for RF --------=
{'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 8, 'max_depth': None}
