## Source Code pipeline

#Importing necessary libraries 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import FunctionTransformer
import joblib

In [2]:
def treat_missing_values(data):
    columns_to_keep = ['custAge', 'profession', 'marital', 'schooling', 'default', 'housing',
                       'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
                       'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                       'euribor3m', 'nr.employed', 'pmonths', 'pastEmail', 'responded']

    data = data[columns_to_keep]

    # Calculate the mean for each category in the 'profession' column
    mean_by_profession = train_data.groupby('profession')['custAge'].mean().round()

    # Impute mean values for null categories in the 'profession' column using lambda function
    data.loc[data['custAge'].isnull(), 'custAge'] = data.apply(lambda row: mean_by_profession.get(row['profession'], row['custAge']) if pd.isna(row['custAge']) else row['custAge'], axis=1)

    # Imputation of missing values in education based on profession
    mapping = {
    'blue-collar' : 'basic.4y',
    'blue-collar' : 'basic.6y',
    'blue-collar' : 'basic.9y',
    'self-employed': 'illiterate',
    'technician'   : 'professional.course',
    'admin.'        : 'university.degree',
    'services'      : 'high.school',
    'management'    : 'university.degree',
    'retired'       : 'unknown',
    'entrepreneur'  : 'university.degree'
             }

     # Using fillna method to fill nulll values by mapping to above dictionary 

    train_data['schooling'].fillna(train_data['profession'].map(mapping), inplace=True)
    
    train_data['schooling'].fillna('unknown', inplace=True)
    
    import numpy as np

    # Define the list of days
    days_list = ['mon', 'tue', 'wed', 'thu', 'fri']

    # Replace null values in 'day_of_week' column with random choices
    train_data['day_of_week'] = train_data['day_of_week'].fillna(np.random.choice(days_list))

    
    
    # Combining 'unknown' category with 'single' category
    train_data['marital'] = train_data['marital'].replace('unknown', 'single')
    
    
    
     # Drop remaining missing values
    data = data.dropna()

    return data


In [3]:
def label_encoding(data):
    # pdays
    conditions = [
        (data['pdays'] == 999),
        (data['pdays'] < 10),
        (data['pdays'] >= 10)
    ]

    choices = ['first visit', 'less than 10 days', 'greater than 10 days']

    # Create the 'pduration' column based on conditions
    data['pduration'] = np.select(conditions, choices, default='unknown')

    # pmonths
    conditions = [
        (data['pmonths'] == 999),
        (data['pmonths'] <= 0.3),
        (data['pmonths'] > 0.3)
    ]

    choices = ['first visit', 'less than 2 months', 'greater than 2 months']

    # Create the 'pduration_m' column based on conditions
    data['pduration_m'] = np.select(conditions, choices, default='unknown')

    return data


In [4]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def feature_transformation(data):
    # Dropping target and unnecessary columns
    X = data.drop(['responded', 'pdays', 'pmonths'], axis=1)
    y = data['responded']

    # One-hot encode categorical columns
    X_encoded = pd.get_dummies(X, columns=['loan', 'marital', 'schooling', 'default', 'housing', 'day_of_week',
                                           'poutcome', 'pduration', 'pduration_m', 'profession', 'month', 'contact'],
                               drop_first=True)

    # Continuous columns for normalization
    continuous_columns = ['custAge', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                          'euribor3m', 'nr.employed', 'pastEmail']

    # Extracting the continuous columns from X_encoded
    X_continuous = X_encoded[continuous_columns]

    # StandardScaler
    scaler = StandardScaler()

    # Fit and transform
    X_continuous_normalized = scaler.fit_transform(X_continuous)

    # Replacing the original continuous columns in X_encoded with the normalized ones
    X_encoded[continuous_columns] = X_continuous_normalized

    return X_encoded, y


In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.combine import SMOTEENN
import numpy as np

def train_propensify_model(X_encoded, y):
    # Set a random seed for reproducibility
    np.random.seed(42)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    # Create a Random Forest classifier
    rf_classifier = RandomForestClassifier(random_state=42)

    # Create a Support Vector Machine (SVM) classifier with RBF kernel
    svm_classifier = SVC(kernel='rbf', probability=True, random_state=42)

    # Ensemble the classifiers using a VotingClassifier
    ensemble_classifier = VotingClassifier(estimators=[
        ('rf', rf_classifier),
        ('svm', svm_classifier)
    ], voting='hard')  # 'hard' for probability voting

    # Define the preprocessing steps and classifiers for the pipeline
    preprocessing_steps = [('smoteenn', SMOTEENN(random_state=42, sampling_strategy=0.5)),
                           ('ensemble_classifier', ensemble_classifier)]
    pipeline = ImbPipeline(preprocessing_steps)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'smoteenn__sampling_strategy': [0.5],
        'ensemble_classifier__voting': ['hard'],
        'ensemble_classifier__rf__n_estimators': [50],
        'ensemble_classifier__rf__max_depth': [None, 2],
        'ensemble_classifier__rf__min_samples_split': [2, 3],
    }

    # Create GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

    # Fit GridSearchCV on the training data
    grid_search.fit(X_train, y_train)

    return grid_search


In [6]:
import pandas as pd

# Load training and testing datasets
train_data = pd.read_excel(r"C:\\Users\\veenu\\Downloads\\Propensify\\train.xlsx")
test_data = pd.read_excel(r"C:\\Users\\veenu\\Downloads\\Propensify\\test.xlsx")

# New column 'responded' to the test data and assigning a value ('yes' or 'no')
test_data['responded'] = 'yes'  

In [7]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from joblib import dump

from sklearn.preprocessing import FunctionTransformer

# preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('missing_values', FunctionTransformer(func=treat_missing_values)),
    ('label_encoding', FunctionTransformer(func=label_encoding)),
    ('feature_transformation', FunctionTransformer(func=feature_transformation)),
])

# Fitting the pipeline on the training data
X_train_transformed, y_train = preprocessing_pipeline.fit_transform(train_data)

# Training the model 
trained_model = train_propensify_model(X_train_transformed, y_train)

# Saving pipeline
joblib.dump(preprocessing_pipeline, 'preprocessing_pipeline.joblib')

# Saving the trained model
joblib.dump(trained_model, 'propensify_model.joblib')


['propensify_model.joblib']

In [8]:

loaded_model = joblib.load('propensify_model.joblib')

preprocessing_pipeline = joblib.load('preprocessing_pipeline.joblib')

In [9]:
X_test_transformed, _ = preprocessing_pipeline.transform(test_data)

In [10]:
predictions = loaded_model.predict(X_test_transformed)

In [11]:
#column 'Predictions' to the preprocessed test data
X_test_transformed['Predicted_Response'] = predictions

#DataFrame with predictions
X_test_transformed

Unnamed: 0,custAge,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,pastEmail,loan_unknown,...,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,contact_telephone,Predicted_Response
1,-0.525480,-0.202662,1.665493,-2.211954,-2.066289,2.296557,-1.633517,-2.064448,1.312616,False,...,False,False,False,False,False,False,False,True,False,yes
2,1.051504,-0.559551,1.665493,-1.195537,-1.179859,-1.240710,-1.322916,-0.936747,1.312616,False,...,False,False,False,False,True,False,False,False,False,no
3,-1.051141,-0.559551,-0.350019,0.837298,-0.229619,0.937729,0.772915,0.846016,-0.277382,False,...,False,False,False,False,False,False,False,False,False,no
4,-0.104951,-0.559551,-0.350019,-0.115593,-0.650415,-0.334824,0.305574,0.399634,-0.277382,False,...,False,False,False,False,False,True,False,False,False,no
7,-1.051141,-0.559551,-0.350019,0.837298,0.587829,-0.485805,0.772339,0.846016,-0.277382,False,...,False,True,False,False,False,False,False,False,True,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32943,-0.420348,-0.202662,-0.350019,-1.195537,-0.865986,-1.434828,-1.250309,-0.936747,-0.277382,False,...,False,False,False,False,False,False,False,False,False,yes
32944,-0.840877,-0.202662,-0.350019,0.837298,-0.229619,0.937729,0.774068,0.846016,-0.277382,False,...,False,False,False,False,False,False,False,False,False,no
32947,-0.840877,-0.202662,-0.350019,-1.195537,-1.179859,-1.240710,-1.339051,-0.936747,-0.277382,False,...,False,False,False,False,True,False,False,False,False,no
32948,-0.840877,-0.559551,-0.350019,0.837298,1.531170,-0.291687,0.770034,0.846016,-0.277382,False,...,False,False,True,False,False,False,False,False,True,no


In [12]:
X_test_transformed['Predicted_Response'].value_counts()

Predicted_Response
no     17422
yes     3423
Name: count, dtype: int64

In [13]:
csv_file_path = 'test_with.predictions.csv'
X_test_transformed.to_csv(csv_file_path, index=False)

In [14]:
excel_file_path = 'test_with.predictions.xlsx'
X_test_transformed.to_excel(excel_file_path, index=False)