In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings 
warnings.filterwarnings('ignore')
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition data. The training rows are shuffled with a fixed seed so that a
# Restart & Run All reproduces the same row order (the original index labels are kept).
train_df = pd.read_csv("/kaggle/input/playground-series-s4e1/train.csv")
train_df = train_df.sample(frac=1, random_state=42)  # seeded shuffle of the examples
test_df = pd.read_csv("/kaggle/input/playground-series-s4e1/test.csv")

In [None]:
# Column names present in the training data.
print(train_df.columns)
# Number of NaN/null values per column (a compact summary instead of the full boolean frame).
train_df.isnull().sum()

In [None]:
# Clean the training data from null/NaN values.
# DataFrame.dropna returns a new frame, so the result must be assigned back for the
# cleaning to take effect.
train_df = train_df.dropna()
# Test rows are intentionally not dropped: every test row needs a prediction.
# Show the per-column missing-value counts of the test data instead.
test_df.isnull().sum()

In [None]:
# Summary statistics of the training data.
train_df.describe()

In [None]:
# Column dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Remove the columns that are not used as features and would be hard to encode
# (the surname plus the two identifier columns), from both frames, in place.
train_df.drop(columns=['Surname', 'id', 'CustomerId'], inplace=True)
test_df.drop(columns=['Surname', 'id', 'CustomerId'], inplace=True)


In [None]:
# Class balance of the target column.
train_df['Exited'].value_counts()

In [None]:
# Partition the feature columns (everything except the target `Exited`) by dtype:
# object-typed columns go to non_num_col, all others to num_col.
num_col, non_num_col = [], []
for each_col in train_df.drop(columns='Exited').columns:
    (non_num_col if train_df[each_col].dtype == 'object' else num_col).append(each_col)

# Show both lists.
print(num_col)
print(non_num_col)

In [None]:
# Plot a histogram for each numerical column.
train_df[num_col].hist(figsize=(12,12))
plt.tight_layout()
plt.show()


In [None]:
# #male and female percentage ...
# figure = plt.figure(figsize=(12,12))
# cnt = 1

# for i in 

In [None]:
# Separate the continuous-valued columns from num_col
# (presumably the remaining numerical columns are treated as discrete).
# Later cells iterate over cont_col.columns.
cont_col = train_df[['CreditScore','Age','Balance','EstimatedSalary']]

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# np.triu keeps the upper triangle (diagonal included); heatmap hides the cells where the
# mask is truthy, so the lower triangle is what remains visible.
cor_rel = train_df.corr(numeric_only=True)
mask = np.triu(cor_rel)
plt.figure(figsize=(10, 8))
sns.heatmap(cor_rel, mask=mask, annot=True, fmt='.3f', cmap='Reds')
plt.show()

In [None]:
# Finding outliers in the data (IQR rule) -> data prep for preprocessing.
def outlier_threshold(dataframe,column,q1=0.25,q3=0.75):
    """Return the (lower, upper) outlier fences of a column using the 1.5 * IQR rule.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    column : str
        Name of the column to analyse.
    q1, q3 : float
        Quantile levels used as the lower and upper quartile.

    Returns
    -------
    tuple
        ``(low_lim, upper_lim)`` where ``low_lim = Q1 - 1.5*IQR`` and
        ``upper_lim = Q3 + 1.5*IQR``.
    """
    Q1 = dataframe[column].quantile(q1)
    Q3 = dataframe[column].quantile(q3)
    iqr = Q3-Q1 #Interquartile Range
    upper_lim = Q3+1.5*iqr
    low_lim = Q1-1.5*iqr
    return low_lim,upper_lim

def outlier_per(dataframe,column):
    """Print the column name and the percentage of its values outside the IQR fences.

    The function works on the `column` argument only; it does not depend on any
    module-level loop variable. An empty column is reported as 0.0 % instead of
    raising ZeroDivisionError.
    """
    low_lim,upper_lim = outlier_threshold(dataframe,column)
    values = dataframe[column]
    # Vectorised count of the values strictly above the upper or below the lower fence.
    n_outliers = int(((values > upper_lim) | (values < low_lim)).sum())
    total = values.shape[0]
    percentage = (n_outliers/total)*100 if total else 0.0
    print(column)
    print("Outliers %:",percentage,"%")


In [None]:
# Report the outlier percentage of each continuous column of the training data.
for col in (cont_col.columns):
    outlier_per(train_df,col)

In [None]:
#since the outliers are not that high -> we could
#leave them or drop them or impute with the IQR...
#imputing with IQR meaning -> replacing outliers with either the closest non-outlier value or with a predetermined value based on domain knowledge.
def replace_with_threshholds(dataframe1,dataframe2,column):
    """Cap the outliers of `column` in both frames at the IQR fences of dataframe1.

    The fences are computed from dataframe1 only (via outlier_threshold) and then
    applied to dataframe1 and dataframe2, so the second frame never influences the
    limits. Both frames are modified in place; nothing is returned.

    Series.clip is used instead of boolean ``.loc`` assignment so that capping an
    integer column at a fractional fence upcasts the column cleanly rather than
    relying on setting an incompatible dtype through ``.loc``.
    """
    low_lim,upper_lim=outlier_threshold(dataframe1,column)
    for frame in (dataframe1, dataframe2):
        # Values below low_lim become low_lim, values above upper_lim become upper_lim;
        # missing values are left untouched.
        frame[column] = frame[column].clip(lower=low_lim, upper=upper_lim)
# Cap the outliers of every continuous column in both the train and the test frame.
for column in (cont_col.columns):
    replace_with_threshholds(train_df,test_df,column)

In [None]:
# Categorical and numerical columns for preprocessing.
# A feature counts as categorical when it has object dtype or exactly two distinct values;
# every other feature is numerical.
cat_attribs, num_attribs = [], []
for col in train_df.drop(columns='Exited').columns:
    is_categorical = train_df[col].dtype == "object" or len(train_df[col].unique()) == 2
    (cat_attribs if is_categorical else num_attribs).append(col)
print(cat_attribs)
print(num_attribs)

In [None]:
# Separate the feature matrix (all columns except `Exited`) from the target column.
X = train_df.drop(columns=['Exited'])
y = train_df['Exited']

In [None]:
#import some ML packages - sklearn lib...
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score,classification_report,f1_score,mean_squared_error,roc_auc_score,precision_score,recall_score,roc_curve,ConfusionMatrixDisplay,confusion_matrix,auc
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,OrdinalEncoder,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression,SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,ExtraTreesClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator,TransformerMixin
from xgboost import XGBClassifier

In [None]:
# Log transformer - useful for skewed data.
class LogTransform(BaseEstimator,TransformerMixin):
    """Transformer that replaces selected columns by log(value + domain_shift).

    Parameters
    ----------
    columns : column label or list of labels
        Columns of the input frame to transform.
    domain_shift : number, default 1
        Constant added to the values before taking the natural logarithm.
    """

    def __init__(self, columns, domain_shift=1):
        self.columns = columns
        self.domain_shift = domain_shift

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of X in which the configured columns are log-transformed."""
        result = X.copy()
        shifted = result[self.columns] + self.domain_shift
        result[self.columns] = np.log(shifted)
        return result

    def fit_transform(self, X, y=None):
        """Same as transform, since fitting is a no-op."""
        return self.transform(X)

In [None]:
# Numerical features: log-transform the two listed columns, then standardise all of them.
num_pipeline = make_pipeline(
    LogTransform(['EstimatedSalary', 'Balance']),
    StandardScaler(),
)
# Categorical features: encode each category as an ordinal code.
cat_pipeline = make_pipeline(OrdinalEncoder())
# Route each column group through its own pipeline.
preprocessing = ColumnTransformer(
    [
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ]
)

In [None]:
# Fit the preprocessing on the training features and transform them; the test features are
# transformed with the already-fitted preprocessing. Both results are wrapped in DataFrames
# with default integer column labels and a default positional index.
x_1 = pd.DataFrame(preprocessing.fit_transform(X,y))
x_1_test = pd.DataFrame(preprocessing.transform(test_df))
x_1

In [None]:
# Training and validation data (70 % / 30 % split of the preprocessed features).
X_train,X_valid,y_train,y_valid=train_test_split(x_1,y,test_size=0.3,random_state=42)

# Random Forest Classifier => create and train, then read its feature importances.
model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)
feature_importance=model.feature_importances_

# Feature names paired with their importances.
# The ColumnTransformer outputs the columns of its transformers in list order, i.e. the
# numerical block first and the categorical block second. The importances therefore follow
# num_attribs + cat_attribs, not the original X.columns order.
transformed_feature_names = list(num_attribs) + list(cat_attribs)
feature_importance_df = pd.DataFrame({'Feature': transformed_feature_names, 'Importance':feature_importance})

# Sort the features by importance, most important first.
feature_importance_df=feature_importance_df.sort_values(by='Importance',ascending=False)
feature_importance_df

In [None]:
# Horizontal bar chart of the feature importances.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature Name')
ax.set_title('Feature Importance')
plt.show()

In [None]:
# Model score helpers.
# fpr = false positive rate, tpr = true positive rate.
def plot_roc_score(fpr,tpr,label = None):
    """Plot a ROC curve on the current axes and show the figure.

    The curve is drawn from `fpr` and `tpr`, its legend entry reads 'auc=' followed by
    `label`, and a dashed diagonal from (0, 0) to (1, 1) is added. Both axes are
    limited to [0, 1].
    """
    legend_text = 'auc=' + str(label)
    axes = plt.gca()
    axes.plot(fpr, tpr, linewidth=2, label=legend_text)
    axes.plot([0, 1], [0, 1], "k--")
    axes.set_xlabel("False Positive Rate")
    axes.set_ylabel("True Positive Rate")
    axes.set_title("ROC Curve")
    axes.axis([0, 1, 0, 1])
    axes.legend(loc=4)
    plt.show()

In [None]:
def plot_confusion_matrix(target_test,target_pred):
    """Compute the confusion matrix of the given labels and predictions and display it.

    The two classes are labelled 'Not exited' and 'Exited' in the plot.
    """
    matrix = confusion_matrix(target_test, target_pred)
    display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=['Not exited', 'Exited'],
    )
    display.plot()
    plt.show()

In [None]:
# Machine learning: fit several classifiers on the training split and record their
# validation metrics (all expressed in percent) in parallel lists.
model_name, accuracy, roc_auc, F1_score = [], [], [], []

# Candidate models.
models = [
    # Tree-based and boosting models
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    ExtraTreesClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    # Linear model
    LogisticRegression(random_state=42),
    # XGBoost
    XGBClassifier(random_state=42),
]

for model in models:
    model.fit(X_train, y_train)
    hard_labels = model.predict(X_valid)
    # Probability of the positive class, used for the AUC.
    positive_proba = model.predict_proba(X_valid)[:, 1]
    model_name.append(type(model).__name__)
    accuracy.append(accuracy_score(y_valid, hard_labels) * 100)
    roc_auc.append(roc_auc_score(y_valid, positive_proba) * 100)
    F1_score.append(f1_score(y_valid, hard_labels) * 100)

In [None]:
# Collect the per-model validation metrics into one comparison table.
models_df = pd.DataFrame({'Model-Name':model_name,'Accuracy': accuracy, 'AUC':roc_auc,'F1-Score':F1_score})
models_df

In [None]:
# Graphical representation of the models: validation AUC per model.
fig, ax = plt.subplots(figsize=(8, 6))
sns.pointplot(data=models_df, x='Model-Name', y='AUC', ax=ax)
plt.xticks(rotation=90)
ax.set_title('Model Comparison: AUC Score')
plt.tight_layout()
plt.show()

In [None]:
# AUC is used as the main performance metric; continue with a GradientBoostingClassifier
# fitted on the training split.
gbc=GradientBoostingClassifier(random_state=42)
gbc.fit(X_train,y_train)

In [None]:
# Validation AUC, computed from the second column of predict_proba
# (the probability of the positive class).
y_gbc_prediction = gbc.predict_proba(X_valid)[:,1]
auc_score = roc_auc_score(y_valid,y_gbc_prediction)
print("AUC score:",auc_score)

In [None]:
# Plot the confusion matrix of the hard class predictions on the validation split.
y_pred=gbc.predict(X_valid)
plot_confusion_matrix(y_valid,y_pred)

In [None]:
# ROC curve of the gradient-boosting model on the validation split.
fpr,tpr,threshold=roc_curve(y_valid,y_gbc_prediction)
# The plotting helper in this notebook is plot_roc_score; label the curve with the
# area under it, computed from the same fpr/tpr points.
plot_roc_score(fpr,tpr,label=round(auc(fpr,tpr),4))

In [None]:
# Submission: predicted probability of `Exited` for every test row, keyed by the test id.
# The `id` column was dropped from test_df earlier, and the positional index of x_1_test
# (0..n-1) is not that id, so the ids are re-read from the test file.
test_ids = pd.read_csv("/kaggle/input/playground-series-s4e1/test.csv", usecols=['id'])['id']
# Align with the rows still present in test_df (same labels, same order).
test_ids = test_ids.loc[test_df.index]

x_submission = x_1_test
# Second column of predict_proba = probability of the positive class.
y_submission = gbc.predict_proba(x_submission)[:, 1]

# Building the frame from named columns keeps the ids as integers.
submission_df = pd.DataFrame({'id': test_ids.to_numpy(), 'Exited': y_submission})
#submission_df.head()

submission_df.to_csv('submission.csv',index=False)