## Important Functions

In [20]:
# For filling up the missing values
def null_fill(train):
  for i in train.columns:
    if train[i].dtypes in ['object']:
      train[i] = train[i].fillna(train[i].mode())
    else:
      if i in ['no_of_children','no_of_days_employed','total_family_members','migrant_worker']:
        train[i] = train[i].fillna(math.floor(train[i].mean()))
      else:
        train[i] = train[i].fillna(train[i].mean())

  train['owns_car'] = train['owns_car'].fillna(train['owns_car'].value_counts().sort_values(ascending = False).index[0])
  train['gender'] = train['gender'].fillna(train['gender'].value_counts().sort_values(ascending = False).index[0])

  for i in ['no_of_children','no_of_days_employed','total_family_members','migrant_worker']:
    train[i] = train[i].astype('int')
  
  train['credit_limit_used(%)'] = train['credit_limit_used(%)'].astype('float')

In [21]:
# For dropping unimportant columns in starting itself
def remove_unimportant_columns(train):
  train.drop(columns=['customer_id','name'],inplace=True)

In [22]:
# To extract discrete features
def discrete_obtain(train):
  discrete_features = train.dtypes == int
  return discrete_features

In [23]:
from sklearn.feature_selection import mutual_info_classif

# For getting the mutual information scores of all features w.r.t to the target variable
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_classif(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores



In [24]:
# To encode categorical features to numerical data
def label_encoder(X):
  for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()
    X[colname] = X[colname].astype('int')

## Feature Extraction and Choosing Best Model

In [25]:
import pandas as pd
import numpy as np
import math

In [26]:
train = pd.read_csv('/content/train.csv')
y_train = train.pop('credit_card_default')
# Applying all the above functions to clean and make the data useful
remove_unimportant_columns(train)
null_fill(train)
label_encoder(train)
discrete = discrete_obtain(train)

In [27]:
# To standardize the data
def standardize_float(train):
  new = train.select_dtypes('float')

  new = (new - new.mean())/new.std()
  a = []
  for i in train.columns:
    if train[i].dtypes == 'float':
      a.append(i)

  train[a] = new
  train['no_of_days_employed'] = (train['no_of_days_employed'] - train['no_of_days_employed'].mean())/train['no_of_days_employed'].std()

In [28]:
standardize_float(train)
train.head()

Unnamed: 0,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months
0,46,0,0,0,0,-0.138582,-0.48334,0,1,1,0.073808,-0.16707,0.706847,-2.373413,2,1
1,29,1,0,0,0,-0.135699,-0.467764,1,2,0,-0.95454,-0.039004,-0.008,0.737582,0,0
2,37,1,0,0,0,0.044087,-0.486283,1,2,0,0.963362,-0.013107,-0.314363,-1.31985,0,0
3,39,0,0,0,0,-0.117072,-0.401609,2,2,0,-0.534591,-0.073399,-1.097291,-0.286164,0,0
4,46,1,1,0,0,0.278938,-0.477229,2,1,0,0.375962,0.063193,0.774928,1.433331,0,0


In [29]:
discrete = discrete_obtain(train)

In [30]:
imp_feat = make_mi_scores(train,y_train,discrete)

In [31]:
imp_feat

credit_score               0.219968
prev_defaults              0.159089
default_in_last_6months    0.146276
credit_limit_used(%)       0.090940
no_of_days_employed        0.003876
occupation_type            0.003140
gender                     0.001607
credit_limit               0.001134
migrant_worker             0.000549
no_of_children             0.000486
total_family_members       0.000367
age                        0.000330
net_yearly_income          0.000294
owns_car                   0.000148
owns_house                 0.000004
yearly_debt_payments       0.000000
Name: MI Scores, dtype: float64

In [32]:
important_features = imp_feat[imp_feat > 0.001].index  # selecting importatn features, i,e, mutual score > 0.001

In [33]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()

train = pd.DataFrame(st.fit_transform(train), columns=train.columns)

In [34]:
X_train = train[important_features]

In [35]:
from sklearn.model_selection import cross_val_score

In [36]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [37]:
xg = XGBClassifier()
rf = RandomForestClassifier()

In [38]:
# Choosing the best model by checking the cross validation scores

acc1 = cross_val_score(xg,X_train,y_train,cv=5,scoring='accuracy').mean()
print(acc1)

0.9804296242208281


In [39]:
acc2 = cross_val_score(rf,X_train,y_train,cv=5,scoring='accuracy').mean()
print(acc2)

0.9792654986791118


## USING XGBOOST TO PREDICT TEST DATA RESULTS AND SUBMISSION

In [40]:
test = pd.read_csv('/content/test.csv')

In [41]:
remove_unimportant_columns(test)

In [42]:
null_fill(test)

In [43]:
label_encoder(test)

In [44]:
standardize_float(test)

In [45]:

test = pd.DataFrame(st.transform(test), columns = test.columns)

In [46]:
X_test = test[important_features]

In [47]:
X_train.columns == X_test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True])

In [48]:
xg.fit(X_train,y_train)

XGBClassifier()

In [49]:
def create_submission(xg,X_test,test_file,name_of_file):
  final = pd.DataFrame(test_file['customer_id'])
  pred = xg.predict(X_test)
  final['credit_card_default'] = pred
  final.to_csv('/content/'+ name_of_file+'.csv',index=False)


In [50]:
test_file = pd.read_csv('/content/test.csv')

In [51]:
create_submission(xg,X_test,test_file,'submission_amexpert_final') 
#Submission csv file made
#Achieved accuracy of 91.88975%