In [None]:
#Importing Libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the raw training and test files into DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Summary statistics of the training columns.
train.describe()

## Data Exploration and Preprocessing

## Missing Values

In [None]:
# Per-column count of missing entries in the training set
train.isna().sum()

In [None]:
# Per-column count of missing entries in the test set
test.isna().sum()

### Missing Values are imputed with Backfill method

In [None]:
## Names of the categorical columns: every training column whose name contains 'cat_'
cat_vars = list(filter(lambda name: 'cat_' in name, train.columns))

In [None]:
# Impute missing categorical values in both the training and the test frame.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
# A backfill alone cannot fill NaNs that sit after the last valid row, so a
# forward fill is chained as a fallback; the test frame is never dropna()'d,
# so any leftover NaN there would otherwise reach the label encoder.
for frame in (train, test):
    for col in cat_vars:
        frame[col] = frame[col].bfill().ffill()

In [None]:
# Re-check missing entries per training column after imputation.
train.isna().sum()

In [None]:
# Re-check missing entries per test column after imputation.
test.isna().sum()

In [None]:
# Discard every training row that still contains a missing value
# (rebinding the name rather than mutating the frame in place).
train = train.dropna()

### Label Encoding of Categorical Variables

In [None]:
## Number of distinct values per column for the positional slice 8:-1
## (column index 8 up to, but excluding, the last column).
## Presumably these are the categorical variables - TODO confirm the
## offset 8 against the actual column order.
train.iloc[:,8:-1].nunique()

In [None]:
# Distinct-value count per test column, from column index 8 through the last
# column (presumably the categorical variables - TODO confirm the offset).
test.iloc[:,8:].nunique()

### Here, I have created an array with all the labels present in the categorical variables; that array is fitted to a LabelEncoder object for homogeneous label encoding.

In [None]:
# Build the label vocabulary from every string-typed categorical column of
# BOTH frames. The encoder is later applied to all object-typed columns in
# cat_vars for train and test, so a vocabulary taken from only two training
# columns makes lb.transform raise ValueError on any label it has not seen.
label_arrays = [
    frame[col].dropna().to_numpy()
    for frame in (train, test)
    for col in cat_vars
    if frame[col].dtype == 'object'
]
# np.unique returns the sorted distinct labels, the same shape of result
# np.union1d gave; the guard covers the case of no object-typed columns.
labels = np.unique(np.concatenate(label_arrays)) if label_arrays else np.array([], dtype=object)

In [None]:
from sklearn.preprocessing import LabelEncoder
# One encoder instance, reused below for the categorical columns of both frames.
lb=LabelEncoder()

In [None]:
## Fit the label encoder on the collected label vocabulary
lb.fit(labels)

In [None]:
## Encode the string-typed categorical columns of the training set;
## columns that are not of object dtype are left untouched.
for col in filter(lambda name: train[name].dtype=='object', cat_vars):
    train[col]=lb.transform(train[col])
    

In [None]:
## Encode the string-typed categorical columns of the test set with the
## same fitted encoder; columns that are not of object dtype are left untouched.
for col in filter(lambda name: test[name].dtype=='object', cat_vars):
    test[col]=lb.transform(test[col])
    

In [None]:
# Persist both preprocessed frames (without the index) for the modelling stage
for frame, out_path in ((train, "Cleaned_Train_data.csv"), (test, "Cleaned_Test_data.csv")):
    frame.to_csv(out_path, index=False)

## Modelling

In [None]:
import numpy as np
import pandas as pd
import sklearn

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [None]:
# Reload the preprocessed frames written by the preprocessing stage.
train, test = map(pd.read_csv, ("Cleaned_Train_data.csv", "Cleaned_Test_data.csv"))

In [None]:
# Target vector, selected by column name.
y = train.loc[:, 'target']
# Feature matrix: everything except the first and the last column
# (presumably an id column and the target - TODO confirm the column order).
X = train.iloc[:, 1:-1]

In [None]:
# Keep exactly the columns the model is trained on, in the same order.
# Unlike the positional slice test.iloc[:,1:], this is idempotent: re-running
# the cell no longer strips one more column each time. It also fails loudly
# with KeyError if the test frame lacks a training feature, instead of
# silently feeding misaligned columns to the models.
test=test[X.columns]

In [None]:
# Stratified 80/20 hold-out split. The fixed random_state makes the split,
# and therefore every score reported below, reproducible across kernel restarts.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both the training and the hold-out split.
sc=StandardScaler().fit(X_train)
X_train,X_test=sc.transform(X_train),sc.transform(X_test)

In [None]:
### Scoring Function
def Score(model,X_train,y_train,X_test,y_test,train=True):
    """Print ROC AUC diagnostics for a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator
        Should expose ``predict_proba`` or ``decision_function``; ``predict``
        is used only as a last resort.
    X_train, y_train : training features and labels, used for the training
        report and for the 10-fold cross-validation.
    X_test, y_test : hold-out features and labels, used for the test report.
    train : bool, default True
        Truthy prints the training report (in-sample ROC AUC plus the mean and
        standard deviation of the cross-validated ROC AUC); falsy prints the
        hold-out ROC AUC.

    Returns
    -------
    None. All results are printed.
    """
    def _positive_scores(features):
        # ROC AUC ranks samples, so it needs continuous scores. Hard 0/1
        # labels from predict() reduce the curve to a single threshold and
        # are not comparable to the 'roc_auc' scorer used for the CV below.
        if hasattr(model,"predict_proba"):
            return model.predict_proba(features)[:,1]
        if hasattr(model,"decision_function"):
            return model.decision_function(features)
        return model.predict(features)

    if train:
        print("Training Result \n")
        print("ROC AUC Value:  {0:0.4f} \n".format(roc_auc_score(y_train,_positive_scores(X_train))))
        # cross_val_score refits clones of the estimator on each fold.
        scores=cross_val_score(estimator=model,X=X_train,y=y_train,cv=10,scoring='roc_auc',n_jobs=-1)
        print("Cross-Validation Score: \n",scores.mean())
        print("Standard Deviation: \n",scores.std())
    else:
        print("TestResult \n")
        print("ROC AUC Value:  {0:0.4f} \n".format(roc_auc_score(y_test,_positive_scores(X_test))))
             

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 100 trees of maximum depth 10, trained on all cores.
# Only the non-default settings are spelled out:
#   * min_impurity_split was removed from scikit-learn and passing it now
#     raises TypeError, so it is dropped.
#   * max_features='auto' was removed as well; for classifiers it meant
#     sqrt(n_features), which is written explicitly as 'sqrt'.
#   * the duplicated `model_RF=model_RF=` assignment is collapsed.
model_RF=RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
    n_jobs=-1,
)

### Performance on Partial Training Data

In [None]:
# Fit the random forest on the scaled training split.
model_RF.fit(X_train,y_train)

In [None]:
# Training-side report for the random forest.
Score(model=model_RF, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=True)

In [None]:
# Hold-out report for the random forest.
Score(model=model_RF, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=False)

### Training on Full Training Data

In [None]:
# Refit the random forest on the full training data. The scaler keeps the
# statistics it learned from the training split.
X_full_scaled = sc.transform(X)
model_RF.fit(X_full_scaled, y)

In [None]:
# Training-side report after the refit. The in-sample figure is computed on
# the training split only, although the model has now seen every row of X.
Score(model=model_RF, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=True)

In [None]:
# NOTE: when run after the full-data refit, the hold-out rows were part of the
# training data, so this figure is optimistic and not a generalisation estimate.
Score(model=model_RF, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=False)

In [None]:
# Positive-class probabilities for the submission rows. The forest was
# trained on standardised features, so the test frame must go through the
# same scaler; feeding raw values compares them against split thresholds
# learned on the scaled range.
pred_RF=model_RF.predict_proba(sc.transform(test))[:,1]

## XGBoost Model

In [None]:
# XGBClassifier is not imported in any visible cell of this notebook, so the
# constructor call raised NameError on a fresh kernel.
from xgboost import XGBClassifier

# Gradient-boosted trees: 150 boosting rounds, maximum tree depth 10.
model_XGB=XGBClassifier(n_estimators=150,max_depth=10)

### Performance Over Partial Training Data

In [None]:
# Fit the XGBoost model on the scaled training split.
model_XGB.fit(X_train,y_train)

In [None]:
# Training-side report for the XGBoost model.
Score(model=model_XGB, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=True)

In [None]:
# Hold-out report for the XGBoost model.
Score(model=model_XGB, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=False)

### Training on Full Training Data

In [None]:
# Refit the XGBoost model on the full training data, scaled with the
# already-fitted scaler. (`sc.tranform` was a typo that raised AttributeError.)
model_XGB.fit(sc.transform(X),y)

In [None]:
# Training-side report after the refit. The in-sample figure is computed on
# the training split only, although the model has now seen every row of X.
Score(model=model_XGB, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=True)

In [None]:
# NOTE: when run after the full-data refit, the hold-out rows were part of the
# training data, so this figure is optimistic and not a generalisation estimate.
Score(model=model_XGB, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train=False)

In [None]:
# Positive-class probabilities for the submission rows. The model was trained
# on standardised features, so the test frame is passed through the same
# scaler before prediction.
pred_XGB=model_XGB.predict_proba(sc.transform(test))[:,1]

### Final Prediction

#### The final submission is a weighted ensemble of the predicted probabilities: 60% XGBoost and 40% Random Forest.

In [None]:
# Weighted blend of the two models' positive-class probabilities
xgb_weight, rf_weight = 0.6, 0.4
final_pred = xgb_weight*pred_XGB + rf_weight*pred_RF

### Submission

In [None]:
# Load the sample submission file to use as the output template.
sub=pd.read_csv("sample_submissions.csv")

In [None]:
# Write the blended probabilities into the template's target column.
sub = sub.assign(target=final_pred)

In [None]:
# Write the submission file without the index column.
sub.to_csv("Submission_XGB_RF.csv",index=False)