### Importing Library

In [None]:
try:
    import pandas as pd
    import numpy as np 
        
    import seaborn as sns
    import matplotlib.pyplot as plt
    %matplotlib inline
    sns.set(color_codes=True)
    
    print("all loaded")
except:
    print("error")

### Loading Dataset

In [None]:
# Load train.csv (path is relative to the notebook's working directory)
# and preview its first five rows.
train_df = pd.read_csv("train.csv")
train_df.head(5)

In [None]:
# Load test.csv (path is relative to the notebook's working directory)
# and preview its first five rows.
test_df = pd.read_csv("test.csv")
test_df.head(5)

In [None]:
# (rows, columns) of both frames. The train shape is printed explicitly
# because only the last expression of a cell is displayed automatically.
print(train_df.shape)
test_df.shape

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

### DATA PROCESSING

### STEP 1: Dealing with Null values

In [None]:
# Number of missing values per column in the training frame.
train_df.isnull().sum()

In [None]:
# Number of missing values per column in the test frame.
test_df.isnull().sum()

In [None]:
# Replace every missing value in both frames with the literal string "None".
# NOTE(review): this applies to all columns; a numeric column containing NaN
# would turn into object dtype — confirm only categorical columns have nulls.
train_df, test_df = (frame.fillna("None") for frame in (train_df, test_df))

### Step 2 : Dealing with Duplicated Values

#### Duplicate Rows:

In [None]:
# Count of rows that are exact duplicates of an earlier row.
train_df.duplicated().sum()

#### Duplicate Columns:

In [None]:
# train_t = train_df.T
# train_t.shape
# print(train_t.duplicated().sum())

Thus, no duplicate values are present.

### Step 3: Handling Outliers

#### a) Finding Outliers Using Boxplot: boxplots show outliers only for numerical variables, not categorical ones

In [None]:
# Box plot of the training frame to eyeball outliers.
sns.boxplot(data=train_df)
# show() must actually be called; a bare `plt.show` only echoes the function
# object as the cell output.
plt.show()

#### b) Finding Outliers Using Scatterplot

In [None]:
# Scatter of Avg_Account_Balance to spot extreme values. With a single Series
# passed as `data`, seaborn plots the values against the Series index.
sns.scatterplot(data=train_df["Avg_Account_Balance"])
plt.show()

In [None]:
# Drop the ID column from the training frame only; the test frame keeps its
# ID until the modelling section.
# NOTE: not idempotent — re-running this cell raises KeyError because the
# column is already gone.
train_df = train_df.drop("ID",axis=1)

#### Defining Target Variable

In [None]:
# Separate the target column (Is_Lead) from the predictor columns.
# `test` is simply another name for test_df (same object, not a copy).
y = train_df["Is_Lead"]
train = train_df.drop(columns=["Is_Lead"])
test = test_df

#### Differentiating Numerical and Categorical Data for further processing

In [None]:
# The dataset holds two kinds of columns: int64 (numerical) and object
# (categorical). Split each frame accordingly.
#
# Explicit copies are taken because the numerical frames are modified in
# place later on (log transform); assigning into a frame derived from another
# frame can otherwise trigger pandas' SettingWithCopyWarning.
train_categorical = train.select_dtypes(exclude=['int64']).copy()
test_categorical = test.select_dtypes(exclude=['int64']).copy()

train_numerical = train.select_dtypes(include=['int64']).copy()
test_numerical = test.select_dtypes(include=['int64']).copy()


#### Defining column names for numerical data

In [None]:
# Keep the numeric column names as arrays: the scaling step further down
# works on bare `.values` arrays, so the names are needed to rebuild labelled
# DataFrames afterwards.
numcol_names_train = train_numerical.columns.values
numcol_names_test = test_numerical.columns.values

numcol_names_train

In [None]:
# Convert the column-name arrays to plain Python lists.
# `ndarray.tolist()` returns a new list instead of converting in place, so
# the result has to be assigned back. `list(...)` is used because it also
# works when this cell is re-run and the names are already lists.
numcol_names_train = list(numcol_names_train)
numcol_names_test = list(numcol_names_test)

numcol_names_test

### Checking Skewness and Kurtosis for Numerical Columns

#### For Train Data

In [None]:
# Kernel density estimate of the training Avg_Account_Balance, used to judge
# skewness visually.
# NOTE(review): newer seaborn releases deprecate `bw` in favour of
# `bw_method` / `bw_adjust` — confirm against the installed version.
sns.kdeplot(train_numerical['Avg_Account_Balance'], bw=0.5)    # bw is the smoothing (bandwidth) parameter
plt.show()

In [None]:
# Kernel density estimate of Age in the training data.
sns.kdeplot(train_numerical['Age'], bw=0.5)
# Explicit show(), consistent with the other plotting cells; it also keeps the
# Axes repr out of the cell output.
plt.show()

In [None]:
# Kernel density estimate of Vintage in the training data.
sns.kdeplot(train_numerical['Vintage'], bw=0.5)
# Explicit show(), consistent with the other plotting cells; it also keeps the
# Axes repr out of the cell output.
plt.show()

#### For test Data 

In [None]:
# Same density check as for the training data, on the test Avg_Account_Balance.
sns.kdeplot(test_numerical['Avg_Account_Balance'], bw=0.5)    # bw is the smoothing (bandwidth) parameter
plt.show()

There is skewness in the Avg_Account_Balance column for both train and test data.

#### Dealing with Skewness Using Log Transformation

In [None]:
# Replace the training balance by its natural log to reduce skew, then re-plot
# the density.
# NOTE: not idempotent — every re-run of this cell applies the log again.
# NOTE(review): np.log gives -inf/NaN for values <= 0; this assumes all
# balances are strictly positive — confirm against the data.
train_numerical['Avg_Account_Balance'] = np.log(train_numerical['Avg_Account_Balance'])
sns.kdeplot(train_numerical['Avg_Account_Balance'])
plt.show()

In [None]:
# Apply the same natural-log transform to the test balance and re-plot.
# NOTE: not idempotent — every re-run of this cell applies the log again.
# NOTE(review): np.log gives -inf/NaN for values <= 0; this assumes all
# balances are strictly positive — confirm against the data.
test_numerical['Avg_Account_Balance'] = np.log(test_numerical['Avg_Account_Balance'])
sns.kdeplot(test_numerical['Avg_Account_Balance'])
# show() must actually be called; a bare `plt.show` only echoes the function
# object as the cell output.
plt.show()

In [None]:
# Skewness and kurtosis of every numeric training column (one row per
# statistic, one column per feature).
train_numerical.agg(['skew', 'kurtosis'])

#### Both skew and kurtosis can be analyzed through descriptive statistics. Acceptable values of skewness fall between − 3 and + 3, and kurtosis is appropriate from a range of − 10 to + 10

In [None]:
# Histograms for every numerical column; the trailing ';' keeps the returned
# array of Axes out of the cell output.
train_numerical.hist(figsize=(15, 10), bins=50, xlabelsize=8, ylabelsize=8);

### Normalizing and Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features with StandardScaler.
# The scaler is fitted on the training data only, and the same fitted
# parameters are then applied to the test data. Re-fitting on the test split
# would put the two splits on different scales, so identical raw values would
# reach the model as different inputs.
scaler = StandardScaler()
train_numerical = pd.DataFrame(
    scaler.fit_transform(train_numerical.values),
    columns=numcol_names_train,
)

# NOTE(review): transform() requires the test frame to have the same numeric
# columns, in the same order, as the training frame — confirm against the data.
test_numerical = pd.DataFrame(
    scaler.transform(test_numerical.values),
    columns=numcol_names_test,
)

### Encoding Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column.
# For columns present in both frames, one encoder is fitted on the values of
# both splits so that a given category always maps to the same integer.
# Encoding the splits independently only agrees when both happen to contain
# exactly the same set of categories.
train_encoded = {}
test_encoded = {}
for col_name in train_categorical.columns:
    encoder = LabelEncoder()
    if col_name in test_categorical.columns:
        encoder.fit(pd.concat([train_categorical[col_name], test_categorical[col_name]]))
        test_encoded[col_name] = encoder.transform(test_categorical[col_name])
    else:
        encoder.fit(train_categorical[col_name])
    train_encoded[col_name] = encoder.transform(train_categorical[col_name])

# Columns that exist only in the test frame have no training counterpart to
# stay consistent with, so they are encoded on their own.
for col_name in test_categorical.columns:
    if col_name not in test_encoded:
        test_encoded[col_name] = LabelEncoder().fit_transform(test_categorical[col_name])

# Rebuild the frames with their original column order and row index.
train_categorical = pd.DataFrame(train_encoded, index=train_categorical.index)
test_categorical = pd.DataFrame(
    {col_name: test_encoded[col_name] for col_name in test_categorical.columns},
    index=test_categorical.index,
)

In [None]:
# Display the label-encoded categorical columns.
pd.DataFrame(train_categorical)

#### Combining the Numerical and Categorical Data

In [None]:
# Stitch the pieces back together column-wise: categorical columns first,
# then numerical columns, and (train only) the target as the last column.
# NOTE(review): concat with axis=1 aligns rows on the index; this assumes all
# pieces share the same row index — confirm.
train_new = pd.concat([train_categorical,train_numerical,y],axis=1)
test_new = pd.concat([test_categorical,test_numerical],axis=1)

### Checking For Correlations:

In [None]:
# Correlation matrix of the complete training frame (features and target),
# drawn as an annotated heatmap.
corr_train = train_new.corr()
plt.figure(figsize=(13,5))

ax = sns.heatmap(corr_train,annot=True)
# show() must actually be called; a bare `plt.show` only echoes the function
# object as the cell output.
plt.show()


#### Checking Variable Correlation with Target Variable:

In [None]:
# Correlation of every predictor column with the target Is_Lead, one value per
# column (Series.corr defaults to Pearson). The result is a Series labelled by
# column name.
imp = train_new.drop("Is_Lead", axis=1).apply(lambda x: x.corr(train_new.Is_Lead))
print(imp)

In [None]:
# Positions that sort the correlations in ascending order.
# `imp` is labelled by column name, so the argsort is taken on its raw values
# and applied with .iloc; indexing it with integers through [] depends on a
# positional fallback that pandas has deprecated.
indices = np.argsort(imp.values)
print(imp.iloc[indices])     #Sorted in ascending order

#### Removing Variable with Low correlation with Target Variables

In [None]:
# Print the predictors whose absolute correlation with the target exceeds
# 0.02, i.e. the ones worth keeping; anything not printed is a candidate for
# removal.
# Name and value are read together from `imp`, so the printed name cannot get
# out of step with the tested value and no integer [] lookup on a
# label-indexed Series is needed.
for feature_name, corr_value in imp.items():
    if np.abs(corr_value) > 0.02:
        print(feature_name)

Can drop occupation and Id

In [None]:
# train_new1 = train_new.drop(["Occupation"],axis=1)

In [None]:
# import matplotlib.pyplot as plt

# names=['cylinders','displacement','horsepower','weight','acceleration','model year', 'name']
# plt.title('Miles Per Gallon')

# #Plotting horizontal bar graph
# plt.barh(range(len(indices)), imp[indices], color='g', align='center')
# plt.yticks(range(len(indices)), [names[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()

#### Checking Predictors Co-relation With each other

In [None]:
# Pairwise check of how strongly the predictors correlate with each other.
# `train_new1` is not defined by any executed cell of this notebook, so the
# check runs on the predictor columns of `train_new`, the frame that is
# actually modelled.
corr_frame = train_new.drop(columns=["Is_Lead"])

# The absolute correlation matrix is computed once and then looked up per
# pair, instead of recomputing a correlation for every (i, j) combination.
abs_corr = corr_frame.corr().abs()
col_labels = abs_corr.columns

for i in range(len(col_labels)):
    for j in range(len(col_labels)):
        if i != j:
            corr_1 = abs_corr.iloc[i, j]
            if corr_1 < 0.3:
                print( col_labels[i] , " is not correlated  with ", col_labels[j])
            elif corr_1 > 0.75:
                print( col_labels[i] , " is highly  correlated  with ", col_labels[j])

Thus, there is no strong correlation among our variables. Great news!

### Find Mutual Information OR Information Gain

In [None]:
# mutual_info_regression stays imported in case other cells still refer to it.
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

col = train_new.drop(["Is_Lead"],axis=1)

# Is_Lead is a class label (it is modelled with a binary classifier below), so
# the classification estimator is the appropriate one; the regression variant
# treats the target as a continuous quantity.
# The label-encoded categorical columns are flagged as discrete features, and
# a fixed seed makes the (randomised) estimate repeatable.
discrete_mask = col.columns.isin(train_categorical.columns)
mig = mutual_info_classif(col, y, discrete_features=discrete_mask, random_state=294)
mig

In [None]:
# Wrap the mutual-information scores in a Series labelled by feature name.
mig = pd.Series(mig, index=col.columns)
mig

In [None]:
# Bar chart of the mutual-information scores, highest first.

mig.sort_values(ascending=False).plot.bar(figsize=(10, 4))

### Modelling

In [None]:
# Assemble the modelling inputs:
#   y              - target column of the combined training frame
#   X              - every other training column
#   predictor_test - test features with the (encoded) ID column removed
y = train_new['Is_Lead']
X = train_new.drop(columns=['Is_Lead'])
predictor_test = test_new.drop(columns=['ID'])

In [None]:
# Feature columns of the test matrix; compare with X.columns in the next cell.
predictor_test.columns

In [None]:
# Feature columns of the training matrix; these should match
# predictor_test.columns in both names and order.
X.columns

In [None]:
# Model evaluation metrics. The wildcard import is what provides
# accuracy_score and roc_auc_score to the cross-validation cell below.
from sklearn.metrics import *
# Boosting libraries. Only LGBMClassifier is used in the cells shown here;
# xgboost is imported but not referenced in them.
import xgboost
from lightgbm import LGBMClassifier

In [None]:
# LightGBM binary classifier evaluated on AUC.
# Notes on how the settings interact:
#  - n_estimators=50000 acts as an upper bound; the fit call in the
#    cross-validation cell passes early_stopping_rounds=100.
#  - max_depth=4 limits a tree to at most 2**4 = 16 leaves, so num_leaves=300
#    is not the binding constraint.
#  - bagging_fraction / subsample_freq are LightGBM's row-subsampling
#    settings (fraction of rows, and how often to resample).
#    NOTE(review): these are aliases of subsample / bagging_freq — confirm the
#    installed version accepts the mixed naming without warnings.
#  - scale_pos_weight=2 up-weights the positive class.
#  - random_state fixes the model's own randomness; n_jobs=-1 uses all cores.
model = LGBMClassifier(metric = 'auc', 
                       n_estimators=50000,    
                       bagging_fraction=0.95, 
                       subsample_freq = 2, 
                       objective ="binary",
                       importance_type = "gain",
                       verbosity = -1,
                       random_state=294,
                       num_leaves = 300,
                       boosting_type = 'gbdt',
                       learning_rate=0.15,
                       max_depth=4, 
                       scale_pos_weight=2,
                       n_jobs=-1 
                      )

In [None]:
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation of `model`.
# For every fold the model is refitted on the training part and scored on the
# held-out part; the running mean accuracy and running mean ROC-AUC over the
# folds completed so far are printed after each fold.
accuracy = []
rocauc = []   # created once, before the loop, so scores accumulate across folds

# A fixed seed makes the shuffled fold assignment reproducible between runs.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=294)

for train_index, test_index in skf.split(X, y):
    print("Train",train_index,"Validation:",test_index)
    print(train_index.shape,test_index.shape)

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # NOTE(review): LightGBM 4.x removed the early_stopping_rounds / verbose
    # fit arguments in favour of callbacks — confirm the installed version.
    model.fit(X_train,y_train,eval_set=[(X_train, y_train),(X_test, y_test)],early_stopping_rounds=100 ,verbose=100)
    pred = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    score = accuracy_score(y_test, pred)
    accuracy.append(score)
    print(np.mean(accuracy))

    # AUC is computed from the predicted probability of the positive class.
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    rocauc.append(roc_auc)
    print(np.mean(rocauc))