# Load the data

In [16]:
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, ensemble, tree
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score,GridSearchCV,cross_validate
from sklearn.metrics import confusion_matrix, auc,roc_auc_score,roc_curve,recall_score,classification_report
pd.set_option('display.max_rows', 100, 'display.max_columns', 400)

# Import and Explore the data 

In [3]:
"""
Step 1 :- Load the data
Step 2 :- Print Number of rows and Columns
Step 3 :- Print datatypes of all the columns in the data
Step 4 :- Check for the duplicate rows in the data and remove them
Step 5 :- Check for the missing values in the data 

"""

def Load_File(path) :
    """Load a CSV file, print a short summary and return the de-duplicated data.

    Follows the steps listed for this cell: load the file, report the number
    of rows and columns, report the column dtypes, remove fully duplicated
    rows, and report the count of missing values per column.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data with duplicate rows removed, or ``None`` (after
        printing a message) when ``path`` is not an existing file.
    """
    # Imported locally: the notebook's import cell does not bring in ``os``.
    import os

    if not os.path.isfile(path):
        # %-formatting (rather than ``path + "..."``) also accepts Path objects.
        print("%s does not exist. Enter the correct path" % path)
        return None

    data = pd.read_csv(path)
    print("\n" + "Number of rows in data are %s" % len(data))
    print("Number of columns in data are %s" % len(data.columns) + "\n")
    print("Following are the data types of columns:- ")
    print(data.dtypes)
    print("Data Import is Complete")

    # Step 4: drop rows that are exact duplicates of an earlier row.
    duplicate_rows = int(data.duplicated().sum())
    print("Number of duplicate rows removed: %s" % duplicate_rows)
    if duplicate_rows:
        data = data.drop_duplicates()

    # Step 5: missing values per column (computed after de-duplication).
    print(data.isnull().sum())
    return data

In [None]:
"""
Check if there is imbalance in the data

"""

def overfit_columns(data, column_name):
    """Print a warning when one category dominates ``column_name``.

    A column is reported as imbalanced when its most frequent value accounts
    for more than 90% of the non-null entries.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect.
    column_name : str
        Name of the column to inspect.

    Returns
    -------
    None
        The result is only printed.
    """
    counts = data[column_name].value_counts()
    if counts.empty:
        # Column has no non-null values: nothing to compare.
        return

    # value_counts() sorts descending, so the first *position* is the most
    # frequent value. ``counts[0]`` would be a label lookup and fails (or
    # picks the wrong row) when the categories are themselves integers.
    max_value = counts.iloc[0]
    total_sum = counts.sum()

    if (max_value / total_sum) * 100 > 90:
        print('%s feature categories are imbalanced' % column_name)

In [4]:
"""
Impute the missing values. 
* For Numerical Data use mean or median to impute the data.
* For Categorical Data use mode to impute the data 

"""

def categorical_imputing(df, column_name):
    """Fill missing values of a categorical column with its mode.

    The column is overwritten on ``df`` itself; the distinct values present
    after filling are printed and the same frame is returned.
    """
    most_frequent = df[column_name].mode().iloc[0]
    filled = df[column_name].fillna(most_frequent)
    df[column_name] = filled
    print(filled.unique())
    return df

def numerical_imputing_and_encoding(df, column_name):
    """Median-impute a numeric column, then standardise it to z-scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str
        Name of the numeric column to impute and scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``column_name`` replaced.
    """
    # Use the median of this column only. ``df.median()`` returns a Series
    # indexed by column names, which fillna() aligns against row labels and
    # therefore does not fill the gaps with the intended value.
    column_median = df[column_name].median()
    df[column_name] = df[column_name].fillna(column_median)

    # Z-score using the sample standard deviation (pandas default, ddof=1).
    # NOTE(review): a constant column has std 0 and yields NaN here.
    df[column_name] = (df[column_name] - df[column_name].mean()) / df[column_name].std()
    return df


In [6]:
"""Make Two Lists
1. One for columns whose datatype is object
2. Second for columns whose datatype is numeric
"""

# Split the column names by dtype: 'object' columns are treated as
# categorical, every other dtype as numerical.
categorical_columns = [name for name, dtype in data.dtypes.items() if dtype == 'object']
numerical_columns = [name for name, dtype in data.dtypes.items() if dtype != 'object']

In [7]:
"""Make a barplot for object columns and see the counts of categories"""

def barplot_count(data, Column_Name, size=(4,5), width = 0.25, height = 10., hue = None):
    """Draw a count plot of ``Column_Name`` with the count written above each bar.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column(s) to plot.
    Column_Name : str
        Categorical column whose category counts are plotted; bars are
        ordered from most to least frequent.
    size : tuple, optional
        Figure size passed to ``plt.figure``.
    width : float, optional
        Horizontal offset of each label from the left edge of its bar.
    height : float, optional
        Vertical offset of each label above the top of its bar.
    hue : str, optional
        Column used to split each bar into sub-groups; a falsy value
        disables the split.
    """
    plt.figure(figsize=size)
    plt.title('Count of %s Variable' % (Column_Name))

    # ``x`` is passed by keyword: recent seaborn releases reserve the first
    # positional argument for ``data``.
    ax = sns.countplot(
        x=Column_Name,
        data=data,
        hue=hue or None,
        palette=sns.color_palette("husl"),
        order=data[Column_Name].value_counts().index,
    )

    for p in ax.patches:
        # Empty hue/category combinations can report a NaN height; treat it
        # as zero for both the label text and its position.
        bar_height = np.nan_to_num(p.get_height())
        ax.annotate('%g' % bar_height, (p.get_x() + width, bar_height + height), fontsize=8)

In [9]:
# """Plot For all the categorical columns """

# fig, ax = plt.subplots(1, len(categorical_columns), figsize=(25, 7))
# for i, column in enumerate(categorical_columns):
#     if len(data[column].unique()) <= 10:
#         sns.countplot(column, data=data, palette=sns.color_palette("husl"),
#                       order = data[column].value_counts().index)

# fig.show()

# One count plot per categorical column, stacked vertically. The grid has two
# rows, so zip() below plots at most the first two categorical columns.
fig, axs = plt.subplots(2, 1, figsize=(8, 12))
fig.subplots_adjust(hspace=0.6)

for column, ax in zip(categorical_columns, axs.flatten()):
    # ``x`` is passed by keyword: recent seaborn releases reserve the first
    # positional argument for ``data``.
    sns.countplot(x=column, data=data, palette='husl',
                  order=data[column].value_counts().index, ax=ax)
    # Label this subplot explicitly; plt.xlabel() would always target the
    # current (last created) axes instead of ``ax``.
    ax.set_xlabel(column, fontsize=12)
    ax.set_title(str(column), fontweight='bold', size=20)

    total = len(data[column])
    # Lift each label above its bar by 0.5% of the row count.
    height = int(total * 0.005)
    print(total)
    for p in ax.patches:
        height_bar = np.nan_to_num(p.get_height())
        ax.annotate('%g' % height_bar, (p.get_x() + 0.20, height_bar + height), fontsize=8)

In [10]:
"""Plot the distribution of each numerical variable to see which ones are skewed. A transformation can then be applied to reduce the skew.
For example :- Log Transformation, Square Transformation
"""

# ``stats`` is not imported by the notebook's import cell.
from scipy import stats

# Row 0: density estimate of each numerical column.
# Row 1: normal Q-Q plot of the same column.
# squeeze=False keeps ``ax`` two-dimensional even with a single column.
fig, ax = plt.subplots(2, len(numerical_columns), figsize=(25, 7), squeeze=False)
for i, column in enumerate(numerical_columns):
    values = data[column].dropna()
    sns.kdeplot(values, ax=ax[0, i])
    ax[0, i].set_xlabel(column)
    # Draw on this column's own axes; ``plot=plt`` would pile every Q-Q plot
    # onto whichever axes happens to be current.
    stats.probplot(values, plot=ax[1, i])
    ax[1, i].set_title('Q-Q plot of %s' % column)

fig.tight_layout()
plt.show()

# Example log transform for a skewed column (+1 so that zeros stay finite).
# NOTE(review): ``column`` is whatever the loop above ended on, i.e. the last
# numerical column — set it to the column that is actually skewed.
data[column] = np.log10(data[column] + 1)


In [11]:
"""Next Step is to CHECK FOR OUTLIERS in the data"""

def drop_outliers(data, column):
    """Drop rows whose ``column`` value lies outside the 1.5 * IQR fences.

    Both fences are computed once from the data as passed in. Recomputing
    the percentiles after the high outliers have been removed would shift
    the lower fence and let low outliers slip through.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; rows are removed in place.
    column : str
        Numeric column used to detect outliers.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the outlier rows removed.
    """
    # Series.quantile skips NaN (np.percentile would return NaN and silently
    # disable the filter); it uses linear interpolation by default.
    q1, q3 = data[column].quantile([0.25, 0.75])
    whisker = 1.5 * (q3 - q1)

    # Rows with a NaN value compare False on both sides and are kept.
    is_outlier = (data[column] > q3 + whisker) | (data[column] < q1 - whisker)
    data.drop(data.index[is_outlier], inplace=True)
    return data
    

In [None]:
"""NEXT STEP IS ONE-HOT ENCODING OF THE DATA"""

def categorical_encoding(df, column_name):
    """One-hot encode ``column_name`` and return a new frame without it.

    The distinct values of the column are printed first. The indicator
    columns are named ``<column_name>_<value>`` and the first level is
    dropped. The frame passed in is left untouched.
    """
    print(df[column_name].unique())
    dummies = pd.get_dummies(
        df[column_name],
        prefix=column_name,
        prefix_sep='_',
        drop_first=True,
    )
    remaining = df.drop(columns=column_name)
    return pd.concat([remaining, dummies], axis=1)

In [None]:
"""STANDARDIZE THE DATASET FOR LOGISTIC REGRESSION"""


from sklearn.preprocessing import StandardScaler, RobustScaler

# Both scalers are instantiated; only ``std_scaler`` is used in this cell.
std_scaler = StandardScaler()
rob_scaler = RobustScaler()

# Standardise 'amount' (the scaler expects a 2-D array), then flatten back to
# 1-D so the density plot receives a plain vector.
scaled_amount = std_scaler.fit_transform(data['amount'].values.reshape(-1, 1)).ravel()

# NOTE(review): log(x + 1) is undefined for scaled values <= -1 (they become
# NaN/-inf) — confirm the data never goes that low.
sns.kdeplot(np.log(scaled_amount + 1), bw=0.5)

In [12]:
"""
1. NEXT STEP IS TO CREATE FEATURE FROM THE DATA. FOR EXAMPLE :- WEEK FROM DATE, MONTH FROM DATE, ETC
2. THEN WE CAN PLOT A CORRELATION HEATMAP TO SEE THE COLUMNS WHICH ARE HIGHLY RELATED. WE CAN REMOVE COLUMN WHICH ARE 
HIGHLY CORRELATED
3. WE CAN ALSO USE VIF (VARIANCE INFLATION FACTOR) FOR THE COLUMNS
4. THEN WE CAN REMOVE THE UNWANTED COLUMNS FROM THE DATA
"""

In [13]:
# Separate the target from the predictors. The features must exclude the
# target column itself, otherwise the model is trained on its own label.
target = final_data[target_column]
feature = final_data.drop(columns=[target_column])

In [14]:
"""
Split the data into training and testing data
If the data is imbalanced then use SMOTE to balance the data

"""

from imblearn.over_sampling import SMOTE

# Stratified 70/30 split so both parts keep the class ratio of ``target``.
X_train, X_test, Y_train, Y_test = train_test_split(feature, target,
                                                    test_size = 0.3,
                                                    random_state = 0,
                                                    stratify=target)

# Oversample the minority class on the training part only, so the test set
# keeps the original class distribution. ``fit_resample`` is the current
# imbalanced-learn API (``fit_sample`` has been removed); the seed makes the
# synthetic samples reproducible, like the split above.
smote = SMOTE(random_state=0)
X_Smote_Train, Y_Smote_Train = smote.fit_resample(X_train, Y_train)



In [15]:
"""Use Logistic Regression Model for Classification using Class Weights """

# Class 1 is weighted ten times as heavily as class 0 in the loss.
logistic_class_weights = {0: 10, 1: 100}
logistic_regression_smote = LogisticRegression(
    class_weight=logistic_class_weights,
    random_state=12,
)
logistic_regression_smote.fit(X_Smote_Train, Y_Smote_Train)

# Table with one row per feature: fitted coefficient and the matching odds
# ratio (exp of the coefficient).
feature_names = pd.DataFrame(X_Smote_Train.columns)
fitted_coefficients = pd.DataFrame(logistic_regression_smote.coef_.T)
coefficients = pd.concat([feature_names, fitted_coefficients], axis=1)
coefficients.columns = ['Features', 'Coeff']
coefficients = coefficients.assign(odds=np.exp(coefficients['Coeff']))
coefficients

In [None]:
"""Use K-Fold to build model"""

from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold

# Shuffled 3-fold split with a fixed seed; each fold is scored with ROC AUC.
cv = KFold(n_splits=3, shuffle=True, random_state=2)
scores = cross_val_score(
    logistic_regression_smote,
    X_Smote_Train,
    Y_Smote_Train,
    scoring='roc_auc',
    cv=cv,
)


def printing_Kfold_scores(x_train_data,y_train_data):
    """Choose the LogisticRegression ``C`` with the best mean recall under 3-fold CV.

    For each candidate ``C`` an L2-penalised logistic regression is fitted on
    every training fold and scored with recall on the held-out fold. Progress
    and the winning value are printed.

    Parameters
    ----------
    x_train_data : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    y_train_data : pandas.Series
        Labels aligned row-for-row with ``x_train_data``.

    Returns
    -------
    float
        The value from ``[0.01, 0.1, 1, 10, 100]`` with the highest mean recall.
    """
    # scikit-learn only accepts random_state together with shuffle=True. The
    # splitter is local so the function no longer depends on a global ``cv``.
    fold = KFold(n_splits=3, shuffle=True, random_state=2)

    # Different C parameters
    c_param_range = [0.01, 0.1, 1, 10, 100]
    # One row per candidate; the score column starts as NaN (float dtype) so
    # idxmax() works once it has been filled in.
    results_table = pd.DataFrame({
        'C_parameter': c_param_range,
        'Mean recall score': np.nan,
    })

    for j, c_param in enumerate(c_param_range):
        print('C parameter: ', c_param)
        recall_accs = []
        for train_index, test_index in fold.split(x_train_data):
            X_fold_train, X_fold_test = x_train_data.iloc[train_index], x_train_data.iloc[test_index]
            Y_fold_train, Y_fold_test = y_train_data.iloc[train_index], y_train_data.iloc[test_index]
            print(X_fold_train.shape)
            print(Y_fold_train.shape)
            lr = LogisticRegression(C = c_param, penalty = 'l2')
            lr.fit(X_fold_train, Y_fold_train)
            fold_predictions = lr.predict(X_fold_test)
            recall_accs.append(recall_score(Y_fold_test, fold_predictions))

        mean_recall = np.mean(recall_accs)
        # Label-based assignment: the index is the default 0..n-1 range.
        results_table.loc[j, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    best_c = results_table.loc[results_table['Mean recall score'].idxmax(), 'C_parameter']

    # Finally, we can check which C parameter is the best amongst the chosen.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c

In [None]:
"""We can see from the model output that our model starts to perform better with SMOTE"""

# Hard class predictions of the SMOTE-trained model on the untouched test set.
pred_smote = logistic_regression_smote.predict(X_test)
print(classification_report(Y_test, pred_smote, digits=2))

In [None]:
"""ROC-AUC Curve for the model"""

from sklearn import metrics
import matplotlib.pyplot as plt

# Class probabilities on the test set; column 1 holds the probability of the
# positive class, which is what roc_curve needs as its score.
pred_smote_prob = logistic_regression_smote.predict_proba(X_test)
preds = pred_smote_prob[:,1]
y_test = np.array(Y_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# RANDOM FOREST MODEL

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# The target is a class label, so tune a classifier (RandomForestClassifier
# is the random-forest estimator imported at the top of the notebook).
rf = RandomForestClassifier(random_state=7)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50 , stop = 150, num = 25)]   # 25 evenly spaced values
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 10, num = 5)]  # evenly spaced values; the range can be changed
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,3,4,5,6,7,8,9,10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Method of selecting samples for training each tree
# (random forests have no ``learning_rate``; an unknown key makes the search fail)
bootstrap = [True, False]

# Create the random grid
params_r = {'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
random = RandomizedSearchCV(estimator=rf, param_distributions=params_r,
                            n_iter=100, cv=3, n_jobs=-1, random_state=7)

# Fit the random search model
# NOTE(review): fitted on the original training split; switch to
# X_Smote_Train / Y_Smote_Train if the forest should see the oversampled data.
random.fit(X_train, Y_train)
print(random.best_params_)

# Score of the refitted best estimator on the training and test splits.
print(random.score(X_train, Y_train))
print(random.score(X_test, Y_test))