In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

### Data Preprocessing functions

#### Data column transformation for regression

In [2]:
def data_col_transformation_regression(data, numerical_cols, categorical_cols, target, ohenc=0):
    """Build the feature matrix and label array for a dataset.

    Parameters
    ----------
    data : object indexable with ``.loc`` and ``[]`` (e.g. a pandas DataFrame)
        Source table.
    numerical_cols : list
        Columns copied into the feature matrix unchanged (continuous values,
        or two-category columns already stored as 0/1).
    categorical_cols : list
        Columns that are one-hot encoded and appended after the numerical ones.
    target : column label
        Column holding the values to predict.
    ohenc : fitted encoder, 0 or None, optional
        ``0`` (default) or ``None`` fits a new ``OneHotEncoder`` on ``data``.
        Any other value is treated as an already fitted encoder and only its
        ``transform`` method is called, so that e.g. a test set is encoded with
        the categories learned from the training set.

    Returns
    -------
    x_values : numpy.ndarray
        Numerical columns followed by the one-hot encoded columns.
    y_values : numpy.ndarray
        Values of the ``target`` column.
    encoder
        The encoder that produced the one-hot columns: the newly fitted one, or
        ``ohenc`` itself when an encoder was supplied.
    """
    # continuous numerical columns or two-category columns (0/1)
    x_values = data.loc[:, numerical_cols].values

    # one-hot encode categorical columns that have three or more categories
    fit_new_encoder = ohenc is None or ohenc == 0
    encoder = OneHotEncoder() if fit_new_encoder else ohenc
    categorical_values = data.loc[:, categorical_cols]
    if fit_new_encoder:
        encoded = encoder.fit_transform(categorical_values)
    else:
        encoded = encoder.transform(categorical_values)

    # encoders configured for dense output already return an ndarray
    new_columns = encoded.toarray() if hasattr(encoded, "toarray") else np.asarray(encoded)
    x_values = np.concatenate((x_values, new_columns), axis=1)

    # target column
    y_values = data[target].values

    # return the encoder that was actually used, never an unfitted placeholder
    return x_values, y_values, encoder

#### Removing outliers

In [3]:
def remove_outliers(data, cols):
    """Return ``data`` without the rows that are IQR outliers in ``cols``.

    Columns are processed one after another. For each column the quartiles are
    computed on the rows that survived the previous columns, and rows whose
    value lies below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are removed.
    Rows with a missing value in the column compare false against both fences
    and are therefore kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is not modified.
    cols : iterable of column labels
        Columns to check for outliers.

    Returns
    -------
    pandas.DataFrame
        The filtered table (``data`` itself when ``cols`` is empty).
    """
    for col in cols:
        quartile1 = data[col].quantile(0.25)
        quartile3 = data[col].quantile(0.75)
        inter_quartile_range = quartile3 - quartile1
        lower_fence = quartile1 - 1.5 * inter_quartile_range
        upper_fence = quartile3 + 1.5 * inter_quartile_range
        is_outlier = (data[col] < lower_fence) | (data[col] > upper_fence)
        # Filter by position with a boolean mask instead of dropping by index
        # label: with duplicate labels, drop() would also delete non-outlier
        # rows that happen to share a label with an outlier.
        data = data.loc[~is_outlier].copy()
    return data

### Plotting color map for classification for AdClick Dataset

In [4]:
from matplotlib.colors import ListedColormap

def plot_colormap(x_set, y_set, classifier, title, x_label='Age', y_label='Estimated Salary',
                  grid_step=0.01, colors=('red', 'blue')):
    """Plot the decision regions of a two-feature classifier with the observations on top.

    Parameters
    ----------
    x_set : numpy.ndarray
        Feature matrix; column 0 is drawn on the x axis and column 1 on the y axis.
    y_set : numpy.ndarray
        Class label of every row of ``x_set``. Each label is used as an index
        into ``colors``, so labels must be 0 .. ``len(colors) - 1``.
    classifier : object with a ``predict`` method
        Model whose predictions colour the background.
    title : str
        Figure title.
    x_label, y_label : str, optional
        Axis labels; the defaults match the AdClick dataset.
    grid_step : float, optional
        Spacing of the background grid, in feature units. Grid size grows with
        ``(feature range / grid_step) ** 2``.
    colors : sequence of colour names, optional
        One colour per class, used for both the regions and the points.
    """
    # grid of points covering the range of the two feature columns;
    # meshgrid returns the coordinate matrices of their cartesian product
    x1_grids, x2_grids = np.meshgrid(
        np.arange(x_set[:, 0].min(), x_set[:, 0].max(), grid_step),
        np.arange(x_set[:, 1].min(), x_set[:, 1].max(), grid_step))

    # feature matrix holding one row per grid point
    x_continuous_values = np.array([x1_grids.flatten(), x2_grids.flatten()]).T

    # colour the area on either side of the decision boundary with the
    # classifier's prediction for every grid point
    plt.contourf(x1_grids, x2_grids,
                 classifier.predict(x_continuous_values).reshape(x1_grids.shape),
                 alpha=0.6, cmap=ListedColormap(colors))

    # limit the axes to the extent of the grid
    plt.xlim(x1_grids.min(), x1_grids.max())
    plt.ylim(x2_grids.min(), x2_grids.max())

    # plot the actual observations, coloured by their true class
    for j in np.unique(y_set):
        # int() lets float-typed labels (0.0 / 1.0) index the colour sequence
        plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
                    c=colors[int(j)], label=j, s=6)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.title(title)
    plt.show()

### Plotting CAP curve for classifier model

In [5]:
def plot_cap_curve(actual_values, x_values, classifier, prob, x_label_name, y_label_name, classifier_model_name):
    """Draw the cumulative accuracy profile (CAP) of a classifier.

    Three curves are plotted: the random-model diagonal, the perfect-model
    curve and the curve of ``classifier``. Observations are ranked by their
    predicted probability for class 1 (highest first); an observation counts
    as a hit only when that probability is at least ``prob`` and its actual
    label is 1. The fraction of all class-1 observations captured within the
    first half of the ranking is printed.
    """
    total = len(actual_values)
    one_count = len(actual_values[actual_values == 1])

    # reference curves: random model (dashed diagonal) and perfect model
    plt.plot([0, total], [0, one_count], ls='--', c='red', label='Random Model', alpha=0.7)
    plt.plot([0, one_count, total], [0, one_count, one_count], c='green', label='Perfect Model', alpha=0.4)

    # probability of class 1 for every observation, paired with the actual
    # label and ranked from most to least probable
    positive_probabilities = classifier.predict_proba(x_values)[:, 1]
    ranked = sorted(zip(positive_probabilities, actual_values), reverse=True)

    # 1 for a correctly predicted class-1 observation, 0 otherwise
    hits = [
        1 if ranked[position][0] >= prob and ranked[position][1] == 1 else 0
        for position in range(total)
    ]

    # cumulative number of hits, starting from the origin
    y_axis_values = np.cumsum([0] + hits)
    plt.plot(np.arange(total + 1), y_axis_values, c='blue', label=classifier_model_name)

    # share of class-1 observations found in the first 50% of the ranking
    mid_value = y_axis_values[total // 2]
    print(f'Accuracy using CAP curve: {mid_value/one_count}')

    plt.xlabel(x_label_name)
    plt.ylabel(y_label_name)
    plt.legend()
    plt.title('Cumulative Accuracy Profile')
    plt.show()

### Function splitting a string feature into 8 separate single-character features

In [6]:
def separate_feature_column(data, feature, n_features=8):
    """Split every value of ``data[feature]`` into one column per character.

    Parameters
    ----------
    data : mapping or DataFrame
        ``data[feature]`` must be iterable and yield the values to split.
    feature : key or column label
        Name of the column holding the values.
    n_features : int, optional
        Number of elements every value must contain (default 8).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of values, n_features)``; row ``i`` holds the
        elements of the ``i``-th value. The dtype is inferred by NumPy from the
        elements. An empty input gives an empty ``(0, n_features)`` array.

    Raises
    ------
    ValueError
        If a value does not contain exactly ``n_features`` elements.
    """
    rows = []
    for value in data[feature]:
        characters = list(value)
        if len(characters) != n_features:
            raise ValueError(
                f"expected {n_features} characters in {feature!r} value, "
                f"got {len(characters)}: {value!r}"
            )
        rows.append(characters)

    if not rows:
        return np.empty((0, n_features))

    # build the array once instead of re-allocating it for every appended row
    return np.array(rows)

### Data cleansing in NLP for the English language

In [7]:
import re
import nltk

# PorterStemmer is used for stemming, i.e. reducing words to their root form
from nltk.stem import PorterStemmer
# module-level stemmer instance; get_corpus calls ps.stem() on every word it keeps
ps = PorterStemmer()

# getting stopwords
from nltk.corpus import stopwords
 
# Kept only so that code referring to the module-level name keeps working;
# get_corpus builds and returns its own list and never appends to this one.
corpus = []
def get_corpus(data, col):
    """Return the cleaned, stemmed text of every entry in ``data[col]``.

    Each text is processed as follows: every character that is not an ASCII
    letter is replaced by a space, the text is lower-cased and split on
    whitespace, English stop words are removed, the remaining words are
    stemmed with ``ps`` and joined back with single spaces.

    A new list is built on every call, so calling the function for several
    datasets (e.g. train and test) does not mix their documents. The texts are
    read in row order, independent of the index labels of ``data``.

    Parameters
    ----------
    data : mapping or DataFrame
        ``data[col]`` must be iterable and yield strings.
    col : key or column label
        Name of the text column.

    Returns
    -------
    list of str
        One cleaned document per input text, in input order.
    """
    # build the stop-word set once instead of once per word
    english_stopwords = set(stopwords.words('english'))

    cleaned_texts = []
    for raw_text in data[col]:
        words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
        stems = [ps.stem(word) for word in words if word not in english_stopwords]
        cleaned_texts.append(" ".join(stems))

    return cleaned_texts

### Calculating AUC while comparing different classification models

In [8]:
from sklearn.metrics import auc
# calculating area under curve
def calculate_auc(model, fpr, tpr):
    """Print and return the area under the curve defined by ``fpr`` and ``tpr``.

    Parameters
    ----------
    model : str or any printable object
        Name of the model, used only in the printed line.
    fpr, tpr : array-like
        x and y coordinates of the curve, passed to ``auc`` unchanged.

    Returns
    -------
    The value returned by ``auc(fpr, tpr)``, so that models can be compared
    programmatically and not only by reading the printed output.
    """
    roc_auc = auc(fpr, tpr)
    # f-string instead of concatenation so a non-string model name does not raise
    print(f'AUC {model}:', roc_auc)
    return roc_auc