In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

### Data Preprocessing functions

#### Data column transformation for regression

In [2]:
def data_col_transformation_regression(data, numerical_cols, categorical_cols, target, ohenc=0):
    """Build the feature matrix and label array for a regression dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Source dataset.
    numerical_cols : list
        Columns copied into the feature matrix unchanged (continuous values
        or binary 0/1 columns).
    categorical_cols : list
        Columns that are one-hot encoded (intended for columns with three or
        more categories).
    target : label
        Name of the target column.
    ohenc : encoder, 0 or None, default 0
        ``0`` / ``None`` fits a new ``OneHotEncoder`` on ``data``.  Any other
        value is treated as an already fitted encoder and only its
        ``transform`` method is applied (e.g. reuse the encoder fitted on the
        training set when transforming the test set).

    Returns
    -------
    x_values : numpy.ndarray
        Numerical columns followed by the one-hot encoded columns.
    y_values : numpy.ndarray
        Values of the target column.
    encoder
        The encoder that produced the one-hot columns: the newly fitted one,
        or ``ohenc`` itself when one was supplied.
    """
    # continuous numerical columns or binary (0/1) columns are used as they are
    x_values = data.loc[:, numerical_cols].values

    # 0 (the historical default) and None both mean "fit a fresh encoder"
    fit_new_encoder = ohenc is None or (isinstance(ohenc, int) and ohenc == 0)
    if fit_new_encoder:
        encoder = OneHotEncoder()
        encoded_columns = encoder.fit_transform(data.loc[:, categorical_cols]).toarray()
    else:
        # reuse the caller's fitted encoder so the categories stay consistent
        encoder = ohenc
        encoded_columns = encoder.transform(data.loc[:, categorical_cols]).toarray()
    x_values = np.append(x_values, encoded_columns, axis=1)

    # target column
    y_values = data[target].values

    # return the encoder that was actually used, never an unfitted placeholder
    return x_values, y_values, encoder

#### Removing outliers

In [3]:
def remove_outliers(data, cols):
    """Drop rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are handled one after another, so the quartiles of each column
    are computed on the rows that survived the previous columns.  Rows are
    removed by index label and a new DataFrame is returned.
    """
    for column in cols:
        q1, q3 = data[column].quantile(0.25), data[column].quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        is_outlier = (data[column] < lower_fence) | (data[column] > upper_fence)
        # collect the index labels of the offending rows and drop them
        outlier_labels = data.index[is_outlier.to_numpy()]
        data = data.drop(outlier_labels)
    return data

#### Scaling of data

#### Standardisation

In [4]:
# standardising data (z-score normalisation)
def standardisation(data):
    """Return a z-score standardised copy of ``data``.

    Every column is transformed to ``(value - mean) / std`` using the
    column's own mean and pandas' default ``Series.std()``.  The input
    DataFrame is left untouched; a new DataFrame is returned.

    Columns whose standard deviation is zero or undefined (e.g. constant
    columns) are only centred, which avoids a division by zero.
    """
    scaled = data.copy()   # never overwrite the caller's DataFrame
    for col in data.columns:
        mean = data[col].mean()
        std_deviation = data[col].std()
        centred = data[col] - mean
        if std_deviation > 0:
            scaled[col] = centred / std_deviation
        else:
            # zero or NaN spread: dividing would produce NaN/inf
            scaled[col] = centred

    return scaled

#### MinMax Normalisation 

In [5]:
# scaling all columns into the range 0-1
def min_max_normalisation(data):
    """Return a copy of ``data`` with every column rescaled to the range 0-1.

    Each column is transformed to ``(value - min) / (max - min)`` using its
    own minimum and maximum.  The input DataFrame is left untouched; a new
    DataFrame is returned.

    Columns whose range is zero or undefined (e.g. constant columns) are
    only shifted by their minimum, which avoids a division by zero.
    """
    scaled = data.copy()   # never overwrite the caller's DataFrame
    for col in data.columns:
        minimum = data[col].min()
        maximum = data[col].max()
        value_range = maximum - minimum
        shifted = data[col] - minimum
        if value_range > 0:
            scaled[col] = shifted / value_range
        else:
            # constant column: 0/0 would give NaN, keep the shifted zeros
            scaled[col] = shifted

    return scaled

#### Splitting dataset into train & test set

In [6]:
# Splitting the dataset into a train set (70 % by default) & a test set (the remainder)
def splitting_dataset(data, train_frac=0.7, random_state=3):
    """Randomly split ``data`` into a train set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to split.
    train_frac : float, default 0.7
        Fraction of the rows sampled into the train set.
    random_state : default 3
        Seed forwarded to ``DataFrame.sample``; choosing a different value
        selects different random rows.

    Returns
    -------
    (train, test)
        ``train`` holds the sampled rows, ``test`` the rows of ``data`` whose
        index labels were not sampled.

    Notes
    -----
    The test set is built by dropping the train index labels, so this
    assumes index labels are unique — TODO confirm for callers with
    duplicated labels.
    """
    # selecting a random fraction of the dataset as train set
    train = data.sample(frac=train_frac, random_state=random_state)
    # selecting the remaining rows as test set
    test = data.drop(train.index)
    return train, test

### Categorising functions for different algorithms

In [7]:
# turning logistic regression outputs into class labels
def categorise_logistic(y_values):
    """Map each value to 1 when it is at least 0.5, otherwise to 0."""
    meets_threshold = y_values >= 0.5
    return np.where(meets_threshold, 1, 0)

In [8]:
def categorise_knn(y_values):
    """Map each value that compares equal to True to 1, everything else to 0."""
    # elementwise comparison on purpose: truthiness casting would differ
    # for non-boolean input (e.g. 2 is truthy but 2 == True is False)
    equals_true = (y_values == True)
    return np.where(equals_true, 1, 0)

### Evaluation functions

#### Root mean squared error (RMSE)

In [9]:
def root_mean_sqr_error(predicted_values, actual_values):
    """Return the square root of the mean squared prediction error."""
    residuals = predicted_values - actual_values
    squared_residuals = residuals ** 2
    return np.sqrt(squared_residuals.mean())

#### R^2 score (Coefficient of Determination)

In [10]:
def r_sqr_score(predicted_values, actual_values):
    """Return the R^2 score: 1 - (residual sum of squares / total sum of squares)."""
    # squared deviations of the actual values from their own mean
    deviations = actual_values - actual_values.mean()
    total_sum_squares = (deviations ** 2).sum()
    # squared prediction errors
    residuals = predicted_values - actual_values
    residual_sum_squares = (residuals ** 2).sum()
    return 1 - residual_sum_squares / total_sum_squares

#### Regression accuracy

In [11]:
def accuracy_regression(predicted_values, actual_values):
    """Return the regression accuracy in percent: 100 * (1 - mean relative error).

    The relative error of each prediction is ``|predicted - actual| / |actual|``.
    The magnitude of the actual value is used as the denominator so that
    negative targets cannot produce negative errors (and an accuracy above
    100 %).  Actual values equal to zero still lead to a division by zero.
    """
    relative_error = abs(predicted_values - actual_values) / abs(actual_values)
    return (1 - relative_error.mean()) * 100        # percentage of accuracy

#### Classification accuracy

In [12]:
def accuracy_classification(predicted_values, actual_values):
    """Return the percentage of predictions that equal the actual labels."""
    is_correct = predicted_values == actual_values
    return is_correct.mean() * 100