# 1. IMPORTING

In [None]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sb
import os

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from fast_ml.feature_selection import get_duplicate_features
from fast_ml.feature_selection import get_constant_features

In [None]:
# import dataset
# Load the raw CSV into a DataFrame; the path is relative to the current
# working directory.
df = pd.read_csv("../data/source/dataset.csv")

# 2. SPLITTING

In [None]:
# label column that the models learn to predict
y = df['target']

# feature matrix: every column of the dataset except the label
X = df.drop('target', axis=1)

In [None]:
# getting train and test sets
# First split: 70% of the rows go to training, 30% are held out.
# Stratifying on y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# getting test and validation sets
# Second split of the held-out 30%: 70% of it stays as the test set and 30%
# becomes the validation set (roughly 21% / 9% of the full dataset).
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, stratify=y_test, test_size=0.3, random_state=42
)

In [None]:
# checking shapes of X's
# order: full feature matrix, train, test, validation
print(X.shape, X_train.shape, X_test.shape, X_val.shape)

In [None]:
# checking shapes of y's
# order: full target, train, test, validation
print(y.shape, y_train.shape, y_test.shape, y_val.shape)

# EXPORTING

# exporting data
# Write the six split objects (features and targets for train / test /
# validation) to CSV files inside `relative_path`.
relative_path = "../data/validation"

# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of os.path.exists() followed by os.makedirs()
os.makedirs(relative_path, exist_ok=True)

X_train.to_csv(relative_path + '/train_X.csv', encoding='utf-8')
X_test.to_csv(relative_path + '/test_X.csv', encoding='utf-8')
X_val.to_csv(relative_path + '/val_X.csv', encoding='utf-8')

y_train.to_csv(relative_path + '/train_y.csv', encoding='utf-8')
y_test.to_csv(relative_path + '/test_y.csv', encoding='utf-8')
y_val.to_csv(relative_path + '/val_y.csv', encoding='utf-8')

# 3. DATASET INFO

In [None]:
# first five rows of the training features
X_train.head()

In [None]:
# first five rows of the training target
y_train.head()

In [None]:
# shape of the training features as (rows, columns)
X_train.shape

In [None]:
# info
# verbose=True lists every column; show_counts=True adds the non-null count
X_train.info(verbose=True, show_counts=True)

In [None]:
# description
# summary statistics of the training feature columns
X_train.describe()

# 4. TRANSFORMATIONS

## 4.1. NAs

In [None]:
def nan_handler(df):
    """Replace missing values in every column with that column's median.

    The frame is modified in place and also returned, so the function can be
    used either for its side effect or in an expression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all support ``Series.median()``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its NAs filled.
    """
    for column in df.columns:
        # Series.median() ignores missing values by default
        median = df[column].median()

        # Assign the filled column back instead of calling
        # ``df[column].fillna(..., inplace=True)``: that form operates on an
        # intermediate object and leaves ``df`` untouched when pandas
        # Copy-on-Write is active.
        df[column] = df[column].fillna(median)

    print("NAs handled")
    return df

## 4.2. OUTLIERS

In [None]:
from scipy.stats import zscore

def outliers_handler(df, threshold=2.5):
    """Clip outliers in every column to the nearest non-outlier value.

    A value is an outlier when its absolute z-score within the column is
    greater than ``threshold``. Outliers above the mean of the remaining
    values are replaced with the largest remaining value above that mean;
    outliers below it are replaced with the smallest remaining value below
    it. If no such value exists, the mean itself is used.

    Missing values are ignored when computing the statistics and are left
    untouched, so the function can run before NAs are imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; modified in place.
    threshold : float, default 2.5
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with outliers replaced.
    """
    for position in range(len(df.columns)):
        series = df.iloc[:, position]

        # population standard deviation (ddof=0), computed with NaNs skipped
        std = series.std(ddof=0)
        if not std > 0:
            # constant, empty or all-NaN column: no z-scores to speak of
            continue

        z_scores = ((series - series.mean()) / std).abs()
        # NaN z-scores compare as False, so missing values are never outliers
        outliers = (z_scores > threshold).to_numpy()
        if not outliers.any():
            continue

        values = series.to_numpy()
        inliers = values[~outliers]

        # mean of the column excluding outliers (and NaNs)
        mean = np.nanmean(inliers)

        # The replacement is the same for every outlier on a given side of
        # the mean, so compute both caps once instead of once per outlier.
        above = inliers[inliers > mean]
        below = inliers[inliers < mean]
        upper_cap = above.max() if above.size else mean
        lower_cap = below.min() if below.size else mean

        # build both masks before writing anything back into the frame
        high = outliers & (values > mean)
        low = outliers & ~high

        if high.any():
            df.iloc[high, position] = upper_cap
        if low.any():
            df.iloc[low, position] = lower_cap

    print("outliers handled")
    return df

## 4.3. STANDARDIZATION

In [None]:
from sklearn.preprocessing import StandardScaler

# scaler imported from sklearn instead of self written code
# The scaler is fitted on the training features only.
scaler = StandardScaler().fit(X_train)
# NOTE(review): the transformed array is not assigned to anything, so X_train
# itself stays unscaled; presumably this line only displays the result as the
# cell output — confirm whether X_train was meant to be overwritten.
scaler.transform(X_train)

In [None]:
def standarization_handler(df, fitted_scaler=None):
    """Scale ``df`` in place with an already fitted scaler.

    Previously the transformed values were computed and then thrown away, so
    the frame was never actually scaled. They are now written back into
    ``df``, and the frame is returned like the other handlers do.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; it must have the columns the scaler was fitted on.
    fitted_scaler : object, optional
        Any object exposing ``transform(df)`` that returns an array shaped
        like ``df``. Defaults to the module-level ``scaler``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object holding the scaled values.
    """
    if fitted_scaler is None:
        fitted_scaler = scaler

    # np.asarray makes the write-back positional even if the transformer
    # returns a DataFrame rather than a plain array
    scaled = np.asarray(fitted_scaler.transform(df))

    # assigning whole columns lets integer columns become float as needed
    df[df.columns] = scaled
    return df
        

# 5. FEATURE NUMBER REDUCTION

## 5.1. DUPLICATES

In [None]:
def drop_duplicate_features(df):
    """Drop, in place, the columns reported as duplicates of another column.

    The names to drop are read from the 'feature2' column of the report
    returned by ``get_duplicate_features``, keeping only the rows whose
    'Desc' equals 'Duplicate Values'. Returns the same ``df`` object.
    """
    report = get_duplicate_features(df)

    # rows of the report that describe value-identical column pairs
    is_duplicate = report['Desc'] == 'Duplicate Values'
    redundant_columns = report.loc[is_duplicate, 'feature2'].tolist()

    df.drop(columns=redundant_columns, inplace=True)

    print("duplicate features dropped")
    return df

## 5.2. CONSTANTS

In [None]:
def drop_constant_features(df):
    """Drop, in place, the columns reported as (quasi-)constant.

    The names to drop are read from the 'Var' column of the report returned
    by ``get_constant_features``, keeping only the rows whose 'Perc' is
    greater than 98. Returns the same ``df`` object.
    """
    report = get_constant_features(df)

    # presumably 'Perc' is the share (in %) of the most frequent value —
    # TODO confirm against the fast_ml documentation
    above_cutoff = report['Perc'] > 98
    near_constant_columns = report.loc[above_cutoff, 'Var'].tolist()

    df.drop(columns=near_constant_columns, inplace=True)

    print("constant features dropped")
    return df

## 5.3. CORRELATED

In [None]:
def drop_correlated_features(df, threshold=0.8):
    """Drop, in place, columns highly correlated with an earlier column.

    Absolute Spearman correlations are computed for every pair of columns.
    Only the upper triangle (without the diagonal) is inspected, so of two
    correlated columns the one appearing later in ``df`` is the one dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; modified in place.
    threshold : float, default 0.8
        A column is dropped when its absolute correlation with any earlier
        column is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object without the correlated columns.
    """
    corr = df.corr(method="spearman").abs()

    # keep each pair once: mask out the diagonal and the lower triangle
    upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    upper_triangle = corr.where(upper_mask)

    # masked cells are NaN, and NaN > threshold is False
    corr_features = [
        col
        for col in upper_triangle.columns
        if (upper_triangle[col] > threshold).any()
    ]

    # a plain list is enough for drop(); no need to wrap it in a Series
    df.drop(columns=corr_features, inplace=True)

    print("correlated features dropped")
    return df

# 6. MODELS 

## 6.1 Test data preparing

In [None]:
def prepare_dataset(df, consistent_cols=None):
    """Run the preprocessing steps on ``df`` in place.

    Column selection happens first, then outlier clipping, NA imputation and
    standardization, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preprocess; modified in place.
    consistent_cols : collection of column labels, optional
        When given and non-empty, every column of ``df`` not contained in it
        is dropped. When omitted or empty, duplicate, constant and correlated
        features are dropped instead.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # ``None`` replaces the former mutable ``[]`` default. ``len`` is used
    # rather than truthiness because callers pass a pandas Index, whose truth
    # value is ambiguous.
    if consistent_cols is None or len(consistent_cols) == 0:
        drop_duplicate_features(df)
        print("duplicated featured deleted")
        drop_constant_features(df)
        print("constant features dropped")
        drop_correlated_features(df)
        print("correlated features deleted")
    else:
        to_drop = [col for col in df.columns if col not in consistent_cols]
        df.drop(columns=to_drop, inplace=True)
        print("columns consistent")
    outliers_handler(df)
    print("outliers removed")
    nan_handler(df)
    print("nan removed")
    standarization_handler(df)
    print("dataset standaraized")
    # The former trailing call to ``nan_removing(df)`` was removed: no function
    # of that name is defined in this notebook, so every call ended in a
    # NameError. NAs are already handled by ``nan_handler`` above.
    return df

# Preprocess the test features, restricting them to the columns of X_train.
prepare_dataset(X_test, X_train.columns)
print(X_test.shape, "TEST dataset")
print(X_train.shape, "TRAIN dataset")

In [None]:
# Make sure X_test carries no column that X_train lacks
print(X_test.columns)
print(X_train.columns)
# Build the list once: a filter() iterator is exhausted after its first use,
# so printing it after the drop always showed an empty list.
to_drop = [col for col in X_test.columns if col not in X_train.columns]
X_test.drop(columns=to_drop, inplace=True)
print(to_drop)

## 6.2 Evaluating Models

In [None]:
# importing ROC-AUC score to evaluate models
from sklearn.metrics import roc_auc_score

# importing ROC curve to visualize ROC curve
from sklearn.metrics import roc_curve

# matplotlib to draw plots of ROC-AUC curve
import matplotlib.pyplot as plt

# confusion matrix to show TP, TN, FP, FN
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# classification report to describe model accuracy
from sklearn.metrics import classification_report

In [None]:
def model_evaluation(model, X_df, y_df, model_name=None):
    """Plot the ROC curve and print quality metrics of a fitted classifier.

    The ROC plot is saved to a file named 'ROC' and shown. The classification
    report, the model's ``score`` and the ROC-AUC are then printed, followed
    by a confusion-matrix plot.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict``, ``predict_proba`` and ``score``.
    X_df : feature matrix to evaluate on.
    y_df : true labels for ``X_df``; the positive class is labelled 1.
    model_name : str, optional
        Name used in the plot legend and the printed score line. Defaults to
        the class name of ``model``.
    """
    if model_name is None:
        model_name = type(model).__name__

    y_pred_proba = model.predict_proba(X_df)
    y_pred = model.predict(X_df)
    # second column of predict_proba: the probability used as the ROC score
    positive_proba = y_pred_proba[:, 1]

    # ROC curve of the model, computed against the labels passed in (the
    # global y_test was used here before, which is wrong for any other split)
    fpr_model, tpr_model, _ = roc_curve(y_df, positive_proba, pos_label=1)

    # reference curve for tpr = fpr, obtained from a constant score
    random_probs = [0] * len(y_df)
    p_fpr, p_tpr, _ = roc_curve(y_df, random_probs, pos_label=1)

    # the 'seaborn' style was renamed to 'seaborn-v0_8' in newer matplotlib
    # releases; use whichever name this installation provides
    for style_name in ('seaborn-v0_8', 'seaborn'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    # plot roc curves
    plt.plot(fpr_model, tpr_model, linestyle='--', color='orange', label=model_name)
    plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')

    plt.legend(loc='best')
    # NOTE: every call writes to the same file name, overwriting the last plot
    plt.savefig('ROC', dpi=300)
    plt.show()

    # auc score
    auc_score_model = roc_auc_score(y_df, positive_proba)

    print("Classification report: ", "\n", classification_report(y_df, y_pred))
    print(model_name + " score: ", model.score(X_df, y_df))
    print("AUC score: ", auc_score_model)

    ConfusionMatrixDisplay.from_estimator(model, X_df, y_df)
    plt.show()

## 6.3 K Neighbors model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier voting over the 4 closest training samples
KN_model = KNeighborsClassifier(n_neighbors=4)
KN_model.fit(X_train, y_train)

### EVALUATION OF KNEIGHBORS MODEL

In [None]:
# evaluate the fitted K-neighbours model on the test split
model_evaluation(KN_model, X_test, y_test)

## 6.4 Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# logistic regression with the library's default settings
LR_model = LogisticRegression()
LR_model.fit(X_train, y_train)

### EVALUATION OF LOGISTIC REGRESSION MODEL

In [None]:
# evaluate the fitted logistic regression model on the test split
model_evaluation(LR_model, X_test, y_test)

## 6.5 Random Forest

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Random forest of 100 trees trained with 2 parallel jobs. random_state is
# fixed so that the fitted forest, and the feature importances read from it
# later in the notebook, are reproducible between runs, like the other seeded
# models here.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=42)
rf_model.fit(X_train, y_train)

### EVALUATION OF RANDOM FOREST MODEL

In [None]:
# evaluate the fitted random forest on the test split
model_evaluation(rf_model, X_test, y_test)

### VERIFYING FEATURE IMPORTANCE IN RANDOM FOREST MODEL

In [None]:
# Get feature importances and sort them in descending order
importances = rf_model.feature_importances_
sorted_indices = importances.argsort()[::-1]
# the importances are aligned with the columns the forest was fitted on,
# i.e. those of X_train
columns = X_train.columns

# Identify non-predictive columns (importance below 5%)
non_predictive_cols = [
    columns[i] for i in sorted_indices if importances[i] < 0.05
]

# Remove non-predictive columns from the training features, then select
# exactly the same columns, in the same order, from the test features
reduced_X_train = X_train.drop(columns=non_predictive_cols)
reduced_X = X_test[reduced_X_train.columns]

# Train a new random forest classifier on the reduced TRAINING data. Fitting
# on the test split (as was done before) and then evaluating on that same
# split leaks the labels and reports an over-optimistic score.
rf_model_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_reduced.fit(reduced_X_train, y_train)

### 2ND EVALUATION OF RANDOM FOREST MODEL

In [None]:
# evaluate the random forest built on the reduced feature set
model_evaluation(rf_model_reduced, reduced_X, y_test)

## 6.6 Gradient Boosting model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification

In [None]:
# train a Gradient Boosting classifier
# random_state is fixed so repeated runs give the same model
GB_clf = GradientBoostingClassifier(random_state=42)
GB_clf.fit(X_train, y_train)

### EVALUATION OF GRADIENT BOOSTING MODEL

In [None]:
# evaluate the fitted gradient boosting model on the test split
model_evaluation(GB_clf, X_test, y_test)

## 6.7 XGBoost model

In [None]:
from xgboost import XGBClassifier
from sklearn.datasets import make_classification

# train an XGBoost classifier
# random_state is fixed so repeated runs give the same model
XGB_model = XGBClassifier(random_state=42)
XGB_model.fit(X_train, y_train)

### EVALUATION OF XGBOOST MODEL

In [None]:
# evaluate the fitted XGBoost model on the test split
model_evaluation(XGB_model, X_test, y_test)

# EXPORTING

# exporting preprocessed dataset to csv
relative_path = "../data/preprocessed"

# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of os.path.exists() followed by os.makedirs()
os.makedirs(relative_path, exist_ok=True)

# NOTE(review): only X_train is written here, and no cell in this notebook
# visibly passes X_train through prepare_dataset — confirm that the exported
# frame really is the preprocessed one.
X_train.to_csv(relative_path +'/train_dataset.csv', encoding='utf-8')