In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [None]:
# Mount Google Drive only when the notebook actually runs on Colab.
# Outside Colab the google.colab package is not importable, and an unconditional
# import would abort "Restart & Run All"; `drive` stays defined (as None) either way.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

In [None]:
# Execute the two companion notebooks inside this kernel.
# Names used below but not defined in this notebook (questions_map_EN, perform_mapping,
# perform_logit, perform_random_forest, perform_decision_tree) are presumably defined
# there — TODO confirm which notebook owns which name.
%run Mappings.ipynb
%run Helpers.ipynb

In [None]:
# Prefix prepended to the dataset path when the CSV is loaded.
# The empty string makes that path relative to the current working directory.
# Alternative kept for reference (a folder under the '/content/drive' mount point):
# DIR = '/content/drive/MyDrive/DentistDataAnalysis/Experiments/'
DIR = ''

In [None]:
# Load <DIR>dataset/final_data.csv into `data`; the bare name on the last line
# renders the frame as the cell output.
data = pd.read_csv(f'{DIR}dataset/final_data.csv')
data

In [None]:
# Column names handed to perform_logit: 21 feature names and the single target name.
X_cols = [
    'Que1', 'Que2', 'Que3', 'Que4', 'Que5', 'Que6',
    'Que10_a', 'Que10_b', 'Que10_c', 'Que10_d', 'Que10_e', 'Que10_f',
    'Que14', 'Que15', 'Que17', 'Que18_age',
    'Que19', 'Que20', 'Que21', 'Que22',
    'Que_smoking',
]
y_col = ['Que16']

# Strategy 1 - dropping all rows with missing values

In [None]:
# Strategy 1: keep only rows without any missing value, renumber them from 0,
# then run the result through perform_mapping with questions_map_EN.
data_dropped = (
    data
    .dropna()
    .reset_index(drop=True)
    .pipe(perform_mapping, questions_map_EN)
)

# Target is the 'Que16' column; the features are every remaining column.
# NOTE(review): the feature matrix is "all columns except Que16", while X_cols is a
# separate hard-coded list — confirm both name the same columns in the same order.
target_data = data_dropped['Que16']
model_data = data_dropped.drop(columns=['Que16'])

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    model_data.values,
    target_data.values,
    test_size=0.20,
    random_state=42,
)

## Logistic Regression

### No balance

In [None]:
# Strategy 1 (rows with missing values dropped): logistic regression, no balance argument.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
)

### Undersampling

In [None]:
# Strategy 1 (rows with missing values dropped): logistic regression, balance='under'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 1 (rows with missing values dropped): logistic regression, balance='over'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 1 (rows with missing values dropped): logistic regression, balance='smoteen'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='smoteen',
)

## Random Forest

### No balance

In [None]:
# Strategy 1 (rows with missing values dropped): random forest, no balance argument.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
)

### Undersampling

In [None]:
# Strategy 1 (rows with missing values dropped): random forest, balance='under'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 1 (rows with missing values dropped): random forest, balance='over'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 1 (rows with missing values dropped): random forest, balance='smoteen'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='smoteen',
)

## Decision Tree

### No balance

In [None]:
# Strategy 1 (rows with missing values dropped): decision tree, no balance argument.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
)

### Undersampling

In [None]:
# Strategy 1 (rows with missing values dropped): decision tree, balance='under'.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 1 (rows with missing values dropped): decision tree, balance='over'.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 1 (rows with missing values dropped): decision tree, SMOTEENN section.
# NOTE(review): the key here is upper-case 'SMOTEEN', while the logistic-regression and
# random-forest SMOTEENN cells pass 'smoteen' — confirm perform_decision_tree accepts this spelling.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='SMOTEEN',
)

# Strategy 2 - median

In [None]:
# Strategy 2: median imputation of missing feature values.
#
# Rows without a 'Que16' label are dropped instead of imputed, so no target value
# is fabricated for training or evaluation.
labeled_rows = data.dropna(subset=['Que16'])

# Split BEFORE fitting the imputer: the medians must come from the training rows
# only, otherwise test-set statistics leak into the training data.
train_rows, test_rows = train_test_split(labeled_rows, test_size=0.20, random_state=42)

imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# Fresh 0..n-1 indexes are used on purpose, matching the frames perform_mapping
# receives elsewhere in this notebook.
train_imputed = pd.DataFrame(imputer.fit_transform(train_rows), columns=data.columns)
test_imputed = pd.DataFrame(imputer.transform(test_rows), columns=data.columns)

# NOTE(review): assumes perform_mapping treats each row independently, so mapping the
# two parts separately is equivalent to mapping one frame — confirm in the helper notebook.
# NOTE(review): a median can be a half-integer (e.g. 2.5); confirm perform_mapping copes.
train_mapped = perform_mapping(train_imputed, questions_map_EN)
test_mapped = perform_mapping(test_imputed, questions_map_EN)

# Combined frames are kept under their original names (train rows first, then test rows).
data_median = pd.concat([train_mapped, test_mapped], ignore_index=True)
model_data = data_median.drop(columns=['Que16'])
target_data = data_median['Que16']

X_train = train_mapped.drop(columns=['Que16']).values
X_test = test_mapped.drop(columns=['Que16']).values
y_train = train_mapped['Que16'].values
y_test = test_mapped['Que16'].values

## Logistic Regression

### No balance

In [None]:
# Strategy 2 (median imputation): logistic regression, no balance argument.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
)

### Undersampling

In [None]:
# Strategy 2 (median imputation): logistic regression, balance='under'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 2 (median imputation): logistic regression, balance='over'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 2 (median imputation): logistic regression, balance='smoteen'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='smoteen',
)

## Random Forest

### No balance

In [None]:
# Strategy 2 (median imputation): random forest, no balance argument.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
)

### Undersampling

In [None]:
# Strategy 2 (median imputation): random forest, balance='under'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 2 (median imputation): random forest, balance='over'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 2 (median imputation): random forest, balance='smoteen'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='smoteen',
)

## Decision Tree

### No balance

In [None]:
# Strategy 2 (median imputation): decision tree, no balance argument.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
)

### Undersampling

In [None]:
# Strategy 2 (median imputation): decision tree, balance='under'.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 2 (median imputation): decision tree, balance='over'.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 2 (median imputation): decision tree, SMOTEENN section.
# NOTE(review): the key here is upper-case 'SMOTEEN', while the logistic-regression and
# random-forest SMOTEENN cells pass 'smoteen' — confirm perform_decision_tree accepts this spelling.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='SMOTEEN',
)

# Strategy 3 - mean

In [None]:
# Strategy 3: mean imputation of missing feature values.
#
# Rows without a 'Que16' label are dropped instead of imputed, so no target value
# is fabricated for training or evaluation.
labeled_rows = data.dropna(subset=['Que16'])

# Split BEFORE fitting the imputer: the means must come from the training rows
# only, otherwise test-set statistics leak into the training data.
train_rows, test_rows = train_test_split(labeled_rows, test_size=0.20, random_state=42)

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fresh 0..n-1 indexes are used on purpose, matching the frames perform_mapping
# receives elsewhere in this notebook.
train_imputed = pd.DataFrame(imputer.fit_transform(train_rows), columns=data.columns)
test_imputed = pd.DataFrame(imputer.transform(test_rows), columns=data.columns)

# Imputed means are generally not whole numbers, so every value is rounded to the
# nearest whole number before mapping (the dtype stays float).
train_imputed = train_imputed.round()
test_imputed = test_imputed.round()

# NOTE(review): assumes perform_mapping treats each row independently, so mapping the
# two parts separately is equivalent to mapping one frame — confirm in the helper notebook.
train_mapped = perform_mapping(train_imputed, questions_map_EN)
test_mapped = perform_mapping(test_imputed, questions_map_EN)

# Combined frames are kept under their original names (train rows first, then test rows).
data_mean = pd.concat([train_mapped, test_mapped], ignore_index=True)
model_data = data_mean.drop(columns=['Que16'])
target_data = data_mean['Que16']

X_train = train_mapped.drop(columns=['Que16']).values
X_test = test_mapped.drop(columns=['Que16']).values
y_train = train_mapped['Que16'].values
y_test = test_mapped['Que16'].values

## Logistic Regression

### No balance

In [None]:
# Strategy 3 (mean imputation): logistic regression, no balance argument.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
)

### Undersampling

In [None]:
# Strategy 3 (mean imputation): logistic regression, balance='under'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 3 (mean imputation): logistic regression, balance='over'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 3 (mean imputation): logistic regression, balance='smoteen'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='smoteen',
)

## Random Forest

### No balance

In [None]:
# Strategy 3 (mean imputation): random forest, no balance argument.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
)

### Undersampling

In [None]:
# Strategy 3 (mean imputation): random forest, balance='under'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 3 (mean imputation): random forest, balance='over'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 3 (mean imputation): random forest, balance='smoteen'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='smoteen',
)

## Decision Tree

### No balance

In [None]:
# Strategy 3 (mean imputation): decision tree, no balance argument.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
)

### Undersampling

In [None]:
# Strategy 3 (mean imputation): decision tree, balance='under'.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 3 (mean imputation): decision tree, balance='over'.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 3 (mean imputation): decision tree, SMOTEENN section.
# NOTE(review): the key here is upper-case 'SMOTEEN', while the logistic-regression and
# random-forest SMOTEENN cells pass 'smoteen' — confirm perform_decision_tree accepts this spelling.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='SMOTEEN',
)

# Strategy 4 - most frequent

In [None]:
# Strategy 4: most-frequent-value imputation of missing feature values.
#
# Rows without a 'Que16' label are dropped instead of imputed, so no target value
# is fabricated for training or evaluation.
labeled_rows = data.dropna(subset=['Que16'])

# Split BEFORE fitting the imputer: the most frequent values must come from the
# training rows only, otherwise test-set statistics leak into the training data.
train_rows, test_rows = train_test_split(labeled_rows, test_size=0.20, random_state=42)

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# Fresh 0..n-1 indexes are used on purpose, matching the frames perform_mapping
# receives elsewhere in this notebook. No rounding step is applied here, as in
# the original cell.
train_imputed = pd.DataFrame(imputer.fit_transform(train_rows), columns=data.columns)
test_imputed = pd.DataFrame(imputer.transform(test_rows), columns=data.columns)

# NOTE(review): assumes perform_mapping treats each row independently, so mapping the
# two parts separately is equivalent to mapping one frame — confirm in the helper notebook.
train_mapped = perform_mapping(train_imputed, questions_map_EN)
test_mapped = perform_mapping(test_imputed, questions_map_EN)

# Combined frames are kept under their original names (train rows first, then test rows).
data_most_frequent = pd.concat([train_mapped, test_mapped], ignore_index=True)
model_data = data_most_frequent.drop(columns=['Que16'])
target_data = data_most_frequent['Que16']

X_train = train_mapped.drop(columns=['Que16']).values
X_test = test_mapped.drop(columns=['Que16']).values
y_train = train_mapped['Que16'].values
y_test = test_mapped['Que16'].values

## Logistic Regression

### No balance

In [None]:
# Strategy 4 (most-frequent imputation): logistic regression, no balance argument.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
)

### Undersampling

In [None]:
# Strategy 4 (most-frequent imputation): logistic regression, balance='under'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 4 (most-frequent imputation): logistic regression, balance='over'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 4 (most-frequent imputation): logistic regression, balance='smoteen'.
perform_logit(
    X_train, X_test, y_train, y_test,
    X_cols, y_col,
    balance='smoteen',
)

## Random Forest

### No balance

In [None]:
# Strategy 4 (most-frequent imputation): random forest, no balance argument.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
)

### Undersampling

In [None]:
# Strategy 4 (most-frequent imputation): random forest, balance='under'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 4 (most-frequent imputation): random forest, balance='over'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 4 (most-frequent imputation): random forest, balance='smoteen'.
perform_random_forest(
    X_train, X_test,
    y_train, y_test,
    balance='smoteen',
)

## Decision Tree

### No balance

In [None]:
# Strategy 4 (most-frequent imputation): decision tree, no balance argument.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
)

### Undersampling

In [None]:
# Strategy 4 (most-frequent imputation): decision tree, balance='under'.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='under',
)

### Oversampling

In [None]:
# Strategy 4 (most-frequent imputation): decision tree, balance='over'.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='over',
)

### SMOTEENN

In [None]:
# Strategy 4 (most-frequent imputation): decision tree, SMOTEENN section.
# NOTE(review): the key here is upper-case 'SMOTEEN', while the logistic-regression and
# random-forest SMOTEENN cells pass 'smoteen' — confirm perform_decision_tree accepts this spelling.
perform_decision_tree(
    X_train, X_test,
    y_train, y_test,
    balance='SMOTEEN',
)