In [35]:
# using CSV
# using PyCall
# using Printf
# using PyPlot
# using DataFrames

# pd = pyimport("pandas");

In [None]:
import pickle
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats as ss

import sweetviz as sv

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from imblearn.combine import SMOTEENN, SMOTETomek 
from umap.umap_ import UMAP

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from scikitplot.metrics import plot_confusion_matrix, plot_calibration_curve
from sklearn.metrics import f1_score, fbeta_score, fowlkes_mallows_score, recall_score, precision_score, accuracy_score

warnings.filterwarnings('ignore')

### 0.1. Aux Functions

In [None]:
# function split_data_types(df)
#     num_att = select(df, findall(col -> eltype(col) <: Int64, eachcol(df)));
#     cat_att = df[:,[k for k in names(df) if k ∉ names(num_att)]];
#     return Dict("n" => num_att, "c" => cat_att) 
# end;

In [26]:
palette = sns.diverging_palette(359, 359, n=5, s=999, l=50, center='dark')

def plot_vars(df4, msk, k):
    _, _ = plt.subplots(figsize=(10, 10))
    ax0 = plt.subplot2grid((2,2), (0,0))
    ax1 = plt.subplot2grid((2,2), (0,1))
    ax2 = plt.subplot2grid((2,2), (1,0), colspan=2)

    for m, a, c in zip([msk, ~msk], [ax0, ax1], ['r', 'b']):
        a.scatter(df4[m].age_of_respondent, df4[m][k], color=c, label="With Bank Account")
        a.set_xlabel("age_of_respondent")
        a.set_ylabel(k)
        a.legend()

    sns.scatterplot(df4.age_of_respondent, df4[k], hue=df4.bank_account, ax=ax2);
    plt.tight_layout()
    
def cramer_v(x, y):
    cm = pd.crosstab( x, y ).values

    chi2 = ss.chi2_contingency( cm )[0]
    n    = cm.sum()
    r, k = cm.shape

    chi2corr = max( 0, chi2 - (r-1)*(k-1)/(n-1) )
    kcorr = k - (k-1)**2 / (n-1)
    rcorr = r - (r-1)**2 / (n-1)

    return np.sqrt( (chi2corr/n) / min(kcorr-1, rcorr-1) )

def get_features_importance(model, X):
    features = {k: v for k, v in zip(X.columns.tolist(), model.feature_importances_)}

    fig, ax = plt.subplots(figsize=(10,5))
    ax.bar(features.keys(), features.values(), color='k')
    ax.tick_params(axis='x', rotation=80)
    ax.set_title(type(model).__name__ + ' Feature Importance');
    
    return None

def ml_error(model_name, y_true, yhat):
    f1 = f1_score(y_true, yhat)
    fm = fbeta_score(y_true, yhat, average='macro', beta=0.5)
    fi = fbeta_score(y_true, yhat, average='micro', beta=0.5)
    re = recall_score(y_true, yhat)
    pr = precision_score(y_true, yhat)
    ac = accuracy_score(y_true, yhat)
    
    plot_confusion_matrix(y_true, yhat)
    
    d = pd.DataFrame(columns=["Model Name", "F1-Score", "F-Beta Macro", "F-Beta Micro", "Recall", "Precision", "Accuracy"], index=[0])
    d["Model Name"]   = model_name
    d["F1-Score"]     = f1
    d["F-Beta Macro"] = fm
    d["F-Beta Micro"] = fi
    d["Recall"]       = re
    d["Precision"]    = pr
    d["Accuracy"]     = ac
    
    
    return d.reset_index(drop=True)

def dataset_selection(df, cols_selected, is_train=True):
    df.cellphone_access = df.cellphone_access.apply(lambda x: 1 if x == "Yes" else 0)
    df.education_level = df.education_level.apply(lambda x: x.lower().replace(' ', '_').replace('/', '_'))
    df.job_type = df.job_type.apply(lambda x: x.lower().replace(' ', '_').replace('/', '_'))
    
    if is_train:
        df = df[cols_selected]
        #df.bank_account = df.bank_account.apply(lambda x: 1 if x == "Yes" else 0)
        
    else:
        cols_test = cols_selected
        cols_test.pop(-1)
        df = df[cols_test]
    
    return df

def frequency_encoding(df, var='dummy'):
    df_ = df.copy()
    
    if var == 'dummy':
        print("Provide a Feature")
    else:
        f_ = df_.groupby(var).size() / len(df_)
        df_[var] = df_[var].apply(lambda x: f_[x])
    return df_[var]

def map_frequency(df, map_vars_frequency='dummy', no_freq_list="dummy"):
    df2 = df.copy()
    if no_freq_list == "dummy" and map_vars_frequency == "dummy":
        cat_att = df2.select_dtypes(include=['object', 'string'])
        try:
            for i in cat_att.columns.tolist():
                df2[i] = frequency_encoding(df2, i)
            return df2
        except:
            AttributeError(f"I Can't Convert: {i}")
            
        return df2
    else:
        if map_vars_frequency == 'dummy':
            print("Provide a Feature")
        else:
            
            for k in map_vars_frequency:
                col_name = list(k.keys())[0]
                series = pd.Series(list(k.values())[0])
                
                if col_name not in df2.columns.tolist(): pass
                    
                else: df2[col_name] = df2[col_name].apply(lambda x: series[x])
    
            return df2
    
def data_rescaling(df, scalers, feat_to_scale):
    df2 = df.copy()
    try:
        for feat, scaler in zip(feat_to_scale, scalers):
            df2[feat] = scaler.transform(df2[[feat]].values)
            
        return df2
    except:
        raise AttributeError(f"I Can't Rescaling the feature*")
        
def get_dataset(df, c, frequency_maps, is_train=True):
    df = dataset_selection(df, c, is_train=is_train)

    # Dataset Preparation
    age_scaler = pickle.load(open("age_scaler.pkl", "rb"))
    house_scaler = pickle.load(open("house_scaler.pkl", "rb"))

    scals = [age_scaler, house_scaler]
    feat_scals = ['age_of_respondent', 'household_size']

    # Rescaling
    df2 = data_rescaling(df, scals, feat_scals)

    # Encoding
    df3 = map_frequency(df2, frequency_maps)
    
    return df3

In [27]:
frequency_maps = [{'country': {'Kenya': 0.2594796803264751,
   'Rwanda': 0.3685597687468118,
   'Tanzania': 0.2795442951878932,
   'Uganda': 0.09241625573881992}},
 {'location_type': {'Rural': 0.6036388369324944, 'Urban': 0.3963611630675055}},
 {'gender_of_respondent': {'Female': 0.5868049651419827,
   'Male': 0.41319503485801734}},
 {'relationship_with_head': {'Child': 0.09343649039279034,
   'Head of Household': 0.540554327495324,
   'Other non-relatives': 0.007651759904778099,
   'Other relative': 0.03128719605509267,
   'Parent': 0.047610950518619286,
   'Spouse': 0.2794592756333957}},
 {'education_level': {'no_formal_education': 0.19044380207447714,
   'other_dont_know_rta': 0.0017003910899506887,
   'primary_education': 0.5448053052202007,
   'secondary_education': 0.18100663152525082,
   'tertiary_education': 0.048121067845604486,
   'vocational_specialised_training': 0.033922802244516236}},
 {'job_type': {'dont_know_refuse_to_answer': 0.0050161537153545314,
   'farming_and_fishing': 0.23227342288726407,
   'formally_employed_government': 0.017514028226492093,
   'formally_employed_private': 0.0437000510117327,
   'government_dependent': 0.010627444312191803,
   'informally_employed': 0.237034517939126,
   'no_income': 0.026356061894235675,
   'other_income': 0.04837612650909709,
   'remittance_dependent': 0.10644448223091312,
   'self_employed': 0.27265771127359295}},
 {'martial_status': {'Divorced': 0.0879102193504506,
   'Dont know': 0.0003400782179901377,
   'Married': 0.46191123958510455,
   'Single': 0.3375276313552117,
   'Widowed': 0.11231083149124299}},
 {'geral_martial_status': {'Living together': 0.46191123958510455,
   'Never Married': 0.3375276313552117,
   'None': 0.11265090970923312,
   'Seperated': 0.0879102193504506}}]

### 0.2. Load Dataset

In [None]:
df = DataFrame(CSV.File("../data/train.csv"));

# 1.0. Descriptive Statistical

In [None]:
df2 = pd.read_csv("../data/train.csv");

In [None]:
first(df, 5)

## 1.1. Rename Columns

1. Do Not Need Rename Columns

## 1.2. Data Dimension

In [None]:
@printf "Number of Rows: %d" size(df)[1]
@printf "\nNumber of Columns: %d" size(df)[2]

## 1.3. Check Data Types

In [None]:
DataFrame(name=names(df), type=eltype.(eachcol(df)))

## 1.4. Check Na

In [None]:
filter(ismissing, df)

## 1.5. Fillout Na

1. Do Not Have Na Values 

## 1.6. Change Data Types

1. Do Not have to change data types

## 1.7. Descriptive Statistical

In [None]:
df_ = split_data_types(df);

### 1.7.1. Numerical Variables

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(15,4))
for i in zip([1, 2, 3], ["age_of_respondent","year","household_size"], ["r", "k", "b"])
    ax[i[1]].hist(df_["n"][:, i[2]] , color=i[3], bins=15);
end;

### 1.7.2. Categorical Variables

In [None]:
df2.select_dtypes(include=["object"]).describe().T

# 2.0. Feature Engineering

In [23]:
df3 = df2.copy();

NameError: name 'df2' is not defined

## 2.1. Mind Map 

1. Next Cycle

## 2.2. Hypothesis List

1. Next Cycle

## 2.3. Feature Engineering

In [None]:
df3 = pd.concat([df3, df3.marital_status.str.split("/", expand=true).rename(columns=Dict(0 => "martial_status", 1 => "geral_martial_status"))], axis=1);

df3 = (df3.fillna("None")).drop_duplicates();

# 3.0. Dataset Filtering

In [None]:
df3 = pd.read_csv("../data/df2.csv", index_col=0)

## 3.1. Filtering Rows

1. Next Cycle

## 3.2. Filtering Columns 

In [None]:
df3 = df3.drop("marital_status", axis=1);

# 4.0. Exploratory Data Analysis

In [None]:
df4 = df3.copy();

In [None]:
advert_report = sv.analyze(df4)

advert_report.show_html('Advertising.html')

## 4.1. Univariable Analysis

In [None]:
msk = (df4.bank_account == "Yes")

### 4.1.1. Response Variable

In [None]:
(df4.bank_account.value_counts() / len(df4)).to_dict()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(13, 4))
sns.countplot(df4.country, hue=df4.bank_account, palette=palette)
sns.countplot(df4.bank_account, palette=palette, ax=ax[0]);

In [None]:
c = df4.select_dtypes(include=['object'])
for i in c.columns.tolist():
    fig, ax = plt.subplots(figsize=(3,3))
    sns.countplot(df4[i], hue=df4['bank_account'], palette=palette, ax=ax)

### 4.1.2. Numerical Attributes

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(15,4))
for i in zip([1, 2, 3], ["age_of_respondent","year","household_size"], ["r", "k", "b"])
    ax[i[1]].hist(df_["n"][:, i[2]] , color=i[3], bins=15);
end;

In [None]:
n = df4.select_dtypes(include=['int64', 'float64'])

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(15,4))
for i in zip([0, 1, 2], ["age_of_respondent","year","household_size"]):
    ax[i[0]].hist(n[msk][i[1]] , color='r', bins=15, alpha=.6);
    ax[i[0]].hist(n[~msk][i[1]] , color='k', bins=15, alpha=.5);

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(15, 4))
for k, i in zip(['year', 'household_size', 'age_of_respondent'], [0, 1, 2]):
    sns.boxplot(df4[k], ax=ax[i], color="r")

## 4.2. Bivariable Analysis

### 4.2.1. Variable per Target

In [None]:
aux1 = df4.drop(["uniqueid", "uid"], axis=1)

In [None]:
for k in aux1.columns.tolist():
    plot_vars(df4, msk, k)

## 4.3. Multivariable Analysis

### 4.3.1. Numerical Attributes

In [None]:
sns.heatmap(df4[['year', 'household_size', 'age_of_respondent']].corr(), annot=True);

### 4.3.2. Categorical Attributes

In [None]:
c_att_un = df4.select_dtypes(include=['object'])
c_att_un = c_att_un.drop(['uniqueid', 'uid'], axis=1)

results = []
for i in c_att_un.columns:
    new_list = []
    for j in c_att_un.columns:
        new_list.append(cramer_v(c_att_un[i], c_att_un[j], ))
    results.append(new_list)
    
c_corr = pd.DataFrame( results )
c_corr.columns = c_att_un.columns
c_corr = c_corr.set_index( c_att_un.columns )

In [None]:
_, _, plt.subplots(figsize=(14,10))
sns.heatmap(c_corr, annot=True);

# 5.0. Data Preparation

In [None]:
df5 = pd.read_csv("../data/df2.csv", index_col=0)

## 5.1. Normalization

1. Next Cycle

## 5.2. Rescaling

In [None]:
mms = MinMaxScaler()

In [None]:
df5.year = mms.fit_transform(df5[['year']])
pickle.dump(mms, open("year_scaler.pkl", "wb"))

df5.household_size = mms.fit_transform(df5[['household_size']])
pickle.dump(mms, open("house_scaler.pkl", "wb"))

df5.age_of_respondent = mms.fit_transform(df5[['age_of_respondent']])
pickle.dump(mms, open("age_scaler.pkl", "wb"))

## 5.3. Transformation

In [None]:
f_country = df5.groupby('country').size() / len(df5)
df5.country = df5.country.apply(lambda x: f_country[x])

f_location = df5.groupby('location_type').size() / len(df5)
df5.location_type = df5.location_type.apply(lambda x: f_location[x])

f_cell = df5.groupby('cellphone_access').size() / len(df5)
df5.cellphone_access = df5.cellphone_access.apply(lambda x: f_cell[x])

f_gender = df5.groupby('gender_of_respondent').size() / len(df5)
df5.gender_of_respondent = df5.gender_of_respondent.apply(lambda x: f_gender[x])

f_relash = df5.groupby('relationship_with_head').size() / len(df5)
df5.relationship_with_head = df5.relationship_with_head.apply(lambda x: f_relash[x])

f_edu = df5.groupby('education_level').size() / len(df5)
df5.education_level = df5.education_level.apply(lambda x: f_edu[x])

f_job = df5.groupby('job_type').size() / len(df5)
df5.job_type = df5.job_type.apply(lambda x: f_job[x])

f_m1 = df5.groupby('martial_status').size() / len(df5)
df5.martial_status = df5.martial_status.apply(lambda x: f_m1[x])

f_m2 = df5.groupby('geral_martial_status').size() / len(df5)
df5.geral_martial_status = df5.geral_martial_status.apply(lambda x: f_m2[x])

freq_columns = ['country', 'location_type', 'cellphone_access', 'gender_of_respondent', 'relationship_with_head', 'education_level', 'job_type', 'martial_status', 'geral_martial_status',]
frequency_codes = [f_country.to_dict(), f_location.to_dict(), f_cell.to_dict(), f_gender.to_dict(), f_relash.to_dict(), f_edu.to_dict(), f_job.to_dict(), f_m1.to_dict(), f_m2.to_dict()]
frequency_list = [{c: f} for c, f in zip(freq_columns, frequency_codes)]

# 6.0. Feature Importance

In [None]:
df6 = df5.copy()

df6 = df6.drop(['uniqueid', 'uid'], axis=1)

X = df6.drop("bank_account", axis=1)
Y = df6.loc[:, 'bank_account'].apply(lambda x: 1 if x == "Yes" else 0)

In [None]:
xg = XGBClassifier(n_jobs=-1).fit(X, Y)
ex = ExtraTreesClassifier(n_jobs=-1).fit(X, Y)
rf = RandomForestClassifier(n_jobs=-1).fit(X, Y)

In [None]:
get_features_importance(ex, X)

In [None]:
get_features_importance(xg, X)

In [None]:
get_features_importance(rf, X)

# 7.0. Machine Learning Models

In [None]:
c = ['country', 'location_type','cellphone_access','household_size','age_of_respondent','gender_of_respondent','education_level','job_type','bank_account']

df7 = pd.read_csv("../data/df6.csv", index_col=0)

df7 = df7[c]

X = df7.iloc[:, :-1]
Y = df7.iloc[:, -1]

x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=.1, )

## 7.1. Geral Models

### 7.1. SGD

In [None]:
sg = SGDClassifier(loss="log").fit(x_train, y_train)

yhat_sg = sg.predict(x_val)

ml_error(type(sg).__name__, y_val, yhat_sg)

### 7.2. XGBoost

In [None]:
xg = XGBClassifier(n_jobs=-1).fit(x_train, y_train)

yhat_xg = xg.predict(x_val)

ml_error(type(xg).__name__, y_val, yhat_xg)

### 7.3. ExtraTrees

In [None]:
ex = ExtraTreesClassifier(n_jobs=-1).fit(x_train, y_train)

yhat_ex = xg.predict(x_val)

ml_error(type(ex).__name__, y_val, yhat_ex)

### 7.4. Random Forest

In [None]:
rf = RandomForestClassifier(n_jobs=-1).fit(x_train, y_train)

yhat_rf = xg.predict(x_val)

ml_error(type(rf).__name__, y_val, yhat_rf)

### 7.5. Balanced Random Forest

In [None]:
bl = BalancedRandomForestClassifier(n_jobs=-1).fit(x_train, y_train)

yhat_bl = xg.predict(x_val)

ml_error(type(bl).__name__, y_val, yhat_bl)

### 7.6. K-Means

In [None]:
km = KMeans(n_clusters=2).fit(x_train)

yhat_km = km.predict(x_val)

ml_error(type(km).__name__, y_val, yhat_km)

## 7.2. Final Pipelines

### 7.2.1. Train Dataset Preparation

In [None]:
df7 = pd.read_csv("../data/df3_safe.csv", index_col=[0]) # Raw CSV

df71 = df7.copy()

c = ['country', 'location_type','cellphone_access','household_size','age_of_respondent','gender_of_respondent','education_level','job_type','bank_account']

df72 = get_dataset(df71, c, frequency_maps)

X, Y = df72.iloc[:, :-1], df72.iloc[:, -1]


df_test = pd.read_csv("../data/test.csv")
x_test = get_dataset(df_test, c, frequency_maps, is_train=False)

x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=.1)

### 7.2.2. Model Trainings

In [None]:
models = [SGDClassifier(loss="log"), 
          RandomForestClassifier(n_jobs=-1), 
          XGBClassifier(n_jobs=-1),
          ExtraTreesClassifier(n_jobs=-1),
          KMeans(n_clusters=2)]

models_names = ['SGD', 'RF', 'XGB', 'ET', 'KM']

means, stds = [], []

scoring = 'f1_micro'

for model, name in zip(models, models_names):
    cv = cross_val_score(model, x_train, y_train, cv=10, scoring=scoring)
    mean, std = cv.mean(), cv.std()
    means.append(mean)
    stds.append(std)
    
    print(f'{name}: {mean:.3f} +/- {std:.3f}')

In [None]:
xg = XGBClassifier(n_jobs=-1).fit(x_train, y_train)

yhat_xg = xg.predict(x_val)

ml_error(type(xg).__name__, y_val, yhat_xg)

### 7.2.3. Two Final Models

In [None]:
models = [RandomForestClassifier(n_jobs=-1), XGBClassifier(n_jobs=-1)]
models_names = ['RF', 'XG']

y_probas = []
thresholds = np.arange(0.5, 1., .1)
f1_scores = []

for model, name in zip(models, models_names):
    y_proba = cross_val_predict(model, x_train, y_train, cv=10, method='predict_proba')[:, 1]
    y_probas.append(y_proba)
    print(f'F1-Scores for {name}:')
    scores = []
    for t in thresholds:
        y_pred = (y_proba >= t).astype(int)
        score = fbeta_score(y_train, y_pred, average='micro', beta=0.5)
        scores.append(score)
        print(f'Threshold {t:.1f}: {score:.2f}')
    print()
    f1_scores.append(scores)

### 7.2.4. Tuning Best Model

In [None]:
clf_name = 'xgbclassifier'

params = {}
params[f'{clf_name}__learning_rate']    = [0.1, 0.2]
params[f'{clf_name}__min_child_weight'] = [5, 6]
params[f'{clf_name}__gamma']            = [1.]
params[f'{clf_name}__subsample']        = [0.9, 1., 1.1]
params[f'{clf_name}__colsample_bytree'] = [1.0]
params[f'{clf_name}__max_depth']        = [3, 4]

In [None]:
grid = GridSearchCV(XGBClassifier(n_jobs=-1), params, cv=10, n_jobs=-1, scoring='f1_micro')
grid.fit(x_train, y_train)

print(grid.best_params_)

In [None]:
grid.best_score_

In [None]:
# Tuning Best Model
pipe = grid.best_estimator_
yhat = cross_val_predict(pipe, x_train, y_train, cv=10, n_jobs=-1)

In [None]:
ml_error("XGBoost Tuned", y_train, yhat)

In [None]:
xg = grid.best_estimator_

yhat2 = xg.predict(x_val)

In [None]:
ml_error("XGBoost Tuned", y_val, yhat2)

In [32]:
yhat_sub = xg.predict(df_test2[c])

In [34]:
sub_file = pd.DataFrame()
sub_file['uid'] = df_test['uid']
sub_file['bank_account'] = yhat_sub
sub_file['bank_account'] = sub_file['bank_account'].apply(lambda x: 1 if x == 'Yes' else 'No')

sub_file.to_csv('subfil.csv', index=False)

# <font color="red"> 8.0. Dataset Balance</font>

In [28]:
df_test = pd.read_csv("../data/test.csv")

df_test = pd.concat([df_test, df_test.marital_status.str.split("/", expand=True).rename(columns={0:"martial_status", 1:"geral_martial_status"})], axis=1);

df_test = (df_test.fillna("None")).drop_duplicates();

df_test1 = get_dataset(df_test, df_test.columns.tolist(), frequency_maps)
df_test1.year = df_test1.year.apply(lambda x: 2 if x == 2018 else 1 if x == 2017 else 0)
#df_test1.bank_account = df_test1.bank_account.apply(lambda x: 1 if x == "Yes" else 0 )

df_test1 = df_test1.drop("marital_status", axis=1)
df_test1 = df_test1.drop("uniqueid", axis=1)

df_test2 = df_test1.drop('uid', axis=1)

In [29]:
df8 = pd.read_csv("../data/df2.csv", index_col=0)

df8 = df8.drop( ['marital_status', 'uid', 'uniqueid'], axis=1)

# Get Datset
df81 = get_dataset(df8, df8.columns.tolist(), frequency_maps)
df81.year = df81.year.apply(lambda x: 2 if x == 2018 else 1 if x == 2017 else 0)
df81.bank_account = df81.bank_account.apply(lambda x: 1 if x == "Yes" else 0 )

In [None]:
# Split Dataset in train and test

X = df81.drop('bank_account', axis=1)
Y = df81.bank_account

x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=.1)

In [None]:
# Lower Dataset
df82 = df81[c+['bank_account']]

Xl = df82.drop('bank_account', axis=1)
Yl = df82.bank_account

x_trainl, x_vall, y_trainl, y_vall = train_test_split(Xl, Yl, test_size=.1)

In [None]:
#r = {0: 10077, 1: 1685}
r = {0: 10077, 1: 7685}

## <font color="red">8.1. Smoteen </font>

### 8.1.1. Full Dataset

In [None]:
sn = SMOTEENN(sampling_strategy=r, n_jobs=-1)
x_sn, y_sn = sn.fit_resample(x_train, y_train)

In [None]:
y_sn.value_counts().to_dict()

In [None]:
df_sn = pd.concat([x_sn, y_sn], axis=1)
df_sn.head()

### 8.1.2. Lower Dataset

In [None]:
sn = SMOTEENN(sampling_strategy=r, n_jobs=-1)
x_snl, y_snl = sn.fit_resample(x_trainl, y_trainl)

In [None]:
y_snl.value_counts().to_dict()

In [None]:
df_snl = pd.concat([x_snl, y_snl], axis=1)
df_snl.head()

## <font color="red">8.2. Smotetomek </font>

### 8.2.1. Full Dataset

In [None]:
sk = SMOTETomek(sampling_strategy=r, n_jobs=-1)
x_sk, y_sk = sk.fit_resample(x_train, y_train)

In [None]:
y_sk.value_counts().to_dict()

In [None]:
df_sk = pd.concat([x_sk, y_sk], axis=1)
df_sk.head()

### 8.2.2. Lower Dataset

In [None]:
sk = SMOTETomek(sampling_strategy=r, n_jobs=-1)
x_skl, y_skl = sk.fit_resample(x_trainl, y_trainl)

In [None]:
y_skl.value_counts().to_dict()

In [None]:
df_skl = pd.concat([x_skl, y_skl], axis=1)
df_skl.head()

## <font color="red">8.3. Test Models</font> 

### <font color="red"> 8.3.1. XGBoost Model </font>

In [None]:
xg = XGBClassifier(n_jobs=-1).fit(x_sn, y_sn)

yhat_xg = xg.predict(x_val)

ml_error(type(xg).__name__, y_val, yhat_xg)

In [None]:
clf_name = 'xgbclassifier'

params = {}
params[f'{clf_name}__learning_rate']    = [0.1, 0.2]
params[f'{clf_name}__min_child_weight'] = [5, 6]
params[f'{clf_name}__gamma']            = [1.]
params[f'{clf_name}__subsample']        = [0.9, 1., 1.1]
params[f'{clf_name}__colsample_bytree'] = [1.0]
params[f'{clf_name}__max_depth']        = [3, 4]

In [None]:
grid = GridSearchCV(XGBClassifier(n_jobs=-1), params, cv=10, n_jobs=-1, scoring='f1_micro')
grid.fit(x_sn, y_sn)

print(grid.best_params_)

In [None]:
xg = grid.best_estimator_

yhat2 = xg.predict(x_val)

In [None]:
ml_error("XGBoost Tuned", y_val, yhat2)

### Send Sub

In [24]:
yhat_sub = xg.predict(df_test2)

ValueError: Feature shape mismatch, expected: 8, got 12

In [None]:
sub_file = pd.DataFrame()
sub_file['uid'] = df_test['uid']
sub_file['bank_account'] = yhat_sub
sub_file['bank_account'] = sub_file['bank_account'].apply(lambda x: 1 if x == 'Yes' else 'No')

sub_file.to_csv('subfil.csv', index=False)

### <font color="red"> 8.3.2. Random Forest Model </font>

In [None]:
xg = RandomForestClassifier(n_jobs=-1).fit(x_sn, y_sn)

yhat_xg = xg.predict(x_val)

ml_error(type(xg).__name__, y_val, yhat_xg)

In [None]:
xg = RandomForestClassifier(n_jobs=-1).fit(x_skl, y_skl)

yhat_xg = xg.predict(x_vall)

ml_error(type(xg).__name__, y_vall, yhat_xg)

# 9.0. Embeeding Space 

## 9.1. Balanced Dataset

### 9.1.1. PCA Smoteen

In [None]:
pca = PCA(n_components=x_sn.shape[1])

principal_components = pca.fit_transform(x_sn)

features = range(pca.n_components_)

df_pca = pd.DataFrame(principal_components)
df_pca['y'] = y_sn

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(features, pca.explained_variance_ratio_, color="black");
ax.set_title('Smoteen Dataset Principal Components');

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(df_pca[0], df_pca[1], hue=df_pca['y']);

### 9.1.2. PCA Smotetomek

In [None]:
pca = PCA(n_components=x_sk.shape[1])

principal_components = pca.fit_transform(x_sk)

features = range(pca.n_components_)

df_pca = pd.DataFrame(principal_components)
df_pca['y'] = y_sk

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(features, pca.explained_variance_ratio_, color="black");
ax.set_title('Smoteen Dataset Principal Components');

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(df_pca[0], df_pca[1], hue=df_pca['y']);

## 9.2. Tree-Based Embeeding

### 9.2.1 Smoteen 

In [None]:
rf_model = RandomForestClassifier(n_jobs=-1)

rf_model.fit(x_sn, y_sn)

df_leaf = pd.DataFrame(rf_model.apply(x_sn))

#### 9.2.1.1. UMAP

In [None]:
reducer = UMAP(n_components=3, random_state=42)
embedding = reducer.fit_transform(df_leaf)

df_umap = pd.DataFrame(embedding)
df_umap['y'] = y_sn

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].scatter(df_umap[0], df_umap[1])
sns.scatterplot(df_umap[0], df_umap[1], hue=df_umap['y']);

In [None]:
df_val_leaf = pd.DataFrame(rf_model.apply(x_val))

reducer = UMAP(n_components=3, random_state=42)
embeddingl = reducer.fit_transform(df_val_leaf)

df_umapl = pd.DataFrame(embeddingl)
df_umapl['y'] = y_val.reset_index(drop=True)

In [None]:
xgb = XGBClassifier(n_jobs=-1).fit(X=df_umap.iloc[:, :-1], y=df_umap.iloc[:, -1])

yhat_xgb = xgb.predict(df_umapl.iloc[:, :-1])

ml_error("SVC", df_umapl.iloc[:, -1], yhat_xgb)

#### 9.2.1.2. PCA

In [None]:
pca = PCA(n_components=df_leaf.shape[1])

principal_components = pca.fit_transform(df_leaf)

features = range(pca.n_components_)

df_pca = pd.DataFrame(principal_components)
df_pca['y'] = y_sn

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(features, pca.explained_variance_ratio_, color="black");
ax.set_title('Smoteen Dataset Principal Components');

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(df_pca[0], df_pca[1], hue=df_pca['y']);

In [None]:
df_val = pd.DataFrame(rf_model.apply(x_val))

In [None]:
principal_components = pca.fit_transform(df_val)

features = range(pca.n_components_)

df_pcav = pd.DataFrame(principal_components)
df_pcav['y'] = y_val.reset_index(drop=True)

In [None]:
xgb = XGBClassifier(n_jobs=-1).fit(X=df_pca.iloc[:, :4], y=df_pca.loc[:, "y"])

yhat_svc = xgb.predict(df_pcav.iloc[:, :4])

ml_error("XGB", df_pcav.loc[:, "y"], yhat_svc)

In [None]:
xgb = SVC(max_iter=150, ).fit(X=df_pca.iloc[:, :4], y=df_pca.loc[:, "y"])

yhat_svc = xgb.predict(df_pcav.iloc[:, :4])

ml_error("XGB", df_pcav.loc[:, "y"], yhat_svc)

### 9.2.2. Smotetomek

In [None]:
rf_model = RandomForestClassifier(n_jobs=-1)

rf_model.fit(x_sk, y_sk)

df_leaf = pd.DataFrame(rf_model.apply(x_sk))

In [None]:
reducer = UMAP(n_components=3, random_state=42)
embedding = reducer.fit_transform(df_leaf)

df_umap = pd.DataFrame(embedding)
df_umap['y'] = y_sk

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].scatter(df_umap[0], df_umap[1])
sns.scatterplot(df_umap[0], df_umap[1], hue=df_umap['y']);

In [None]:
df_val_leaf = pd.DataFrame(rf_model.apply(x_val))

reducer = UMAP(n_components=3, random_state=42)
embeddingl = reducer.fit_transform(df_val_leaf)

df_umapl = pd.DataFrame(embeddingl)
df_umapl['y'] = y_val.reset_index(drop=True)

In [None]:
pca = PCA(n_components=df_leaf.shape[1])

principal_components = pca.fit_transform(df_leaf)

features = range(pca.n_components_)

df_pca = pd.DataFrame(principal_components)
df_pca['y'] = y_sk

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(features, pca.explained_variance_ratio_, color="black");
ax.set_title('Smoteen Dataset Principal Components');

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(df_pca[0], df_pca[1], hue=df_pca['y']);

In [None]:
df_val = pd.DataFrame(rf_model.apply(x_val))

In [None]:
principal_components = pca.fit_transform(df_val)

features = range(pca.n_components_)

df_pcav = pd.DataFrame(principal_components)
df_pcav['y'] = y_val.reset_index(drop=True)

In [None]:
xgb = XGBClassifier(n_jobs=-1).fit(X=df_pca.iloc[:, :3], y=df_pca.loc[:, "y"])

yhat_svc = xgb.predict(df_pcav.iloc[:, :3])

ml_error("XGB", df_pcav.loc[:, "y"], yhat_svc)

In [25]:
xgb = SVC(max_iter=3000, ).fit(X=df_pca.iloc[:, :3], y=df_pca.loc[:, "y"])

yhat_svc = xgb.predict(df_pcav.iloc[:, :3])

ml_error("XGB", df_pcav.loc[:, "y"], yhat_svc)

NameError: name 'df_pca' is not defined