
# Import libraries

In [1]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 
from IPython.display import display, HTML
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

def ShowDataGraph(data_frame):
    """Plot every column of ``data_frame`` on a two-column grid of subplots.

    Columns whose dtype is ``object`` are drawn as bar charts of their value
    counts; every other column is drawn as a histogram of its values.

    Args:
        data_frame: pandas DataFrame whose columns are plotted in column order.
    """
    columns_name = list(data_frame.columns)

    # Two plots per row, rounded up. ``max(1, ...)`` keeps the grid valid for a
    # frame without columns (plotly needs at least one row).
    n_rows = max(1, (len(columns_name) + 1) // 2)

    fig = make_subplots(rows=n_rows, cols=2, subplot_titles=tuple(columns_name))

    for position in range(len(columns_name)):
        # The grid cell comes from the column position, so duplicated column
        # names still land in their own subplot.
        grid_row, grid_col = divmod(position, 2)
        series = data_frame.iloc[:, position]

        if series.dtype == "object":
            # Count each category once and reuse the result for both axes.
            counts = series.value_counts(sort=False)
            trace = go.Bar(x=list(counts.index), y=list(counts.values))
        else:
            trace = go.Histogram(x=list(series))

        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(height=200 * n_rows, width=900, title="Feature values", template="plotly_white", showlegend=False)

    fig.show()

def showFeatureImportant(X_frame, y, target_names=["Class_1", "Class_2", "Class_3", "Class_4"]):
    """Draw, for each feature, a stacked bar chart of target counts per feature value.

    One subplot row is created per column of ``X_frame``. Within a subplot the
    x axis holds the distinct values of the feature and each class contributes
    one stacked bar segment.

    Args:
        X_frame: DataFrame with the features to inspect.
        y: target Series sharing index labels with ``X_frame``. It must be
            integer-encoded as ``0 .. len(target_names) - 1`` because the
            labels are used as positions when some class is missing for a
            feature value.
        target_names: display name of each class, ordered by encoded label.
            The default list is only read, never mutated.
    """
    n_classes = len(target_names)
    feature_names = X_frame.columns.tolist()

    fig = make_subplots(rows=len(feature_names), cols=1, subplot_titles=X_frame.columns)

    for position, feature in enumerate(feature_names):
        x = X_frame[feature].unique()

        # counts_per_value[j, c] = number of rows with feature == x[j] and class c.
        counts_per_value = np.zeros((len(x), n_classes), dtype='int64')

        for j, x_value in enumerate(x):
            index_list = X_frame[feature].index[X_frame[feature] == x_value]
            values, counts = np.unique(y.loc[index_list.tolist()], return_counts=True)

            if len(values) == n_classes:
                # Every class is present: counts are already in class order.
                counts_per_value[j] = counts
            else:
                # Some classes are absent: scatter counts by encoded label.
                counts_per_value[j, values] = counts

        for class_position, class_name in enumerate(target_names):
            fig.add_trace(
                go.Bar(
                    x=x,
                    y=counts_per_value[:, class_position],
                    name=class_name,
                    legendgroup='group',
                    # Show legend entries only once, for the first subplot.
                    showlegend=(position == 0),
                ),
                col=1,
                row=position + 1,
            )

    fig.update_layout(barmode='stack')
    fig.show()

def showFeatureDistribute(X_frame, showing_features, y, plot_mode=2):
    """Scatter-plot selected features coloured by the target.

    Args:
        X_frame: DataFrame holding the features (it may already contain the
            target column).
        showing_features: names of the columns to plot; two names for the 2D
            plot, three for the 3D plot.
        y: named target Series; its name selects the colour column.
        plot_mode: ``2`` draws a 2D scatter, any other value a 3D scatter.
    """
    target_name = y.name

    # Attach the target only when the frame does not carry it already.
    if target_name in X_frame.columns:
        data_show = X_frame.copy()
    else:
        data_show = pd.concat([X_frame, y.copy()], axis=1)

    if plot_mode != 2:
        assert len(showing_features) == 3
        axis_x, axis_y, axis_z = showing_features[0], showing_features[1], showing_features[2]
        fig = px.scatter_3d(data_show, x=axis_x, y=axis_y, z=axis_z, color=target_name, symbol=target_name)
    else:
        assert len(showing_features) == 2
        fig = px.scatter(data_show, x=showing_features[0], y=showing_features[1], color=str(target_name))

        # Larger outlined markers so overlapping points stay distinguishable.
        marker_style = dict(size=12, line=dict(width=2, color='DarkSlateGrey'))
        fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
        fig.update_layout(hovermode="x", legend_traceorder='normal')

    fig.show()

def unique_percent(X_frame, feature, y):
    """Cross-tabulate target counts against the distinct values of one feature.

    Despite the name, the table holds raw counts, not percentages.

    Args:
        X_frame: DataFrame containing ``feature``.
        feature: name of the column whose distinct values become the table columns.
        y: target Series sharing index labels with ``X_frame``. Labels are used
            directly as row positions, so they must be integers in
            ``0 .. y.nunique() - 1``.

    Returns:
        DataFrame of floats with one row per class (positional index) and one
        column per sorted distinct feature value.
    """
    x = sorted(X_frame[feature].unique().tolist())

    matrix = np.zeros((len(y.unique().tolist()), len(x)))

    # enumerate gives the column position directly instead of searching the
    # list with x.index() on every assignment.
    for column_position, x_value in enumerate(x):
        index_list = X_frame.index[X_frame[feature] == x_value]
        values, counts = np.unique(y.loc[index_list.tolist()], return_counts=True)

        for label, count in zip(values, counts):
            matrix[label, column_position] = count

    return pd.DataFrame(matrix, columns=x)

In [2]:
# Dataset location. Set the TPS_JUN_2021_DIR environment variable to read the
# CSV files from another folder; the original local path remains the default.
source_folder = os.environ.get(
    "TPS_JUN_2021_DIR",
    r"D:\Coding_practice\_Data\tabular-playground-series-jun-2021",
)

# Both files use the `id` column as the row index.
train_path = os.path.join(source_folder, "train.csv")
train_csv = pd.read_csv(train_path, index_col='id')

test_path = os.path.join(source_folder, "test.csv")
test_csv = pd.read_csv(test_path, index_col='id')

In [3]:
def basic_info(train_csv, test_csv):
    """Print sample, class and feature counts for the train / test frames.

    Args:
        train_csv: training DataFrame; must contain a ``target`` column.
        test_csv: test DataFrame; only its length is reported.
    """
    print(f"Samples in train csv :\t{len(train_csv)}")

    labels = train_csv['target']
    print(f"Number of classes :\t{len(labels.unique())}")
    print(f"Classes :\t{np.unique(labels, return_inverse=False)}")

    # Everything that is neither the id nor the target counts as a feature.
    feature_names = set(train_csv.columns.tolist()).difference({'id', 'target'})
    print(f"Number of features :\t{len(feature_names)}")

    print(f"Samples in test csv :\t{len(test_csv)}")

# Print the dataset summary, then pandas' per-column dtype / non-null report.
basic_info(train_csv, test_csv)
train_csv.info()

Samples in train csv :	200000
Number of classes :	9
Classes :	['Class_1' 'Class_2' 'Class_3' 'Class_4' 'Class_5' 'Class_6' 'Class_7'
 'Class_8' 'Class_9']
Number of features :	75
Samples in test csv :	100000
<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Data columns (total 76 columns):
feature_0     200000 non-null int64
feature_1     200000 non-null int64
feature_2     200000 non-null int64
feature_3     200000 non-null int64
feature_4     200000 non-null int64
feature_5     200000 non-null int64
feature_6     200000 non-null int64
feature_7     200000 non-null int64
feature_8     200000 non-null int64
feature_9     200000 non-null int64
feature_10    200000 non-null int64
feature_11    200000 non-null int64
feature_12    200000 non-null int64
feature_13    200000 non-null int64
feature_14    200000 non-null int64
feature_15    200000 non-null int64
feature_16    200000 non-null int64
feature_17    200000 non-null int64
feature_18    200000 non-null int

# Visualize data

In [4]:
# NOTE(review): `refine_data` is only assigned in the "Preprocess" section further
# down, so this cell fails with NameError on a fresh top-to-bottom run.
# NOTE(review): "zeros_count_total" is not one of the columns created by
# apply_feature_engineering as it appears in this notebook - confirm the column name.
print(refine_data.columns)
table_temp = unique_percent(refine_data, "zeros_count_total", refine_data['target'])

# Render the count table as HTML so every column is shown.
display(HTML(table_temp.to_html()))

NameError: name 'refine_data' is not defined

In [79]:
# NOTE(review): neither "zeros_count_total" nor "percentage" is created by the
# apply_feature_engineering shown in this notebook - this cell looks stale.
refine_data[['zeros_count_total', 'percentage']]

Unnamed: 0,zeros_count_total,percentage
0,43,1.2647058823529411
1,45,4.090909090909091
2,32,0.49230769230769234
3,38,1.0857142857142856
4,44,2.588235294117647
...,...,...
99995,40,1.8181818181818181
99996,40,1.1764705882352942
99997,41,1.7083333333333333
99998,41,2.05


In [48]:
# Stacked class counts per value of `quantity_count_features`; class names are the
# sorted original target labels so they line up with the encoded targets.
showFeatureImportant(refine_data[['quantity_count_features']], refine_data['target'], sorted(train_csv.target.unique().tolist()))

In [49]:
from sklearn.manifold import TSNE

# NOTE(review): `train_pp` is created in the "Preprocess" section further down, so
# this cell only works after that section has been executed.
# A fixed random_state makes the embedding reproducible between runs.
t_sne = TSNE(n_components=2, random_state=42)

# errors="ignore": drop the non-feature columns that exist and skip the others
# ("id" is the index and "fold" is never added to `train_pp` in this notebook),
# instead of failing with KeyError. TSNE ignores `y`, so only features are passed.
train_show = t_sne.fit_transform(
    train_pp.drop(columns=["target", "id", "fold"], errors="ignore").to_numpy()
)

# Append the target as a third column next to the two embedding coordinates.
train_show = np.hstack((train_show, train_pp.target.to_numpy().reshape(-1, 1)))

data_ = pd.DataFrame(data=train_show, columns=["feature_1", "feature_2", "target"])

In [50]:
# Check the shape and dtypes of the 2-D embedding frame built in the previous cell.
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
feature_1    100000 non-null float64
feature_2    100000 non-null float64
target       100000 non-null float64
dtypes: float64(3)
memory usage: 2.3 MB


In [106]:
# 2-D scatter (default plot_mode=2) of two engineered features, coloured by target.
showFeatureDistribute(X_frame=refine_data, showing_features=["quantity_count_features", "total_values"], y=refine_data['target'])

In [None]:
from sklearn.cluster import KMeans
# One cluster per distinct target label. The estimator is only constructed here;
# no fit call appears in the visible cells.
kmean = KMeans(n_clusters=len(np.unique(train_pp['target'], return_inverse=False)), max_iter=1000, verbose=1)

# Preprocess

In [4]:
from sklearn import preprocessing
target = 'target'

# Fit the encoder on the distinct class names of the training target.
le = preprocessing.LabelEncoder()
le.fit(np.unique(train_csv[target], return_inverse=False))

# Work on copies so the raw frames stay untouched; only the training copy gets
# its target replaced by the integer codes.
train_pp = train_csv.copy()
train_pp[target] = le.transform(train_pp[target])
test_pp = test_csv.copy()

train_pp.head(5)

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,6,1,0,0,0,0,7,0,...,0,0,0,0,0,0,2,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,5
2,0,0,0,0,0,1,0,3,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0,0,7,0,1,5,2,2,0,1,...,0,4,0,2,2,0,4,3,0,7
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
def quantity_count_features(row, group=0, step=10, non_zero=1):
    """Count the features of one row that are positive or, alternatively, zero.

    Looks at ``feature_{i}`` for ``i`` in ``[group * step, group * step + step)``.

    Args:
        row: mapping / Series indexed by ``feature_{i}`` names.
        group: index of the feature group to inspect.
        step: number of features per group.
        non_zero: truthy counts values ``> 0``; falsy counts values ``== 0``.

    Returns:
        The number of matching features in the group.
    """
    first = group * step
    last = first + step

    if non_zero:
        flags = [row[f'feature_{idx}'] > 0 for idx in range(first, last)]
    else:
        flags = [row[f'feature_{idx}'] == 0 for idx in range(first, last)]

    return np.array(flags).sum()

def value_count_features(row, group=0, step=10):
    """Sum the values of one group of features in a row.

    Args:
        row: mapping / Series indexed by ``feature_{i}`` names.
        group: index of the feature group to sum.
        step: number of features per group.

    Returns:
        Sum of ``feature_{i}`` for ``i`` in ``[group * step, group * step + step)``.
    """
    first = group * step
    group_values = [row[f'feature_{idx}'] for idx in range(first, first + step)]
    return np.array(group_values).sum()

def mean_value(row, n_feature):
    """Return the mean of ``feature_0 .. feature_{n_feature - 1}`` in a row.

    Args:
        row: mapping / Series indexed by ``feature_{i}`` names.
        n_feature: number of leading features to average (zeros included).
    """
    leading_values = [row[f"feature_{idx}"] for idx in range(n_feature)]
    return np.array(leading_values).mean()

def std_value(row, n_feature):
    """Return the population standard deviation of the first ``n_feature`` features.

    Args:
        row: mapping / Series indexed by ``feature_{i}`` names.
        n_feature: number of leading features to include.
    """
    leading_values = [row[f"feature_{idx}"] for idx in range(n_feature)]
    # ndarray.std uses ddof=0, i.e. the population standard deviation.
    return np.array(leading_values).std()

def z_score_row(row):
    """Standardize a row's ``total_values`` with the row's ``mean`` and ``std`` entries.

    Args:
        row: mapping / Series with ``total_values``, ``mean`` and ``std`` keys.

    Returns:
        0-d NumPy array holding ``(total_values - mean) / std``.
    """
    centered = row["total_values"] - row["mean"]
    return np.array(centered / row['std'])

def zeros_n_values_percent(row):
    """Return the ratio ``zeros_count_total / total_values`` of a row.

    Args:
        row: mapping / Series with ``zeros_count_total`` and ``total_values`` keys.

    Returns:
        0-d NumPy array holding the ratio.
    """
    ratio = row["zeros_count_total"] / row["total_values"]
    return np.array(ratio)

## Get the mean and standard deviation of every column
def get_mean_n_std(data_frame):
    """Collect per-column mean and standard deviation in a two-row frame.

    Args:
        data_frame: DataFrame with numeric columns.

    Returns:
        DataFrame with the same column names; row 0 holds each column's mean
        and row 1 its standard deviation (pandas ``Series.std``).
    """
    stats_by_column = {
        cname: [data_frame[cname].mean(), data_frame[cname].std()]
        for cname in data_frame.columns
    }
    return pd.DataFrame(stats_by_column)

## Standardize every column of a dataframe with precomputed statistics
def get_z_score(data_frame, mean_n_std_frame, limit_unique=0):
    """Return a z-scored copy of ``data_frame``.

    Args:
        data_frame: DataFrame to standardize; it is not modified.
        mean_n_std_frame: frame with one column per column of ``data_frame``,
            holding the mean under label 0 and the standard deviation under
            label 1 (the layout produced by ``get_mean_n_std``).
        limit_unique: columns with at most this many distinct values are left
            unchanged.

    Returns:
        New DataFrame where each eligible column is ``(value - mean) / std``.
    """
    assert len(data_frame.columns) == len(mean_n_std_frame.columns)

    standarize_df = data_frame.copy()

    for cname in list(data_frame.columns):
        # Skip low-cardinality columns.
        if len(standarize_df[cname].unique()) <= limit_unique:
            continue

        column_stats = mean_n_std_frame[cname]
        standarize_df[cname] = (standarize_df[cname] - column_stats[0]) / column_stats[1]

    return standarize_df

def _append_row_statistics(frame, features):
    """Add per-row sum, mean and population std of ``features`` to ``frame`` in place."""
    values = frame[features].to_numpy(dtype="float64")
    frame["total_values"] = values.sum(axis=1)
    frame["mean_row"] = values.mean(axis=1)
    # ndarray.std defaults to ddof=0, matching the row-wise `std_value` helper.
    frame["std_row"] = values.std(axis=1)


def apply_feature_engineering(train_dtframe, test_dtframe):
    """Replace the raw feature columns by a few row-level summary features.

    Steps, applied to copies of both frames:

    1. ``quantity_count_features``: number of raw features equal to 0 per row.
    2. Per feature: cap at the train 95th percentile, replace zeros by the
       column mean, shift so the minimum becomes 1 and take the natural log.
    3. Z-score every feature with the train mean / std.
    4. ``total_values``, ``mean_row``, ``std_row``: per-row sum, mean and
       population std of the z-scored features.
    5. Drop the individual feature columns.

    The row statistics are computed with vectorized NumPy reductions instead
    of row-wise ``DataFrame.apply``; the values agree with the row-wise helpers
    up to floating-point rounding.

    Args:
        train_dtframe: training DataFrame; every column except ``id`` /
            ``target`` is treated as a feature.
        test_dtframe: test DataFrame containing the same feature columns.

    Returns:
        Tuple ``(train_data, test_data)`` of new DataFrames holding the
        non-feature columns plus the four engineered columns.
    """
    train_data = train_dtframe.copy()
    test_data = test_dtframe.copy()

    # Preserve the frame's column order; a set difference would make the
    # processing order vary between interpreter runs.
    features = [col for col in train_data.columns if col not in ('id', 'target')]

    ### Count how many features are equal to 0 in each row (on the raw values)
    train_data["quantity_count_features"] = (train_data[features] == 0).sum(axis=1)
    test_data["quantity_count_features"] = (test_data[features] == 0).sum(axis=1)

    for feature in features:
        # Work on float copies of the column and assign the result back once.
        # The previous `frame[col].replace(..., inplace=True)` modified a
        # temporary and is a no-op when pandas copy-on-write is active.
        train_col = train_data[feature].astype("float64")
        test_col = test_data[feature].astype("float64")

        ### Cap outliers at the 95th percentile of the training column
        upper_lim = train_col.quantile(.95)
        train_col = train_col.clip(upper=upper_lim)
        test_col = test_col.clip(upper=upper_lim)

        ### Replace 0 with the mean of the capped column
        # NOTE(review): the test frame uses its own mean here and its own
        # minimum below, not the training statistics - confirm this is intended.
        train_col = train_col.replace(0, train_col.mean())
        test_col = test_col.replace(0, test_col.mean())

        ### Apply log after shifting the minimum to 1
        train_data[feature] = np.log(train_col - train_col.min() + 1)
        test_data[feature] = np.log(test_col - test_col.min() + 1)

    # Standardize both frames with the training statistics.
    mean_n_std_df = get_mean_n_std(train_data[features])

    train_data[features] = get_z_score(train_data[features], mean_n_std_df)
    test_data[features] = get_z_score(test_data[features], mean_n_std_df)

    _append_row_statistics(train_data, features)
    _append_row_statistics(test_data, features)

    # Only the engineered summary columns (and non-feature columns) are kept.
    train_data = train_data.drop(columns=features)
    test_data = test_data.drop(columns=features)

    return train_data, test_data

In [10]:
# Build the engineered train / test frames from the label-encoded copies.
refine_data, refined_test_data = apply_feature_engineering(train_pp, test_pp)
refine_data.head(10)

Unnamed: 0_level_0,target,quantity_count_features,total_values,mean_row,std_row
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5,55,-13.762664,-0.183502,0.813145
1,5,68,-32.814388,-0.437525,0.368748
2,1,53,-5.441214,-0.07255,0.832202
3,7,39,30.065931,0.400879,1.221593
4,1,68,-30.050903,-0.400679,0.329306
5,7,51,-0.062717,-0.000836,0.932992
6,5,44,9.659543,0.128794,1.014399
7,2,26,53.245816,0.709944,1.272364
8,1,42,23.784872,0.317132,1.270051
9,7,59,-16.005275,-0.213404,0.620601


In [12]:
# Random sample of 15 engineered rows whose encoded target is 0.
refine_data[refine_data.target == 0].sample(15)

Unnamed: 0_level_0,target,quantity_count_features,total_values,mean_row,std_row
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
102489,0,52,-16.927375,-0.225698,0.732853
56622,0,37,14.444352,0.192591,1.130151
41144,0,47,-0.496883,-0.006625,1.025218
93421,0,51,-10.017278,-0.133564,0.822137
26291,0,32,38.807748,0.517437,1.430725
21144,0,72,-30.795244,-0.410603,0.418332
151224,0,58,-20.924761,-0.278997,0.62075
111930,0,54,-14.785214,-0.197136,0.662914
185658,0,39,16.534277,0.220457,1.15839
198117,0,50,-19.553434,-0.260712,0.679377


# Split data into 5 folds

In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 0
refine_data['fold'] = 0

# Positional location of the new column, so the assignment below is one
# `.iloc` call on the frame itself. The previous chained form
# `refine_data['fold'].iloc[...] = fold` wrote into a temporary Series: it emits
# SettingWithCopyWarning and does nothing when pandas copy-on-write is active.
fold_position = refine_data.columns.get_loc('fold')

for _, test_index in skf.split(refine_data.index, refine_data['target']):
    # Each row is tagged with the fold in which it belongs to the held-out part.
    refine_data.iloc[test_index, fold_position] = fold
    fold += 1

In [14]:
# Only the last expression of a cell is rendered, so this `head` call is not shown.
refine_data.head(5)
# Number of rows assigned to each fold.
np.unique(refine_data['fold'], return_counts=True)

(array([0, 1, 2, 3, 4], dtype=int64),
 array([40000, 40000, 40000, 40000, 40000], dtype=int64))

# Build model

In [15]:
# Fold 0 is held out for validation; every other fold is used for training.
in_validation_fold = refine_data['fold'] == 0
train = refine_data.loc[~in_validation_fold].reset_index(drop=True)
valid = refine_data.loc[in_validation_fold].reset_index(drop=True)

In [16]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.manifold import TSNE
import torch
from xgboost import XGBClassifier

def training_ss(train_data, valid_data, model="tabnet", target_column='target'):
    """Train a classifier on one train / validation split.

    Args:
        train_data: DataFrame with the features, ``target_column`` and a
            ``fold`` column.
        valid_data: DataFrame with the same columns, used for evaluation and
            early stopping.
        model: ``"tabnet"`` (case-insensitive) trains a ``TabNetClassifier``;
            any other value trains an ``XGBClassifier``.
        target_column: name of the label column.

    Returns:
        The fitted classifier.
    """
    print("Splitting data....")
    # `drop` returns new frames, so the caller's data is never modified.
    X_train = train_data.drop(columns=['fold'])
    X_valid = valid_data.drop(columns=['fold'])

    y_train = X_train.pop(target_column).to_numpy()
    y_valid = X_valid.pop(target_column).to_numpy()

    X_train = X_train.to_numpy()
    X_valid = X_valid.to_numpy()

    print("Start training....")
    if model.lower() == "tabnet":

        clf = TabNetClassifier(
            n_d=64, n_a=64, n_steps=5,
            gamma=1.5, n_independent=2, n_shared=2,
            lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=5e-3),
            scheduler_params={"gamma": 0.95,
                              "step_size": 20},
            scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
        )

        clf.fit(
            X_train=X_train, y_train=y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_name=['train', 'valid'],
            max_epochs=1000, patience=10,
            batch_size=1024, virtual_batch_size=128
        )

    else:
        # Pick the objective from the label set: the previous hard-coded
        # "binary:logistic" does not describe a target with more than two classes.
        n_classes = len(np.unique(y_train))
        objective = "multi:softprob" if n_classes > 2 else "binary:logistic"

        # `silent`, `nthread` and `seed` were passed as None before; `verbosity`,
        # `n_jobs` and `random_state` already cover them, so they are omitted.
        clf = XGBClassifier(
            max_depth=8,
            learning_rate=0.1,
            n_estimators=1000,
            verbosity=0,
            objective=objective,
            booster='gbtree',
            n_jobs=-1,
            gamma=0,
            min_child_weight=1,
            max_delta_step=0,
            subsample=0.7,
            colsample_bytree=1,
            colsample_bylevel=1,
            colsample_bynode=1,
            reg_alpha=0,
            reg_lambda=1,
            scale_pos_weight=1,
            base_score=0.5,
            random_state=0,
        )

        # NOTE(review): recent XGBoost releases expect `early_stopping_rounds` on
        # the estimator constructor rather than in `fit` - confirm against the
        # installed version.
        clf.fit(X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=200,
                verbose=10)
    return clf

In [17]:
# Train with the default model ("tabnet") on folds 1-4, validating on fold 0.
clf = training_ss(train, valid)

Splitting data....
Start training....
Device used : cuda
epoch 0  | loss: 1.92973 | train_accuracy: 0.24342 | valid_accuracy: 0.24    |  0:00:52s
epoch 1  | loss: 1.84437 | train_accuracy: 0.30631 | valid_accuracy: 0.30382 |  0:01:35s
epoch 2  | loss: 1.83815 | train_accuracy: 0.31076 | valid_accuracy: 0.30765 |  0:02:20s
epoch 3  | loss: 1.83048 | train_accuracy: 0.30339 | valid_accuracy: 0.29855 |  0:03:03s
epoch 4  | loss: 1.82966 | train_accuracy: 0.30764 | valid_accuracy: 0.30545 |  0:03:47s
epoch 5  | loss: 1.82713 | train_accuracy: 0.31568 | valid_accuracy: 0.3119  |  0:04:33s
epoch 6  | loss: 1.82409 | train_accuracy: 0.31649 | valid_accuracy: 0.31235 |  0:05:21s
epoch 7  | loss: 1.82304 | train_accuracy: 0.31105 | valid_accuracy: 0.30573 |  0:06:09s
epoch 8  | loss: 1.82263 | train_accuracy: 0.31277 | valid_accuracy: 0.30882 |  0:06:58s
epoch 9  | loss: 1.82085 | train_accuracy: 0.31686 | valid_accuracy: 0.31175 |  0:07:47s
epoch 10 | loss: 1.82176 | train_accuracy: 0.31641 | 

# Return result

In [18]:
# Sorted original class names; the submission cell uses this order for its columns.
np.unique(train_csv.target)

array(['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9'], dtype=object)

In [20]:
# Class probabilities for every engineered test row.
preds_test = clf.predict_proba(refined_test_data.to_numpy())

# One probability column per class, in sorted class-name order, next to the id.
class_labels = np.unique(train_csv.target)
data = {'id': refined_test_data.index}
data.update({label: preds_test[:, position] for position, label in enumerate(class_labels)})

output = pd.DataFrame(data)
output.to_csv('tabnet_.csv', index=False)