In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Setting the paths to the data
# Both paths are relative, so they resolve against the notebook's working directory.
# NOTE(review): TEST_DATA_PATH is not read by any cell visible in this notebook.

TRAIN_DATA_PATH = "../input/tabular-playground-series-may-2022/train.csv"
TEST_DATA_PATH  = "../input/tabular-playground-series-may-2022/test.csv"

## Train Data

In [None]:
# Load the training set and report its dimensions

train = pd.read_csv(TRAIN_DATA_PATH)
n_rows, n_cols = train.shape
print(f"The shape of train is: {n_rows} observations and {n_cols} columns.")
train.head()

### INSIGHTS
* 900,000 observations
* There are 33 columns in total
    * 1 id
    * 31 features (labeled "f_XX")
    * 1 target
* The features are a combination of:
    * Numerical
        * continuous
        * possibly ordinal ("f_07", "f_08")
    * Categorical ("f_27")

In [None]:
# Summary statistics of the numerical columns, one row per column

train.describe().transpose()

### INSIGHTS

* continuous values 
    * ranges are similar
        * except for "f_28" (ranges from -1229 to 1229)
    * most of the features seem to have a mean close to 0
        * might add a bias term (i.e. subtract the mean) so that the mean is exactly 0
* f_07 -> f_18, f_29, and f_30 look like discrete values
    * their quartile values are whole numbers
* f_27 is not on there because it is not a numerical column

In [None]:
# Plotting distribution of the numerical features

# `numerical_columns` was never defined in this notebook, so this cell (and the
# later cells that reuse the name) failed with NameError on a fresh kernel.
# Every numeric column except the row id and the label counts as a feature.
numerical_columns = [
    col for col in train.select_dtypes(include="number").columns
    if col not in ("id", "target")
]

n_plot_cols = 5
# At least one row so plt.subplots never receives a zero-sized grid.
n_plot_rows = max(1, int(np.ceil(len(numerical_columns) / n_plot_cols)))

# squeeze=False keeps `axes` two-dimensional even when the grid has one row.
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(12, 12), squeeze=False)
fig.suptitle('Distribution of the Numerical Columns')

flat_axes = axes.ravel()
for ax, col in zip(flat_axes, numerical_columns):
    ax.hist(train[col])
    ax.set_title(col)

# Hide the panels of the last row that have no column to show.
for ax in flat_axes[len(numerical_columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Partition the numerical columns into discrete and continuous ones

# f_07 ... f_18 plus f_29 and f_30
discrete_variables = [f"f_{idx:02d}" for idx in (*range(7, 19), 29, 30)]
continuous_variables = list(
    filter(lambda col: col not in discrete_variables, numerical_columns)
)

In [None]:
# Heatmap of the pairwise correlations between the numerical columns

plt.figure(figsize=(14, 8))
sns.heatmap(data=train.loc[:, numerical_columns].corr());

### Insights
The variables barely correlate with each other

### Column "f_27"

In [None]:
# How often each distinct f_27 string occurs
train.loc[:, 'f_27'].value_counts()

In [None]:
# Share of all rows taken by each distinct f_27 string, also kept as a dict
t = train['f_27'].value_counts().div(len(train))
t_dict = t.to_dict()

* split it into characters
* create a value for each character
    * average all values in that observation
    * number of vowels vs consonants

In [None]:
# Split every f_27 string into its individual characters.
# `.str.split("")` splits on the empty pattern, which typically also yields an
# empty string at both ends of each list; `list(...)` returns just the
# characters. Missing values are passed through untouched.
t = train['f_27'].map(list, na_action="ignore")

In [None]:
# Inspect the split result for the row labelled 0
t.loc[0]

### Insights
There seem to be too many distinct, single-occurrence strings among the values. It might be best to ignore this column for now.

### Column "f_30"

In [None]:
# Number of rows for every (f_30, target) combination
train.groupby(by=['f_30', 'target'])['target'].count()

## Baseline Model
Going to run a random forest to determine a baseline value

In [None]:
# Label column, and every "f_" column except the string-valued f_27 as model input
target = 'target'
features = [name for name in train.columns if name != 'f_27' and 'f_' in name]

In [None]:
# Hold out 20% of the rows, keeping the class balance of the target in both parts

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train.loc[:, features],
    train.loc[:, target],
    stratify=train.loc[:, target],
    random_state=0,
    test_size=0.2,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's randomness so the baseline score is
# reproducible (same seed as the train/test split above); n_jobs=-1 grows the
# trees on all available cores.
rf = RandomForestClassifier(random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)

In [None]:
# Viewing the accuracy of the baseline model on the train and test splits

y_train_pred = rf.predict(X_train)
train_accuracy = np.mean(y_train == y_train_pred)
y_test_pred = rf.predict(X_test)
test_accuracy = np.mean(y_test == y_test_pred)
# The accuracies are fractions in [0, 1]; the ".2%" format spec scales them by
# 100 so the printed percent sign is actually correct.
print(f"Train accuracy: {train_accuracy:.2%}")
print(f"Test accuracy: {test_accuracy:.2%}")

In [None]:
!pip install tensorflow_decision_forests

In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf

In [None]:
# Columns handed to TF-DF: everything except the row id and the string column f_27
tf_columns = [name for name in train.columns if name != 'id' and name != 'f_27']

In [None]:
def split_dataset(dataset, test_ratio=0.30, seed=None):
    """Randomly split a pandas DataFrame into a train part and a test part.

    Each row is assigned to the test part independently with probability
    ``test_ratio``, so the resulting sizes are only approximately
    proportional to the ratio.

    Args:
        dataset: DataFrame to split.
        test_ratio: Probability of a row landing in the test part.
        seed: Optional seed for a dedicated random generator, making the
            split reproducible. When ``None`` (the default) the global NumPy
            random state is used, exactly as before.

    Returns:
        A ``(train_part, test_part)`` tuple of DataFrames.
    """
    if seed is None:
        draws = np.random.rand(len(dataset))
    else:
        draws = np.random.default_rng(seed).random(len(dataset))
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

In [None]:
# Seed NumPy's global random state right before the split: split_dataset draws
# from it, so without a seed the train/test partition (and every metric
# computed from it) changes on each run.
np.random.seed(0)
tf_train, tf_test = split_dataset(train[tf_columns])

In [None]:
# Convert both pandas splits into TF datasets, using 'target' as the label
train_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label='target')
    for frame in (tf_train, tf_test)
)

In [None]:
# Build a TF-DF random forest with no arguments overridden ...
model_1 = tfdf.keras.RandomForestModel()

# ... and fit it on the training dataset.
model_1.fit(train_ds)

In [None]:
# Evaluate on the held-out dataset and print every returned metric
model_1.compile(metrics=["accuracy"])
evaluation = model_1.evaluate(test_ds, return_dict=True)
print()

for metric_name in evaluation:
    print(f"{metric_name}: {evaluation[metric_name]:.4f}")

In [None]:
# Plot the tree with index 0 of the forest, limited to a depth of 3.
# NOTE(review): the helper is named for Colab — confirm it renders in the
# environment this notebook actually runs in.
tfdf.model_plotter.plot_model_in_colab(model_1, tree_idx=0, max_depth=3)

In [None]:
# Show the summary of the trained model_1
model_1.summary()

In [None]:
# List all variable importances exposed by the model inspector
model_1.make_inspector().variable_importances()

In [None]:
# Keep only the 'NUM_NODES' importance entries; they feed the bar chart further down
num_nodes = model_1.make_inspector().variable_importances()['NUM_NODES']

In [None]:
# Peek at element [1] of the first NUM_NODES entry (the value later collected as its importance)
num_nodes[0][1]

In [None]:
features = []
feature_importances = []

for feature in num_nodes:
    features.append(feature[0][0])
    feature_importances.append(feature[1])

In [None]:
plt.figure(figsize=(14,6))
plt.bar(features, feature_importances)
plt.title('Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importances');

In [None]:
# Show the evaluation stored in the model inspector
# NOTE(review): presumably the model's self-evaluation from training — confirm.
model_1.make_inspector().evaluation()

In [None]:
# Training-log curves: accuracy and log loss against the number of trees
logs = model_1.make_inspector().training_logs()
tree_counts = [entry.num_trees for entry in logs]

plt.figure(figsize=(12, 4))

panels = (
    ("Accuracy (out-of-bag)", [entry.evaluation.accuracy for entry in logs]),
    ("Logloss (out-of-bag)", [entry.evaluation.loss for entry in logs]),
)
for position, (y_label, values) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(tree_counts, values)
    plt.xlabel("Number of trees")
    plt.ylabel(y_label)

plt.show()

In [None]:
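# TODO: classification report for the sklearn baseline (left disabled).
# The original draft referenced `y_pred`, which is not defined anywhere in this
# notebook; the held-out predictions are stored in `y_test_pred`.
# from sklearn.metrics import classification_report

# print(classification_report(y_test, y_test_pred))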