In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the training split into a DataFrame; the bare `data` expression on the
# last line makes the notebook render it as a table.
train_csv_path = 'project_data/data/train.csv'
data = pd.read_csv(train_csv_path, sep=',')

data

In [None]:
# Count the missing (NaN) entries in every column of the training data.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Show pandas' per-column summary statistics for the training data.
summary_stats = data.describe()
summary_stats

In [None]:
# NOTE: Run the next 3 cells back to back before running the perceptron code.
# The test and eval cells reuse the `vt` and `scaler` objects fitted here. Later cells in
# this notebook rebind those names, so re-run this cell first if any of them ran in between.

# Pre-process the train data for Perceptron.
import os

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs('output', exist_ok=True)

# Low-variance features are dropped (threshold 0.01), then the rest are standardized.
scaler = StandardScaler()
vt = VarianceThreshold(0.01)

# The first column is treated as the label, the remaining columns as features.
p_x = data.iloc[:, 1:]
p_y = data.iloc[:, 0]
p_x = vt.fit_transform(p_x)
p_x = scaler.fit_transform(p_x)

# Replace 0 with -1 to fit design of custom perceptron implementation.
p_y = p_y.replace(0, -1)

# Reuse the label index for the feature frame so concat(axis=1) pairs rows one-to-one
# even if `data` does not carry a default 0..n-1 index.
pd.concat([p_y, pd.DataFrame(p_x, index=p_y.index)], axis=1).to_csv('output/perceptron_train.csv', index=False)

In [None]:
# Apply the perceptron pre-processing to the test split.
# Uses `vt` and `scaler` as fitted by the perceptron train cell (transform only, no refit).
test_data = pd.read_csv('project_data/data/test.csv', sep=',')

# The first column is treated as the label; 0 is remapped to -1 to fit the design of the
# custom perceptron implementation. The remaining columns are the features.
p_test_y = test_data.iloc[:, 0].replace(0, -1)
p_test_x = scaler.transform(vt.transform(test_data.iloc[:, 1:]))

# Write label and transformed features side by side.
perceptron_test_frame = pd.concat([p_test_y, pd.DataFrame(p_test_x)], axis=1)
perceptron_test_frame.to_csv('output/perceptron_test.csv', index=False)

In [None]:
# Apply the perceptron pre-processing to the eval split.
# Uses `vt` and `scaler` as fitted by the perceptron train cell (transform only, no refit).
eval_data = pd.read_csv('project_data/data/eval.anon.csv', sep=',')

# The first column is treated as the label; 0 is remapped to -1 to fit the design of the
# custom perceptron implementation. The remaining columns are the features.
p_eval_y = eval_data.iloc[:, 0].replace(0, -1)
p_eval_x = scaler.transform(vt.transform(eval_data.iloc[:, 1:]))

# Write label and transformed features side by side.
perceptron_eval_frame = pd.concat([p_eval_y, pd.DataFrame(p_eval_x)], axis=1)
perceptron_eval_frame.to_csv('output/perceptron_eval.csv', index=False)

In [None]:
# NOTE: Run the perceptron code in the perceptron directory to get the predictions first.

# Post-process the perceptron predictions for submission.
eval_id_data = pd.read_csv('project_data/data/eval.id', header=None)
perceptron_preds = pd.read_csv('perceptron/perceptron_preds_no_id.csv', delimiter=',')

# Match the eval example ids with the perceptron predictions.
eval_id_data.columns = ['id']

# concat(axis=1) aligns on the index and pads the shorter side with NaN, so a row-count
# mismatch would silently yield a broken submission. Fail loudly instead.
if len(eval_id_data) != len(perceptron_preds):
    raise ValueError(
        f'eval.id has {len(eval_id_data)} rows but the perceptron predictions have '
        f'{len(perceptron_preds)} rows'
    )

# Get rid of the label column since we just want the example ids and the predictions.
perceptron_margin_data = pd.concat([eval_id_data, perceptron_preds], axis=1).drop(columns=['label'])

# Rename the two remaining columns (id, prediction) for submission.
perceptron_margin_data.columns = ['example_id', 'label']

# Save the perceptron submission data out.
perceptron_margin_data.to_csv('output/perceptron_margin_submission.csv', index=False)

perceptron_margin_data


In [None]:
# NOTE: Run the next 3 cells back to back before running the ID3 code.
# The test and eval cells reuse the `vt` and `discretizer` objects fitted here. Other cells
# in this notebook rebind `vt`, so re-run this cell first if any of them ran in between.

# Pre-process the train data for ID3.
import os

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_selection import VarianceThreshold

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs('output', exist_ok=True)

# Discretize every feature into 5 equal-width bins, encoded as ordinal bin indices.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
# Drop low-variance features (threshold 0.01, which also catches constant columns)
# to speed up the algo.
vt = VarianceThreshold(threshold=0.01)

# The first column is treated as the label, the remaining columns as features.
id3_x = data.iloc[:, 1:]
id3_y = data.iloc[:, 0]
id3_x = vt.fit_transform(id3_x)
id3_x_discretized = discretizer.fit_transform(id3_x)

# Save the discretized data out. The feature frame reuses the label index so that
# concat(axis=1) pairs rows one-to-one even if `data` has a non-default index.
pd.concat([id3_y, pd.DataFrame(id3_x_discretized, index=id3_y.index)], axis=1).to_csv('output/id3_train_discretized.csv', index=False)

In [None]:
# Apply the ID3 pre-processing to the test split.
# Uses `vt` and `discretizer` as fitted by the ID3 train cell (transform only, no refit).
test_data = pd.read_csv('project_data/data/test.csv', sep=',')

# The first column is treated as the label; the remaining columns are reduced by `vt`.
id3_y = test_data.iloc[:, 0]
id3_x = vt.transform(test_data.iloc[:, 1:])
id3_test_x_discretized = discretizer.transform(id3_x)

# Write label and discretized features side by side.
id3_test_frame = pd.concat([id3_y, pd.DataFrame(id3_test_x_discretized)], axis=1)
id3_test_frame.to_csv('output/id3_test_discretized.csv', index=False)

In [None]:
# Apply the ID3 pre-processing to the eval split.
# Uses `vt` and `discretizer` as fitted by the ID3 train cell (transform only, no refit).
eval_data = pd.read_csv('project_data/data/eval.anon.csv', sep=',')

# The first column is treated as the label; the remaining columns are reduced by `vt`.
id3_y = eval_data.iloc[:, 0]
id3_x = vt.transform(eval_data.iloc[:, 1:])
id3_eval_x_discretized = discretizer.transform(id3_x)

# Write label and discretized features side by side.
id3_eval_frame = pd.concat([id3_y, pd.DataFrame(id3_eval_x_discretized)], axis=1)
id3_eval_frame.to_csv('output/id3_eval_discretized.csv', index=False)

In [None]:
# NOTE: Run the id3 code in the id3 directory to get the predictions first.

# Post-process the ID3 predictions for submission.

eval_id_data = pd.read_csv('project_data/data/eval.id', header=None)
id3_preds = pd.read_csv('id3/id3_preds_no_id.csv', delimiter=',')

# Match the eval example ids with the ID3 predictions.
eval_id_data.columns = ['id']

# concat(axis=1) aligns on the index and pads the shorter side with NaN, so a row-count
# mismatch would silently yield a broken submission. Fail loudly instead.
if len(eval_id_data) != len(id3_preds):
    raise ValueError(
        f'eval.id has {len(eval_id_data)} rows but the ID3 predictions have '
        f'{len(id3_preds)} rows'
    )

# Get rid of the label column since we just want the example ids and the predictions.
id3_data = pd.concat([eval_id_data, id3_preds], axis=1).drop(columns=['label'])

# Rename the two remaining columns (id, prediction) for submission.
id3_data.columns = ['example_id', 'label']

# Save the ID3 submission data out.
id3_data.to_csv('output/id3_submission.csv', index=False)

id3_data

In [None]:
# NOTE: Run the adaboost code in the adaboost directory to get the predictions first.

# Post-process the AdaBoost predictions for submission.
eval_id_data = pd.read_csv('project_data/data/eval.id', header=None)
adaboost_preds = pd.read_csv('adaboost/adaboost_preds_no_id.csv', delimiter=',')

# Match the eval example ids with the AdaBoost predictions.
eval_id_data.columns = ['id']

# concat(axis=1) aligns on the index and pads the shorter side with NaN, so a row-count
# mismatch would silently yield a broken submission. Fail loudly instead.
if len(eval_id_data) != len(adaboost_preds):
    raise ValueError(
        f'eval.id has {len(eval_id_data)} rows but the AdaBoost predictions have '
        f'{len(adaboost_preds)} rows'
    )

# Get rid of the label column since we just want the example ids and the predictions.
adaboost_data = pd.concat([eval_id_data, adaboost_preds], axis=1).drop(columns=['label'])

# Rename the two remaining columns (id, prediction) for submission.
adaboost_data.columns = ['example_id', 'label']

# Save the AdaBoost submission data out.
adaboost_data.to_csv('output/adaboost_submission.csv', index=False)

adaboost_data

In [None]:
# NOTE: Run the next 3 cells back to back before running the SVM/Logistic Regression code.
# The test and eval cells reuse the `scaler` fitted here. Other cells in this notebook
# rebind `scaler`, so re-run this cell first if any of them ran in between.

# Pre-process the train data for SVM/Logistic Regression.
import os

from sklearn.preprocessing import StandardScaler

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs('output', exist_ok=True)

# Use this scaler to normalize the train data.
scaler = StandardScaler()

# The first column is treated as the label, the remaining columns as features.
p_x = data.iloc[:, 1:]
p_y = data.iloc[:, 0]
p_x_norm = scaler.fit_transform(p_x)

# Reuse the label index for the feature frame so concat(axis=1) pairs rows one-to-one
# even if `data` does not carry a default 0..n-1 index.
pd.concat([p_y, pd.DataFrame(p_x_norm, index=p_y.index)], axis=1).to_csv('output/svm_logreg_train.csv', index=False)

In [None]:
# Apply the SVM/Logistic Regression pre-processing to the test split.
# Uses `scaler` as fitted by the SVM/Logistic Regression train cell (transform only).
test_data = pd.read_csv('project_data/data/test.csv', sep=',')

# The first column is treated as the label, the remaining columns as features.
p_test_y = test_data.iloc[:, 0]
p_test_x = test_data.iloc[:, 1:]
p_test_x_norm = scaler.transform(p_test_x)

# Write label and normalized features side by side.
svm_logreg_test_frame = pd.concat([p_test_y, pd.DataFrame(p_test_x_norm)], axis=1)
svm_logreg_test_frame.to_csv('output/svm_logreg_test.csv', index=False)

In [None]:
# Apply the SVM/Logistic Regression pre-processing to the eval split.
# Uses `scaler` as fitted by the SVM/Logistic Regression train cell (transform only).
eval_data = pd.read_csv('project_data/data/eval.anon.csv', sep=',')

# The first column is treated as the label, the remaining columns as features.
p_eval_y = eval_data.iloc[:, 0]
p_eval_x = eval_data.iloc[:, 1:]
p_eval_x_norm = scaler.transform(p_eval_x)

# Write label and normalized features side by side.
svm_logreg_eval_frame = pd.concat([p_eval_y, pd.DataFrame(p_eval_x_norm)], axis=1)
svm_logreg_eval_frame.to_csv('output/svm_logreg_eval.csv', index=False)

In [None]:
# NOTE: Run the SVM code in the svm_logreg directory to get the predictions first.

# Post-process the SVM predictions for submission.
eval_id_data = pd.read_csv('project_data/data/eval.id', header=None)
svm_preds = pd.read_csv('svm_logreg/svm_logreg_preds_no_id.csv', delimiter=',')

# Match the eval example ids with the SVM predictions.
eval_id_data.columns = ['id']

# concat(axis=1) aligns on the index and pads the shorter side with NaN, so a row-count
# mismatch would silently yield a broken submission. Fail loudly instead.
if len(eval_id_data) != len(svm_preds):
    raise ValueError(
        f'eval.id has {len(eval_id_data)} rows but the SVM predictions have '
        f'{len(svm_preds)} rows'
    )

# Get rid of the label column since we just want the example ids and the predictions.
svm_data = pd.concat([eval_id_data, svm_preds], axis=1).drop(columns=['label'])

# Rename the two remaining columns (id, prediction) for submission.
svm_data.columns = ['example_id', 'label']

# Save the SVM submission data out.
svm_data.to_csv('output/svm_submission.csv', index=False)

svm_data

In [None]:
# NOTE: Run the Logistic Regression code in the svm_logreg directory to get the predictions first.

# Post-process the Logistic Regression predictions for submission.

# NOTE(review): this reads the same predictions file as the SVM post-processing cell.
# Presumably the svm_logreg code overwrites it depending on which model was run last;
# confirm the file holds Logistic Regression output before submitting.
eval_id_data = pd.read_csv('project_data/data/eval.id', header=None)
logreg_preds = pd.read_csv('svm_logreg/svm_logreg_preds_no_id.csv', delimiter=',')

# Match the eval example ids with the Logistic Regression predictions.
eval_id_data.columns = ['id']

# concat(axis=1) aligns on the index and pads the shorter side with NaN, so a row-count
# mismatch would silently yield a broken submission. Fail loudly instead.
if len(eval_id_data) != len(logreg_preds):
    raise ValueError(
        f'eval.id has {len(eval_id_data)} rows but the Logistic Regression predictions have '
        f'{len(logreg_preds)} rows'
    )

# Get rid of the label column since we just want the example ids and the predictions.
logreg_data = pd.concat([eval_id_data, logreg_preds], axis=1).drop(columns=['label'])

# Rename the two remaining columns (id, prediction) for submission.
logreg_data.columns = ['example_id', 'label']

# Save the Logistic Regression submission data out.
logreg_data.to_csv('output/logreg_submission.csv', index=False)

logreg_data

In [None]:
# NOTE: Run the next 3 cells back to back before running the perceptron code (non-trivial pre-processing).
# The test and eval cells reuse the `vt` and `scaler` objects fitted here. Other cells in
# this notebook rebind those names, so re-run this cell first if any of them ran in between.

# Pre-process the train data for Perceptron (non-trivial pre-processing).
import os

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs('output', exist_ok=True)

# Low-variance features are dropped (threshold 0.01), the rest are standardized, and the
# training set is then resampled with SMOTE (seeded for reproducibility).
scaler = StandardScaler()
vt = VarianceThreshold(0.01)
smote = SMOTE(random_state=42)

# The first column is treated as the label, the remaining columns as features.
p_x = data.iloc[:, 1:]
p_y = data.iloc[:, 0]
p_x = vt.fit_transform(p_x)
p_x = scaler.fit_transform(p_x)

# Only the train split is resampled; the test/eval cells just apply `vt` and `scaler`.
p_x_resampled, p_y_resampled = smote.fit_resample(p_x, p_y)

# Replace 0 with -1 to fit design of custom perceptron implementation.
p_y_resampled = p_y_resampled.replace(0, -1)

# Reuse the resampled label index for the feature frame so concat(axis=1) pairs rows
# one-to-one regardless of the index the resampler returns.
pd.concat([p_y_resampled, pd.DataFrame(p_x_resampled, index=p_y_resampled.index)], axis=1).to_csv('output/perceptron_train_smote.csv', index=False)

In [None]:
# Apply the perceptron (non-trivial pre-processing) transforms to the test split.
# Uses `vt` and `scaler` as fitted by the matching train cell (transform only, no refit).
test_data = pd.read_csv('project_data/data/test.csv', sep=',')

# The first column is treated as the label; 0 is remapped to -1 to fit the design of the
# custom perceptron implementation. The remaining columns are the features.
p_test_y = test_data.iloc[:, 0].replace(0, -1)
p_test_x = scaler.transform(vt.transform(test_data.iloc[:, 1:]))

# Write label and transformed features side by side.
perceptron_test_smote_frame = pd.concat([p_test_y, pd.DataFrame(p_test_x)], axis=1)
perceptron_test_smote_frame.to_csv('output/perceptron_test_smote.csv', index=False)

In [None]:
# Apply the perceptron (non-trivial pre-processing) transforms to the eval split.
# Uses `vt` and `scaler` as fitted by the matching train cell (transform only, no refit).
eval_data = pd.read_csv('project_data/data/eval.anon.csv', sep=',')

# The first column is treated as the label; 0 is remapped to -1 to fit the design of the
# custom perceptron implementation. The remaining columns are the features.
p_eval_y = eval_data.iloc[:, 0].replace(0, -1)
p_eval_x = scaler.transform(vt.transform(eval_data.iloc[:, 1:]))

# Write label and transformed features side by side.
perceptron_eval_smote_frame = pd.concat([p_eval_y, pd.DataFrame(p_eval_x)], axis=1)
perceptron_eval_smote_frame.to_csv('output/perceptron_eval_smote.csv', index=False)

In [None]:
# NOTE: Run the perceptron code in the perceptron directory to get the predictions first.
# NOTE: Change the train and test data path in perceptron/data.py to the smote data.

# Post-process the perceptron predictions for submission.
eval_id_data = pd.read_csv('project_data/data/eval.id', header=None)
perceptron_smote_preds = pd.read_csv('perceptron/perceptron_preds_no_id.csv', delimiter=',')

# Match the eval example ids with the perceptron predictions.
eval_id_data.columns = ['id']

# concat(axis=1) aligns on the index and pads the shorter side with NaN, so a row-count
# mismatch would silently yield a broken submission. Fail loudly instead.
if len(eval_id_data) != len(perceptron_smote_preds):
    raise ValueError(
        f'eval.id has {len(eval_id_data)} rows but the perceptron predictions have '
        f'{len(perceptron_smote_preds)} rows'
    )

# Get rid of the label column since we just want the example ids and the predictions.
perceptron_margin_smote_data = pd.concat([eval_id_data, perceptron_smote_preds], axis=1).drop(columns=['label'])

# Rename the two remaining columns (id, prediction) for submission.
perceptron_margin_smote_data.columns = ['example_id', 'label']

# Save the perceptron submission data out.
perceptron_margin_smote_data.to_csv('output/perceptron_margin_smote_submission.csv', index=False)

perceptron_margin_smote_data