In [1]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
# Folder holding the three CSV splits; read in the order train, test, dev.
dataset_folder = './dataset/'
train_data, test_data, valid_data = (
    pd.read_csv(os.path.join(dataset_folder, csv_name))
    for csv_name in ('cv-valid-train.csv', 'cv-valid-test.csv', 'cv-valid-dev.csv')
)

# Class balance of the training split. The total counts every row, while the
# per-class figures only count rows whose label matches exactly, so rows with
# any other (or missing) label show up in the total only.
n_samples = train_data.shape[0]
n_male_samples = int((train_data['label'] == 'male').sum())
n_female_samples = int((train_data['label'] == 'female').sum())
for caption, count in (
    ("Total samples:", n_samples),
    ("Total male samples:", n_male_samples),
    ("Total female samples:", n_female_samples),
):
    print(caption, count)

Total samples: 82852
Total male samples: 61025
Total female samples: 20957


In [3]:
def split_dataframe(df: pd.DataFrame, is_train: bool, random_state=None):
    """Build a class-balanced, shuffled and standardised ``(X, y)`` pair.

    Rows whose ``label`` is neither ``'male'`` nor ``'female'`` are discarded.
    The larger class is randomly down-sampled to the size of the smaller one,
    the balanced rows are shuffled, and every column except ``label`` is used
    as a feature.

    Scaling uses the ``StandardScaler`` stored on ``split_dataframe.scaler``.
    That scaler is (re)fitted only when ``is_train`` is true, so the training
    split must be processed before any other split; otherwise ``transform``
    is called on an unfitted scaler.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``label`` column. All remaining columns are passed to
        the scaler (presumably numeric features -- confirm against the CSVs).
    is_train : bool
        When true, fit the shared scaler on this split before transforming.
    random_state : optional
        Forwarded to every ``DataFrame.sample`` call. The default ``None``
        keeps the previous unseeded behaviour; pass an int to make the
        down-sampling and shuffling reproducible across runs.

    Returns
    -------
    X : numpy.ndarray
        Scaled feature matrix of the balanced rows.
    y : numpy.ndarray
        Integer targets aligned with ``X``: 1 for male, 0 for female.

    Raises
    ------
    ValueError
        If ``df`` contains no ``'male'`` rows or no ``'female'`` rows.
    """
    label2int = {"male": 1, "female": 0}

    male = df.loc[df['label'] == 'male']
    female = df.loc[df['label'] == 'female']

    # Down-sample both classes to the size of the rarer one.
    n_per_class = min(len(male), len(female))
    if n_per_class == 0:
        # Fail early with a readable message instead of an obscure
        # "0 samples" error raised from inside the scaler.
        raise ValueError(
            "split_dataframe needs at least one 'male' and one 'female' row "
            f"(got {len(male)} male, {len(female)} female)"
        )

    balanced = pd.concat(
        [
            male.sample(n_per_class, random_state=random_state),
            female.sample(n_per_class, random_state=random_state),
        ],
        ignore_index=True,
    )
    # The concat puts all males before all females, so shuffle the rows.
    balanced = balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    X = balanced.drop(columns=['label']).to_numpy()
    y = balanced['label'].map(label2int).to_numpy()

    if is_train:
        # Scaling statistics come from the training split only.
        split_dataframe.scaler.fit(X)

    X = split_dataframe.scaler.transform(X)
    return X, y


# Scaler shared by all calls: fitted by the is_train=True call and reused as-is
# by every later call.
split_dataframe.scaler = StandardScaler()

In [4]:
# The training split goes first: it is the only call that fits the shared
# scaler, which the test and dev splits then reuse.
X_train, y_train = split_dataframe(train_data, is_train=True)
X_test, y_test = split_dataframe(test_data, is_train=False)
X_valid, y_valid = split_dataframe(valid_data, is_train=False)

In [10]:
# Single decision tree with default settings (seeded).
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

print("Decision Tree")
tree_train_acc = tree.score(X_train, y_train)
print(f"Accuracy on training set: {tree_train_acc:.3f}")
tree_test_acc = tree.score(X_test, y_test)
print(f"Accuracy on test set: {tree_test_acc:.3f}")
tree_valid_acc = tree.score(X_valid, y_valid)
print(f"Accuracy on valid set: {tree_valid_acc:.3f}")

Decision Tree
Accuracy on training set: 1.000
Accuracy on test set: 0.811
Accuracy on valid set: 0.830


In [11]:
# Small random forest: five trees, seeded.
forest = RandomForestClassifier(n_estimators=5, random_state=0)
forest.fit(X_train, y_train)

print("Random Forests")
forest_train_acc = forest.score(X_train, y_train)
print(f"Accuracy on training set: {forest_train_acc:.3f}")
forest_test_acc = forest.score(X_test, y_test)
print(f"Accuracy on test set: {forest_test_acc:.3f}")
forest_valid_acc = forest.score(X_valid, y_valid)
print(f"Accuracy on valid set: {forest_valid_acc:.3f}")

Random Forests
Accuracy on training set: 0.983
Accuracy on test set: 0.843
Accuracy on valid set: 0.854


In [12]:
# Gradient-boosted trees with default hyper-parameters (seeded).
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)

print("Gradient Boosting")
gbrt_train_acc = gbrt.score(X_train, y_train)
print(f"Accuracy on training set: {gbrt_train_acc:.3f}")
gbrt_test_acc = gbrt.score(X_test, y_test)
print(f"Accuracy on test set: {gbrt_test_acc:.3f}")
gbrt_valid_acc = gbrt.score(X_valid, y_valid)
print(f"Accuracy on valid set: {gbrt_valid_acc:.3f}")

Gradient Boosting
Accuracy on training set: 0.861
Accuracy on test set: 0.852
Accuracy on valid set: 0.863


In [13]:
# Support vector classifier with scikit-learn's default settings.
svm = SVC()
svm.fit(X_train, y_train)

print("Support Vector Machine")
svm_train_acc = svm.score(X_train, y_train)
print(f"Accuracy on training set: {svm_train_acc:.3f}")
svm_test_acc = svm.score(X_test, y_test)
print(f"Accuracy on test set: {svm_test_acc:.3f}")
svm_valid_acc = svm.score(X_valid, y_valid)
print(f"Accuracy on valid set: {svm_valid_acc:.3f}")

Support Vector Machine
Accuracy on training set: 0.883
Accuracy on test set: 0.875
Accuracy on valid set: 0.873


In [14]:
# Multilayer perceptron with default architecture (seeded).
mlp = MLPClassifier(random_state=0)
mlp.fit(X_train, y_train)

print("Multilayer Perceptron")
mlp_train_acc = mlp.score(X_train, y_train)
print(f"Accuracy on training set: {mlp_train_acc:.3f}")
mlp_test_acc = mlp.score(X_test, y_test)
print(f"Accuracy on test set: {mlp_test_acc:.3f}")
mlp_valid_acc = mlp.score(X_valid, y_valid)
print(f"Accuracy on valid set: {mlp_valid_acc:.3f}")

Multilayer Perceptron
Accuracy on training set: 0.899
Accuracy on test set: 0.882
Accuracy on valid set: 0.889


