# Sex by name using decision trees and random forest


In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Constants
max_length_name = 20

# function to convert a string to a list of ascii values
def transform_string(string):
   string = string.lower()
   ascii_list = [ord(c) for c in string]
   list_ = ascii_list + [0] * (max_length_name - len(ascii_list))
   return np.array(list_).astype(np.float32)

In [3]:
#load data
names = pd.read_csv('unificado.csv')

#explore data
print(names.head())
print(names.info())

      name  sex
0    aaden    1
1  aaliyah    0
2    aamir    1
3    aarav    1
4    aaron    1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12312 entries, 0 to 12311
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    12311 non-null  object
 1   sex     12312 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 192.5+ KB
None


In [4]:
# create X and y
X = np.asarray(names['name'].values.tolist())
y = np.asarray(names['sex'].values.tolist())

X = np.array([transform_string(val) for val in X])
# create train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Decision Trees

In [5]:
tree_model = DecisionTreeClassifier()
history = tree_model.fit(X_train, y_train)

In [6]:
tree_model_pred = tree_model.predict(X_test)
print('Accuracy of Decision Tree Model: ', accuracy_score(y_test, tree_model_pred))

Accuracy of Decision Tree Model:  0.6645549057829759


# Random Forest

In [7]:
random_model = RandomForestClassifier()
history = random_model.fit(X_train, y_train)

In [8]:
random_model_pred = random_model.predict(X_test)
print('Accuracy of Random Forest Model: ', accuracy_score(y_test, random_model_pred))

Accuracy of Random Forest Model:  0.725471085120208


In [9]:
def predict(name, model_loaded):
    data = np.array([transform_string(val) for val in name])
    result = model_loaded.predict(data)
    return ['m' if logit > 0.5 else 'f' for logit in result]

In [10]:
test_data = ['Roberto', 'Stephen', 'Allan', 'Maria', 'Carla', 'Gilberto']
predictions_trees = predict(test_data, tree_model)
predictions_random = predict(test_data, random_model)
print('Predictions for Decision Tree Model: ', predictions_trees)
print('Predictions for Random Forest Model: ', predictions_random)

Predictions for Decision Tree Model:  ['m', 'm', 'm', 'm', 'f', 'm']
Predictions for Random Forest Model:  ['m', 'm', 'm', 'm', 'f', 'm']
