In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import numpy as np

# Load data
df = pd.read_csv('transcriptions_with_sex.csv')

# Define X (features) and y (target)
X = df['groundtruth']
y = df['sex']

# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define transformers for different feature types
surname_vectorizer = TfidfVectorizer()
firstname_vectorizer = TfidfVectorizer()
occupation_encoder = OneHotEncoder()
patron_encoder = OneHotEncoder()
age_scaler = StandardScaler()

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'XGBoost': XGBClassifier()
    #'LightGBM': LGBMClassifier()
}

# Define results dataframe
results_df = pd.DataFrame(columns=['Vectorizer', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'])

# Iterate over vectorization techniques
vectorizers = {
    'Surname': surname_vectorizer,
    'Firstname': firstname_vectorizer
}

for vec_name, vectorizer in vectorizers.items():
    # Fit and transform vectorizer on training data
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Iterate over models
    for model_name, model in models.items():
        # Fit model on vectorized data
        model.fit(X_train_vec, y_train)

        # Make predictions
        y_pred = model.predict(X_test_vec)

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary')
        recall = recall_score(y_test, y_pred, average='binary')
        f1 = f1_score(y_test, y_pred, average='binary')
        roc_auc = roc_auc_score(y_test, y_pred)

        # Append results to dataframe
        results_df = results_df.append({'Vectorizer': vec_name,
                                        'Model': model_name,
                                        'Accuracy': accuracy,
                                        'Precision': precision,
                                        'Recall': recall,
                                        'F1-Score': f1,
                                        'ROC-AUC': roc_auc}, ignore_index=True)

# Display results dataframe
print("Results for Surname and Firstname Vectorization:")
display(results_df)

# Reset results dataframe for occupation and patron link
results_df = pd.DataFrame(columns=['Vectorizer', 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'])

# Fit and transform one-hot encoders for occupation and patron link
X_train_occ = occupation_encoder.fit_transform(X_train[['occupation']])
X_test_occ = occupation_encoder.transform(X_test[['occupation']])

X_train_patron = patron_encoder.fit_transform(X_train[['patron_link']])
X_test_patron = patron_encoder.transform(X_test[['patron_link']])

# Concatenate numerical age features
X_train_age = age_scaler.fit_transform(X_train[['age']])
X_test_age = age_scaler.transform(X_test[['age']])

# Concatenate all features
X_train_final = np.concatenate([X_train_occ.toarray(), X_train_patron.toarray(), X_train_age], axis=1)
X_test_final = np.concatenate([X_test_occ.toarray(), X_test_patron.toarray(), X_test_age], axis=1)

# Iterate over models for occupation and patron link
for model_name, model in models.items():
    # Fit model on concatenated data
    model.fit(X_train_final, y_train)

    # Make predictions
    y_pred = model.predict(X_test_final)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    roc_auc = roc_auc_score(y_test, y_pred)

    # Append results to dataframe
    results_df = results_df.append({'Vectorizer': 'Occupation and Patron Link',
                                    'Model': model_name,
                                    'Accuracy': accuracy,
                                    'Precision': precision,
                                    'Recall': recall,
                                    'F1-Score': f1,
                                    'ROC-AUC': roc_auc}, ignore_index=True)

# Display results dataframe
print("\nResults for Occupation and Patron Link Vectorization:")
display(results_df)

ValueError: pos_label=1 is not a valid label. It should be one of ['femme', 'homme']