In [23]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
data=pd.read_csv('..\\data\\mm_names.csv', index_col = 0)

# Display the first few rows of the dataset to understand its structure
# data.head()
data.dropna(inplace=True)
data['Name'] = data['Name'].str.lower().replace(' ', '_')
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})


In [24]:
# Preprocess the data
# Assuming the last column is the target variable and the rest are features

# X = data.iloc[:, :-1]  # Features
# y = data.iloc[:, -1]   # Target

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Name'], data['Gender'], test_size=0.2, random_state=42)

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])
# Define the classifiers and their parameter grids
models = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.01, 0.1, 1, 10, 100],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'K-Nearest Neighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance']
        }
    },
    'Support Vector Machine': {
        'model': SVC(),
        'params': {
            'C': [0.01, 0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf']
        }
    },
    'Naive Bayes': {
        'model': GaussianNB(),
        'params': {}
    }
}

# Perform GridSearchCV for each classifier
best_estimators = {}
for model_name, model_info in models.items():
    clf = GridSearchCV(model_info['model'], model_info['params'], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    best_estimators[model_name] = clf.best_estimator_
    print(f"Best parameters for {model_name}: {clf.best_params_}")

# Evaluate the best models
for model_name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    print(f"\nClassification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))



NameError: name 'numerical_cols' is not defined