In [16]:
import pandas as pd

df = pd.read_csv('../data/mm_names.csv',index_col=0)
df.dropna(inplace=True)
df['Name'] = df['Name'].str.lower().replace(' ', '_')
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})
# female_data = df[df['Gender'] == 'Female']
# female_data.shape
df.head()

Unnamed: 0,Name,Gender
0,aungkyi,1
1,aungmay,1
2,aye,1
3,ayeaye,1
4,ayeayeaung,1


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Name'], df['Gender'], test_size=0.2, random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),  # Include unigrams and bigrams
    ('tfidf', TfidfTransformer()),  # Apply TF-IDF transformation
    ('clf', MultinomialNB(alpha=0.6))  # Adjust alpha parameter for smoothing
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict the gender for the test data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

name = "Than Zaw Toe"  # Replace with the name you want to predict
y_pred = pipeline.predict([name])[0]
gender = pd.Series(y_pred).map({0: 'male', 1: 'female'}).to_string().split()[1]

print("Predicted gender:", gender)

Accuracy Score: 0.48262910798122066
Predicted gender: female


In [18]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc


ypred_test = pipeline.predict(X_test)
mat_clf = confusion_matrix(y_test, ypred_test)
report_clf = classification_report(y_test, ypred_test)

print("Confusion Matrix")
print(mat_clf)
print("\nClassificaton Report")
print(report_clf)

ypred_testP = pipeline.predict_proba(X_test)
auc = roc_auc_score(y_test, ypred_testP[:,1])
print(auc)

Confusion Matrix
[[513  81]
 [470   1]]

Classificaton Report
              precision    recall  f1-score   support

           0       0.52      0.86      0.65       594
           1       0.01      0.00      0.00       471

    accuracy                           0.48      1065
   macro avg       0.27      0.43      0.33      1065
weighted avg       0.30      0.48      0.36      1065

0.3732870102296854


In [19]:
ypred_train = pipeline.predict(X_train)
mat_clf = confusion_matrix(y_train, ypred_train)
report_clf = classification_report(y_train, ypred_train)

print(mat_clf)
print(report_clf)

ypred_trainP = pipeline.predict_proba(X_train)
auc = roc_auc_score(y_train, ypred_trainP[:,1])
print(auc)

[[2426    0]
 [ 238 1594]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2426
           1       1.00      0.87      0.93      1832

    accuracy                           0.94      4258
   macro avg       0.96      0.94      0.94      4258
weighted avg       0.95      0.94      0.94      4258

0.99362753215709
