In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('gender_classification_v7.csv')

In [5]:
df.shape

(5001, 8)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


In [7]:
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


In [8]:
df.isnull().sum()

Unnamed: 0,0
long_hair,0
forehead_width_cm,0
forehead_height_cm,0
nose_wide,0
nose_long,0
lips_thin,0
distance_nose_to_lip_long,0
gender,0


In [9]:
df.duplicated().sum()

1768

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.shape

(3233, 8)

In [12]:
df.describe()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
count,3233.0,3233.0,3233.0,3233.0,3233.0,3233.0,3233.0
mean,0.822456,13.217198,5.966038,0.536344,0.557686,0.538509,0.545314
std,0.382188,1.115993,0.549789,0.498755,0.496738,0.498592,0.498019
min,0.0,11.4,5.1,0.0,0.0,0.0,0.0
25%,1.0,12.3,5.5,0.0,0.0,0.0,0.0
50%,1.0,13.2,5.9,1.0,1.0,1.0,1.0
75%,1.0,14.1,6.4,1.0,1.0,1.0,1.0
max,1.0,15.5,7.1,1.0,1.0,1.0,1.0


In [14]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [15]:
map_dict = {'Male': 0, 'Female': 1}
df['gender'] = df['gender'].map(map_dict)

In [16]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [17]:
X

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long
0,1,11.8,6.1,1,0,1,1
1,0,14.0,5.4,0,0,1,0
2,0,11.8,6.3,1,1,1,1
3,0,14.4,6.1,0,1,1,1
4,1,13.5,5.9,0,0,0,0
...,...,...,...,...,...,...,...
4986,1,11.7,6.1,1,1,0,1
4990,1,12.6,5.7,0,0,1,0
4992,1,14.1,7.0,1,1,1,1
4993,1,11.6,5.9,0,0,0,1


In [18]:
y

Unnamed: 0,gender
0,0
1,1
2,0
3,0
4,1
...,...
4986,0
4990,1
4992,0
4993,1


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

In [20]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [21]:
classifier.fit(X_train, y_train)

In [22]:
y_pred = classifier.predict(X_test)
y_pred

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [24]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[328  15]
 [ 19 285]]
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       343
           1       0.95      0.94      0.94       304

    accuracy                           0.95       647
   macro avg       0.95      0.95      0.95       647
weighted avg       0.95      0.95      0.95       647



In [25]:
print(accuracy_score(y_test, y_pred))

0.9474497681607419


In [26]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.01, 0.1, 1, 10, 50, 100],
    'l1_ratio': [0, 0.5, 1]
}

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
grid = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')

In [29]:
grid.fit(X_train, y_train)

In [30]:
grid.best_params_

{'C': 0.1, 'l1_ratio': 0, 'penalty': 'l2'}

In [31]:
grid.best_score_

0.9524342247746503

In [32]:
y_pred_gird = grid.predict(X_test)

In [33]:
print(accuracy_score(y_test, y_pred_gird))

0.9443585780525502
