# Imports

In [None]:
!pip install -U lightautoml

In [None]:
import warnings

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


warnings.filterwarnings("ignore")

In [3]:
INPUT_PATH = '/content/drive/MyDrive/open_data_battle/gender/gender.csv'
OUTPUT_PATH = '/content/drive/MyDrive/open_data_battle/gender/result.csv'
BINARY_COLS = ['nose_wide', 'nose_long', 'lips_thin', 
               'distance_nose_to_lip_long', 'long_hair']
BINARY_THR = 0.5
REPLACE_TARGET = {'Female': 0, 'Male': 1}
REPLACE_TARGET_REVERSE = {0: 'Female', 1: 'Male'}
CLF_THR = 0.5

# EDA

In [4]:
df = pd.read_csv(INPUT_PATH, index_col=0)
df.head()

Unnamed: 0,index,long_hair,forehead_width_cm,forehead_height_cm,forehead_width_mm,forehead_width_conventional_units,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender,index.1,long_hair.1,forehead_width_cm.1,forehead_height_cm.1,forehead_width_mm.1,forehead_width_conventional_units.1,nose_wide.1,nose_long.1,lips_thin.1,distance_nose_to_lip_long.1,gender.1
0,0,0.704275,11.8,6.1,118.0,0.025806,0.895614,0.477485,0.841261,0.902628,,2515,0.818018,11.5,5.8,115.0,0.006452,0.137806,0.40262,0.167844,0.188913,Female
1,1,0.11869,14.0,5.4,140.0,0.167742,0.20252,0.186825,0.719697,0.309122,,2516,0.995941,15.2,5.8,152.0,0.245161,0.570656,0.980237,0.811487,0.96754,Male
2,2,0.203894,11.8,6.3,118.0,0.025806,0.892793,0.723152,0.725821,0.72064,,2517,0.633806,15.4,5.7,154.0,0.258065,0.709777,0.365825,0.933026,0.646992,Male
3,3,0.386228,14.4,6.1,144.0,0.193548,0.024963,0.562005,0.831515,0.770379,,2518,0.795792,13.0,6.9,130.0,0.103226,0.883454,0.399733,0.595028,0.822491,Male
4,4,0.56402,13.5,5.9,135.0,0.135484,0.415389,0.063705,0.211584,0.157217,,2519,0.915185,14.2,5.8,142.0,0.180645,0.652973,0.735543,0.553571,0.53291,Male


In [5]:
# Transform examples
df_part1 = df[[x for x in df if not x.endswith('.1')]]
df_part2 = df[[x for x in df if x.endswith('.1')]]
df_part2.rename(columns={x:x[:-2] for x in df_part2}, inplace=True)
df_common = pd.concat([df_part1, df_part2], ignore_index=True)
df_common.drop_duplicates(['index'], inplace=True)

# Binary cols
for col in BINARY_COLS:
    df_common[col] = (df_common[col] >= BINARY_THR).astype(int)

df_common.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5001 entries, 0 to 5029
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   index                              5001 non-null   int64  
 1   long_hair                          5001 non-null   int64  
 2   forehead_width_cm                  5001 non-null   float64
 3   forehead_height_cm                 5001 non-null   float64
 4   forehead_width_mm                  5001 non-null   float64
 5   forehead_width_conventional_units  5001 non-null   float64
 6   nose_wide                          5001 non-null   int64  
 7   nose_long                          5001 non-null   int64  
 8   lips_thin                          5001 non-null   int64  
 9   distance_nose_to_lip_long          5001 non-null   int64  
 10  gender                             4000 non-null   object 
dtypes: float64(4), int64(6), object(1)
memory usage: 468.8+ 

# Split data

In [6]:
df_test = df_common[df_common['gender'].isna()]
df_train = df_common[~df_common['gender'].isna()]
print(df_test.shape, df_train.shape)

(1001, 11) (4000, 11)


In [7]:
df_train['gender'] = df_train['gender'].apply(lambda x: REPLACE_TARGET[x])

In [8]:
X_train, X_test = train_test_split(df_train, test_size=0.2)
print(X_train.shape, X_test.shape)
print(X_train.gender.value_counts(normalize=True))
print(X_test.gender.value_counts(normalize=True))

(3200, 11) (800, 11)
0    0.505313
1    0.494688
Name: gender, dtype: float64
1    0.51375
0    0.48625
Name: gender, dtype: float64


# Fit, predict

In [13]:
# Fit
automl = TabularAutoML(
    task = Task(
        name = 'binary',
        metric = lambda y_true, y_pred: accuracy_score(y_true, (y_pred > CLF_THR)*1))
)
oof_pred = automl.fit_predict(
    X_train, 
    roles = {'target': 'gender', 'drop': ['index']}
)

In [14]:
# Predict
test_pred = automl.predict(X_test.drop('gender', axis=1))
df_pred = pd.DataFrame({'index':X_test['index'],
                        'gender': (test_pred.data[:, 0] > CLF_THR)*1})
print(f'accuracy: {accuracy_score(X_test.gender, df_pred.gender):.3f}')

accuracy: 0.975


# Submition

In [11]:
submit_csv = automl.predict(df_test.drop('gender', axis=1))
submit_csv = pd.DataFrame({'gender': (submit_csv.data[:, 0] > CLF_THR)*1})
(submit_csv['gender']
 .apply(lambda x: REPLACE_TARGET_REVERSE[x])
 .to_csv(OUTPUT_PATH, index=False, header=False))