In [1]:
# Fetch the train and test CSV files from Google Drive by file id and save
# them under fixed local names that the loading cell reads.
# NOTE(review): newer gdown releases reportedly deprecate the `--id` flag (the
# id can be passed positionally) — confirm against the installed version.
!gdown --id '1jjNkvi-CRaSzrOuOT5MSex73Bramw00R' --output boy_or_girl_train.csv
!gdown --id '1UyGdiofgC70nZlyGnRQkmIpUIOYldmN8' --output boy_or_girl_test.csv

Downloading...
From: https://drive.google.com/uc?id=1jjNkvi-CRaSzrOuOT5MSex73Bramw00R
To: /content/boy_or_girl_train.csv
100% 25.0k/25.0k [00:00<00:00, 43.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UyGdiofgC70nZlyGnRQkmIpUIOYldmN8
To: /content/boy_or_girl_test.csv
100% 27.3k/27.3k [00:00<00:00, 44.7MB/s]


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the two CSV files written by the download cell into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("boy_or_girl_train.csv", "boy_or_girl_test.csv")
)

In [4]:
# Summary statistics of the numeric training columns. In the recorded run the
# min/max rows expose extreme values (-1000, 1e+111, 1e+255) in height, weight
# and fb_friends.
train_df.describe()

Unnamed: 0,id,gender,height,weight,sleepiness,iq,fb_friends
count,423.0,423.0,349.0,373.0,347.0,359.0,355.0
mean,212.0,1.252955,2.8653299999999997e+108,2.6809649999999997e+108,3.334294,125.024708,2.8169010000000003e+252
std,122.253834,0.43522,5.352877e+109,5.177804e+109,1.255303,38.005167,inf
min,1.0,1.0,-1000.0,-1000.0,1.0,50.0,-1000.0
25%,106.5,1.0,165.0,55.0,2.0,100.0,200.0
50%,212.0,1.0,171.0,64.0,3.0,120.0,400.0
75%,317.5,2.0,176.0,73.0,4.0,147.0,700.0
max,423.0,2.0,1e+111,1e+111,5.0,200.0,1e+255


In [5]:
# Preview the first rows of the test split.
# NOTE(review): `gender` is 0 in every previewed row — presumably a
# placeholder for the value to predict; confirm against the data description.
test_df.head()

Unnamed: 0,id,gender,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends,yt,self_intro
0,1,0,天蠍座,Apple,,100.0,1.0,87.0,87.0,87.0,GOod
1,2,0,金牛座,Apple,175.0,80.0,3.0,130.0,2000.0,30.0,Easygoing
2,3,0,雙子座,Apple,155.0,45.0,3.0,150.0,400.0,9.0,I LOVE INTEL
3,4,0,處女座,Apple,173.0,85.0,4.0,100.0,2000.0,15.0,"I'm a hard-work man, just do my best to finish..."
4,5,0,射手座,Android,164.0,57.0,4.0,130.0,505.0,2.0,I'm smart


# Data Overview


In [6]:
# Number of training rows per `gender` label; in the recorded run the classes
# are imbalanced (316 rows labelled 1 vs 107 labelled 2).
gender_counts = train_df["gender"].value_counts()
gender_counts

gender
1    316
2    107
Name: count, dtype: int64

In [7]:
# Count of missing (null) entries in each training column.
missing_values = train_df.isnull().sum()
missing_values

id             0
gender         0
star_sign     50
phone_os      61
height        74
weight        50
sleepiness    76
iq            64
fb_friends    68
yt            66
self_intro     0
dtype: int64

# Data Cleaning


### Outlier removal and missing-value imputation


In [8]:
def missingValue(df):
    """Null out implausible values and impute missing data, modifying ``df``.

    Steps, all applied to the frame that is passed in (the same object is
    returned for convenience):

    1. Coerce ``height``, ``weight``, ``iq``, ``fb_friends`` and ``yt`` to
       numeric; anything that cannot be parsed becomes NaN.
    2. Replace values outside the accepted ranges with NaN (bounds are
       inclusive): ``height`` 150-200, ``weight`` 40-200,
       ``fb_friends`` 50-2000, ``yt`` 0-24.
    3. Fill the remaining gaps in those numeric columns with the column mean.
    4. Fill gaps in ``sleepiness``, ``star_sign`` and ``phone_os`` with the
       most frequent value of the column.

    The fill values are computed from ``df`` itself, so calling this on two
    different frames imputes each one with its own statistics.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        The same DataFrame object, after cleaning and imputation.
    """
    # height, weight, fb_friends (and yt) contain outliers; values outside
    # these inclusive bounds are treated as missing.
    valid_ranges = {
        'height': (150, 200),
        'weight': (40, 200),
        'fb_friends': (50, 2000),
        'yt': (0, 24),
    }
    mean_filled = ['height', 'weight', 'iq', 'fb_friends', 'yt']
    mode_filled = ['sleepiness', 'star_sign', 'phone_os']

    # Convert non-numeric entries to missing values *before* any comparison,
    # so a stray string cannot make the range checks raise.
    for column in mean_filled:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # `where` keeps in-range values and yields NaN elsewhere (NaN stays NaN);
    # unlike assigning NaN through .loc it also upcasts integer columns cleanly.
    for column, (low, high) in valid_ranges.items():
        df[column] = df[column].where(df[column].between(low, high))

    # Assign the filled Series back instead of `df[col].fillna(inplace=True)`:
    # the chained in-place form operates on a temporary under pandas
    # Copy-on-Write and would leave `df` unchanged.
    for column in mean_filled:
        df[column] = df[column].fillna(df[column].mean())

    for column in mode_filled:
        modes = df[column].mode()
        # An all-missing column has no mode; leave it untouched rather than
        # failing on `mode()[0]`.
        if not modes.empty:
            df[column] = df[column].fillna(modes.iloc[0])

    return df

In [9]:
# Clean and impute both splits. missingValue derives its fill values from the
# frame it receives, so each split is imputed with its own statistics.
train_df, test_df = missingValue(train_df), missingValue(test_df)

### One-hot encoding


In [10]:
def oneHot(df):
    """Return a copy of ``df`` with ``star_sign`` and ``phone_os`` one-hot encoded.

    ``phone_os`` is first reduced to three levels: ``'Android'``, ``'Apple'``
    and ``'other'`` (every other value, including missing ones). The three
    ``phone_os_*`` indicator columns are always emitted, in that order, even
    when a level does not occur in ``df``, so frames encoded separately end up
    with the same ``phone_os`` columns.

    ``star_sign`` is encoded from the values present in ``df``; its indicator
    columns therefore still depend on the data passed in.

    Args:
        df: DataFrame with ``star_sign`` and ``phone_os`` columns. It is not
            modified.

    Returns:
        A new DataFrame: the remaining original columns, followed by the
        ``star_sign_*`` columns and then the ``phone_os_*`` columns.
    """
    phone_os_levels = ['Android', 'Apple', 'other']

    # Bucket anything that is not one of the two named platforms as 'other',
    # then pin the category set so unobserved levels still get a column.
    phone_os = (
        df['phone_os']
        .apply(lambda value: value if value in ('Android', 'Apple') else 'other')
        .astype(pd.CategoricalDtype(categories=phone_os_levels))
    )

    star_sign_dummies = pd.get_dummies(df['star_sign'], prefix='star_sign')
    phone_os_dummies = pd.get_dummies(phone_os, prefix='phone_os')

    return pd.concat(
        [
            df.drop(columns=['star_sign', 'phone_os']),
            star_sign_dummies,
            phone_os_dummies,
        ],
        axis=1,
    )

In [11]:
# One-hot encode the categorical columns of both splits.
train_df, test_df = oneHot(train_df), oneHot(test_df)

### TF-IDF features


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Missing introductions become empty strings so the vectorizer accepts them.
train_text_features = train_df['self_intro'].fillna('')
test_text_features = test_df['self_intro'].fillna('')

tfidf = TfidfVectorizer(max_features=100)

# Fit the vocabulary on the training text only and reuse it for the test
# text, so both matrices have the same columns.
tfidf.fit(train_text_features)

train_tfidf_matrix = tfidf.transform(train_text_features)
test_tfidf_matrix = tfidf.transform(test_text_features)

# Prefix the term columns: a raw token such as "iq", "yt", "id" or "gender"
# would otherwise duplicate the name of an existing column.
tfidf_columns = [f"tfidf_{term}" for term in tfidf.get_feature_names_out()]

# Reuse each frame's own index so the column-wise concat below lines rows up
# explicitly instead of relying on both sides having a default RangeIndex.
train_tfidf_df = pd.DataFrame(
    train_tfidf_matrix.toarray(), columns=tfidf_columns, index=train_df.index)
test_tfidf_df = pd.DataFrame(
    test_tfidf_matrix.toarray(), columns=tfidf_columns, index=test_df.index)

# Swap the raw text column for its TF-IDF representation.
train_df = pd.concat(
    [train_df.drop('self_intro', axis=1), train_tfidf_df], axis=1)
test_df = pd.concat(
    [test_df.drop('self_intro', axis=1), test_tfidf_df], axis=1)

### Feature selection (drop non-feature columns)


In [13]:
# Remove `id` from both splits, and `gender` from the test split as well.
train_df = train_df.drop(columns='id')
test_df = test_df.drop(columns=['id', 'gender'])

## Model


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.combine import SMOTETomek

# Encode the target as 0/1 (label 1 -> 0, label 2 -> 1). The mapping is
# applied to a derived Series rather than written back into train_df, so
# re-running this cell cannot remap already-encoded labels (0 -> NaN, 1 -> 0).
y = train_df['gender'].map({1: 0, 2: 1})
X = train_df.drop('gender', axis=1)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Bind the resampler instance to its own name; reusing `SMOTETomek` would
# shadow the imported class with an instance.
resampler = SMOTETomek(random_state=42)

# Resample the training portion only; the held-out rows are left as they are.
X_train_resampled, y_train_resampled = resampler.fit_resample(
    X_train, y_train)

In [15]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

# Base estimators of the stack: a decision tree and an XGBoost classifier.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
base_models = [
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('xgb', XGBClassifier(**xgb_params)),
]

# Logistic regression is passed as the stack's final estimator.
meta_model = LogisticRegression(random_state=42)

# Train on the resampled training data.
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
).fit(X_train_resampled, y_train_resampled)

# Score on the held-out rows, which were not resampled.
y_pred = stacked_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("ACCURACY：", accuracy)

ACCURACY： 0.9176470588235294


## Test


In [16]:
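# NOTE: the fitted estimator in this notebook is `stacked_model`; no `model` variable is defined.
# y_pred_test = stacked_model.predict(test_df)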

# id_list = ['{}'.format(i) for i in range(1, len(y_pred_test) + 1)]
# y_pred = ['1' if pred == 0 else '2' for pred in y_pred_test]
# result_df = pd.DataFrame({'id': id_list, 'gender': y_pred})
# result_df

In [17]:
# result_df.to_csv('poorthing_predicted_gender_LR_DT_Xgboost.csv', index=False)