In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the labelled training set and preview its first rows.
train_csv = "Boy_or_girl_train.csv"
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,index,Timestamp,gender,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends,yt,self_intro
0,0,2017-07-04 13:47:57.495,2.0,處女座,Apple,154.0,43.0,2.0,180.0,583.0,0.0,Beautiful
1,1,2017-07-04 13:48:25.645,2.0,處女座,Apple,156.0,47.0,2.0,130.0,400.0,3.5,Enjoying being who I'm not
2,2,2017-07-04 13:49:15.348,1.0,射手座,Android,170.0,61.0,3.0,90.0,540.0,5.0,Practice Makes perfect
3,3,2017-07-04 13:49:15.792,1.0,射手座,Apple,170.0,62.0,4.0,100.0,173.0,5.0,Straightforward
4,4,2017-07-04 13:54:39.330,2.0,射手座,Android,158.0,67.0,3.0,128.0,320.0,1.2,Humorous


### Check and Drop Null data

The dataset contains a few null values (see the per-column counts below); drop every row that contains one.
- How to check null value: [stackoverflow](https://stackoverflow.com/questions/29530232/how-to-check-if-any-value-is-nan-in-a-pandas-dataframe)
- How to drop null values: [pandas.pydata.org](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html)

In [4]:
# Count the missing values in each column of the training data.
df.isna().sum()

index         0
Timestamp     1
gender        1
star_sign     1
phone_os      1
height        1
weight        1
sleepiness    1
iq            3
fb_friends    1
yt            2
self_intro    1
dtype: int64

In [5]:
# Drop rows (axis=0) that contain missing values; inplace=True modifies df
# itself instead of returning a new frame.
df.dropna(axis=0, inplace=True)

### Convert categorical data into numeric codes

In [9]:
# Normalise the phone_os text: lowercase first, then trim surrounding whitespace.
df['phone_os'] = df['phone_os'].str.lower().str.strip()

# Star signs are numbered 1..12 in the order listed here.
star_sign_order = ['水瓶座', '雙魚座', '牡羊座', '金牛座', '雙子座', '巨蟹座',
                   '獅子座', '處女座', '天秤座', '天蠍座', '射手座', '摩羯座']
coded_star_signs = {sign: code for code, sign in enumerate(star_sign_order, start=1)}
coded_phone_os = {'apple': 1, 'android': 2, 'windows phone': 3, 'johncena': 4}
coded_gender = {2: -1}  # girls as -1

# Apply the three per-column mappings in a single replace call.
coded_df = df.replace({
    "star_sign": coded_star_signs,
    "phone_os": coded_phone_os,
    "gender": coded_gender,
})

coded_df.head(3)

Unnamed: 0,index,Timestamp,gender,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends,yt,self_intro
0,0,2017-07-04 13:47:57.495,-1.0,8,1,154.0,43.0,2.0,180.0,583.0,0.0,Beautiful
1,1,2017-07-04 13:48:25.645,-1.0,8,1,156.0,47.0,2.0,130.0,400.0,3.5,Enjoying being who I'm not
2,2,2017-07-04 13:49:15.348,1.0,11,2,170.0,61.0,3.0,90.0,540.0,5.0,Practice Makes perfect


### Split Data into Features and Label
不小心砍掉YT時間，少一欄將錯就錯吧

In [12]:
# Select the model inputs by column name instead of by position, so the
# selection cannot silently shift if the CSV column order ever changes.
# These are exactly the seven columns the positional slice [3:10] produced;
# 'yt' stays excluded so the feature set is unchanged.
feature_columns = ['star_sign', 'phone_os', 'height', 'weight',
                   'sleepiness', 'iq', 'fb_friends']
features = coded_df[feature_columns]
# Target: the encoded gender column (girls were recoded to -1).
label = coded_df['gender']

In [13]:
# Preview the first feature row to confirm which columns were selected.
features.head(1)

Unnamed: 0,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends
0,8,1,154.0,43.0,2.0,180.0,583.0


### Train-test-val split
[split tutorial](https://stackoverflow.com/questions/51841810/how-does-fit-function-in-scikit-learn-make-validation)
Train: 0.8, val: 0.2

In [55]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=1,
)

In [56]:
# Print the shape of each split: train features, train labels,
# validation features, validation labels.
for part in (X_train, y_train, X_val, y_val):
    print(part.to_numpy().shape)

(192, 7)
(192,)
(49, 7)
(49,)


### Build RandomForest classification model
Docs ref: [sklearn.ensemble.RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#) 欸...是不是只接受nominal資料阿，餵numeric會怎麼樣?

In [66]:
# Build a 100-tree random forest with a fixed seed, then train it on the
# integer-cast training split.
model = RandomForestClassifier(n_estimators=100, random_state=0)
X_train_int = X_train.astype('int')
y_train_int = y_train.astype('int')
clf = model.fit(X_train_int, y_train_int)
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [67]:
# Accuracy on the training split. Cast to int exactly as was done for fit(),
# so the model is scored on the same representation it was trained on
# (and consistently with the validation score).
clf.score(X_train.astype('int'), y_train.astype('int'))

1.0

In [68]:
# Accuracy on the held-out validation split, cast to int the same way as the
# data passed to fit().
clf.score(X_val.astype('int'), y_val.astype('int'))

0.8979591836734694

### 如果用 cross-validation
沒有fit喔，只有看score https://scikit-learn.org/stable/modules/cross_validation.html

In [62]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

# Metrics to compute on every fold.
scoring = ['precision_macro', 'recall_macro']

# Use a dedicated name for the cross-validation estimator. Rebinding `clf`
# here would replace the already-fitted model with an unfitted one
# (cross_validate only fits internal clones), and the later
# clf.predict(...) cells would then fail when the notebook is run top to bottom.
cv_clf = RandomForestClassifier(n_estimators=100, random_state=0)
scores = cross_validate(
    cv_clf,
    features.astype('int'),
    label.astype('int'),
    scoring=scoring,
)  # all labelled data can be used directly
sorted(scores.keys())



['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']

In [63]:
# Macro-averaged precision for each cross-validation fold.
scores['test_precision_macro']

array([0.75096154, 0.88311688, 0.75980392])

In [64]:
# Macro-averaged recall for each cross-validation fold.
scores['test_recall_macro']

array([0.73015873, 0.8172043 , 0.68996416])

### Try looking at some specific test cases
ref: [pandas.DataFrame.from_dict](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html)

In [36]:
# Inspect the first validation row to see what a single feature vector looks like.
X_val.iloc[0]

star_sign       12.0
phone_os         1.0
height         162.0
weight          43.0
sleepiness       5.0
iq             180.0
fb_friends    6666.0
Name: 20, dtype: float64

In [75]:
# Two hand-written samples, one per row, laid out in the same column order
# as the training features.
case_columns = ['star_sign', 'phone_os', 'height', 'weight',
                'sleepiness', 'iq', 'fb_friends']
case_rows = [
    [2, 2, 173, 68, 1, 120, 400],
    [10, 1, 166, 50, 1, 80, 800],
]
my_case = pd.DataFrame(case_rows, columns=case_columns)
my_case

Unnamed: 0,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends
0,2,2,173,68,1,120,400
1,10,1,166,50,1,80,800


In [49]:
# Check the type of my_case before passing it to predict().
type(my_case)

pandas.core.frame.DataFrame

In [76]:
# Predict the gender code for the hand-written samples (1 = boy, -1 = girl).
my_case_int = my_case.astype('int')
res = clf.predict(my_case_int)
res

array([ 1, -1])

### 全部男生的 test submission

In [79]:
# Baseline submission: label every one of the 185 rows as boy (1).
n_rows = 185
submission = pd.DataFrame({
    'index': list(range(n_rows)),
    'gender': [1] * n_rows,
})
submission

Unnamed: 0,index,gender
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
180,180,1
181,181,1
182,182,1
183,183,1


In [81]:
# Write the all-boys baseline to disk; index=False leaves the DataFrame's
# row index out of the file.
submission.to_csv('submission_all_boy.csv', index=False)

### Predict on Test set

In [100]:
# Load the unlabelled test set and preview its first row.
test_csv = "Boy_or_girl_test_no_solution - 砍int長度.csv"
test = pd.read_csv(test_csv)
test.head(1)

Unnamed: 0,index,Timestamp,gender,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends,yt,self_intro
0,0,2018-09-10 10:12:58.794,0.0,摩羯座,Android,174.0,76.5,3,120,555,0.0,I am alive.


In [101]:
# Count the missing values in each column of the test data.
test.isna().sum()

index         0
Timestamp     0
gender        0
star_sign     0
phone_os      0
height        0
weight        0
sleepiness    0
iq            0
fb_friends    0
yt            0
self_intro    0
dtype: int64

### pre-process

In [102]:
# Normalise the phone_os text: lowercase first, then trim surrounding whitespace.
test['phone_os'] = test['phone_os'].str.lower().str.strip()

# Same integer codes as used for the training data.
star_sign_order = ['水瓶座', '雙魚座', '牡羊座', '金牛座', '雙子座', '巨蟹座',
                   '獅子座', '處女座', '天秤座', '天蠍座', '射手座', '摩羯座']
coded_star_signs = {sign: code for code, sign in enumerate(star_sign_order, start=1)}
coded_phone_os = {'apple': 1, 'android': 2, 'windows phone': 3, 'johncena': 4}
coded_gender = {2: -1}  # girls as -1

# Apply the three per-column mappings in a single replace call.
test = test.replace({
    "star_sign": coded_star_signs,
    "phone_os": coded_phone_os,
    "gender": coded_gender,
})

# Keep only the columns at positions 3..9, the same positional slice used for
# the training features. NOTE: `test` is rebound here, so this cell is meant
# to be run once after each fresh load of the test CSV.
test = test.iloc[:, 3:10]
test.head(3)

Unnamed: 0,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends
0,12,2,174.0,76.5,3,120,555
1,5,1,178.0,85.0,3,120,653
2,11,1,178.0,45.0,4,200,500


In [103]:
# Predict on the integer-cast test features, then map the internal girl
# code (-1) back to 2 for the submission.
raw_prediction = clf.predict(test.astype('int'))
prediction = [2 if code == -1 else code for code in raw_prediction]
prediction

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1]

In [104]:
# Build the submission frame from the model predictions.
# The index column is derived from the number of predictions instead of a
# hard-coded 185, so the two columns always have the same length (a mismatch
# would make the DataFrame constructor raise).
submission = {
    'index': list(range(len(prediction))),
    'gender': prediction
}
submission = pd.DataFrame.from_dict(submission)
submission.head(3)

Unnamed: 0,index,gender
0,0,1
1,1,1
2,2,1


In [105]:
# Write the model-based submission to disk; index=False leaves the
# DataFrame's row index out of the file.
submission.to_csv('submission_原始資料來一發.csv', index=False)