In [None]:
import sklearn
import pandas as pd
import pickle
import gzip
import os
import numpy as np

### Process data

In [None]:
# Dataset for training. We used the famous apps as the training dataset for model v1.
# Update the dataset path below so it points at your local copy of the CSV files.
data_path = '/Users/phuongle/MyProjects/kobiton/tools/what-beautiful-is/dataset-train/'
data = []
data_columns = ['']  # not read anywhere in this notebook; kept so the namespace is unchanged
# os.listdir returns entries in arbitrary order. Sorting makes the row order of
# `data` reproducible, which matters because later cells slice rows by position.
for file in sorted(os.listdir(data_path)):
  if 'dataset_normal_' in file:
    # os.path.join works whether or not data_path ends with a path separator.
    df = pd.read_csv(os.path.join(data_path, file))
    for _, record in df.iterrows():
      label1 = record['A_over_B_label']
      label2 = record['B_over_A_label']
      # Keep only pairs whose two labels agree.
      # NOTE(review): the test-set cell additionally drops rows labelled -1;
      # confirm whether that filter should also apply here.
      if label1 == label2:
        # Coordinates are stored as a bracketed, comma-separated string;
        # strip the first/last character and parse the integers.
        coordA = [int(x) for x in record['screen_A_coordinate_element'][1:-1].split(',')]
        coordB = [int(x) for x in record['screen_B_coordinate_element'][1:-1].split(',')]
        # Feature order: A(fontsize, width, height, ppi, coord0, coord1),
        # then the same six for B, then the label as the last column.
        r = [record['screen_A_fontsize_mm'], record['screen_A_width'], record['screen_A_height'],
             record['screen_A_ppi'], coordA[0], coordA[1],
             record['screen_B_fontsize_mm'], record['screen_B_width'], record['screen_B_height'],
             record['screen_B_ppi'], coordB[0], coordB[1], label1]
        data.append(r)
# Cache the extracted rows so the modelling cells can reload them without re-parsing the CSVs.
with gzip.open('data.pklz', "wb") as f:
  pickle.dump(data, f)

In [None]:
# Dataset for testing. We used the normal apps as the testing dataset for model v1
# Update file path of dataset below
data_path = ''
df = pd.read_csv('/Users/phuongle/MyProjects/kobiton/tools/what-beautiful-is/dataset-test/normal_apps.csv')

data = []
for _, record in df.iterrows():
  label_ab = record['A_over_B_label']
  label_ba = record['B_over_A_label']
  # Skip pairs whose two labels disagree, and pairs labelled -1.
  if label_ab != label_ba or label_ab == -1:
    continue
  # Coordinates are stored as a bracketed, comma-separated string.
  coord_a = list(map(int, record['screen_A_coordinate_element'][1:-1].split(',')))
  coord_b = list(map(int, record['screen_B_coordinate_element'][1:-1].split(',')))
  # Six features for screen A, six for screen B, label last.
  data.append([
    record['screen_A_fontsize_mm'],
    record['screen_A_width'],
    record['screen_A_height'],
    record['screen_A_ppi'],
    coord_a[0],
    coord_a[1],
    record['screen_B_fontsize_mm'],
    record['screen_B_width'],
    record['screen_B_height'],
    record['screen_B_ppi'],
    coord_b[0],
    coord_b[1],
    label_ab,
  ])

# Cache the extracted rows for the evaluation cells.
with gzip.open('test.pklz', "wb") as out_file:
  pickle.dump(data, out_file)

### Models

In [None]:
# Reload the cached training rows written by the processing cell.
# pickle.load executes arbitrary code from the file, so only load files you produced yourself.
with gzip.open('data.pklz', mode='rb') as train_file:
  data_train = pickle.load(train_file)

In [None]:
# 1 for abnormal, 0 for normal
from sklearn.model_selection import train_test_split
data_train = np.array(data_train)

# The label is the last column; everything before it is a feature.
X = data_train[:, :-1]
y = data_train[:, -1]

# Row indices of each class, used later to build balanced training subsets.
label_column = data_train[:, -1]
positive = np.flatnonzero(label_column == 1)
negative = np.flatnonzero(label_column == 0)

n = len(data_train)
len_pos = len(positive)
print(n, len(positive), len(negative))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

In [None]:
# Tune data for tree
# Baseline: one random forest fitted on the full training set.
tree = RandomForestClassifier(
  n_estimators=150,
  min_samples_split=20,
  min_samples_leaf=10,
)
tree.fit(X, y)

In [None]:
# Bagging
# Train several random forests, each on every positive row plus a window of
# negative rows, and collect them in `models`.
# NOTE(review): no random seed is set in the visible cells, so the shuffles and
# the forests presumably differ from run to run — confirm whether that is intended.
# One model per len_pos-sized share of the negatives, plus one.
# Raises ZeroDivisionError when there are no positive rows (len_pos == 0).
number_models = int(len(negative)/len_pos) + 1
models = []
begin = 0
for i in range(number_models):
  # All positive indices plus up to 3*len_pos negative indices starting at `begin`.
  idx = np.concatenate([positive, negative[begin:begin+len_pos*3]])
  np.random.shuffle(idx)
  X_train = X[idx]
  y_train = y[idx]
  tree = RandomForestClassifier(n_estimators=150, min_samples_split=20, min_samples_leaf=10)
  tree.fit(X_train, y_train)
  # The window spans 3*len_pos but advances by only len_pos, so consecutive
  # windows share negative rows.
  begin += len_pos
  if begin + len_pos*3 >= len(negative):
    # The next window would reach the end of `negative`: restart from the
    # beginning after shuffling `negative` in place.
    begin = 0
    np.random.shuffle(negative)
  models.append(tree)

In [None]:
# Linear
# Fit a perceptron on balanced subsets: every positive row plus a window of
# len_pos negative rows, for 10 rounds.
# NOTE(review): `fit` is called on the same estimator every round; unless
# warm-starting is enabled this presumably retrains from scratch, so only the
# last subset would shape the final model — confirm whether `partial_fit` was intended.
nn = Perceptron()
begin = 0
for i in range(10):
  # All positive indices plus the next len_pos negative indices.
  idx = np.concatenate([positive, negative[begin:begin+len_pos]])
  np.random.shuffle(idx)
  X_train = X[idx]
  y_train = y[idx]
  begin += len_pos
  if begin + len_pos >= len(negative):
    # The next window would reach the end of `negative`: restart from the
    # beginning after shuffling `negative` in place.
    begin = 0
    np.random.shuffle(negative)
  nn.fit(X_train, y_train)
# nn.fit(X_test, y_test)

In [None]:
# Reload the cached test rows and split them into features and labels.
# pickle.load executes arbitrary code from the file, so only load files you produced yourself.
with gzip.open('test.pklz', mode='rb') as test_file:
  data_test = np.array(pickle.load(test_file))

# The label is the last column; everything before it is a feature.
X_test, y_test = data_test[:, :-1], data_test[:, -1]

n = len(data_test)
print(n)

In [None]:
# SVM
# Fit a default-parameter SVC on the training rows stacked with the test rows.
# NOTE(review): because the test rows are part of the fit, scoring `svc` on
# X_test afterwards is not a held-out estimate.
svm_features = np.concatenate([X, X_test])
svm_labels = np.concatenate([y, y_test])
svc = SVC()
svc.fit(svm_features, svm_labels)

In [None]:
# Booster
# Fit a LightGBM classifier on the training rows stacked with the test rows.
# NOTE(review): because the test rows are part of the fit, scoring `model` on
# X_test afterwards is not a held-out estimate.
booster_features = np.concatenate([X, X_test])
booster_labels = np.concatenate([y, y_test])
model = LGBMClassifier(min_data_in_bin=10)
model.fit(booster_features, booster_labels)

### Metric - Evaluation

In [None]:
# Total
# One prediction vector over the training features per bagged forest.
y_h = [forest.predict(X) for forest in models]

In [None]:
# Score one model on the training set. Exactly one `y_hat` line should be
# active; uncomment a different one to evaluate another model.
# y_hat = np.sum(y_h, axis=0) > len(models)-4  * 1.
y_hat = nn.predict(X)
# y_hat = svc.predict(X)
# y_hat = model.predict(X)
# Take the sample count from the labels being scored instead of a hard-coded
# number, so the percentages stay correct when the dataset changes.
n = len(y)
print(accuracy_score(y, y_hat))
# NOTE(review): unpacking into four values presumably requires exactly two
# classes across y and y_hat — confirm the labels are only 0 and 1.
tn, fp, fn, tp = confusion_matrix(y, y_hat).ravel()
# Confusion-matrix cells as a percentage of all samples, then as raw counts.
print(tn*100/n, fp*100/n, fn*100/n, tp*100/n)
print(tn, fp, fn, tp)

In [None]:
# Test
data = np.array(data)
n_test = len(data)
positive_t = len(np.where(data[:, -1] == 1)[0])
negative_t = n_test - positive_t
print(n_test, positive_t, negative_t)
X_test = data[:, :-1]
y_test = data[:, -1]

In [None]:
# One prediction vector over the test features per bagged forest.
y_ht = [forest.predict(X_test) for forest in models]

In [None]:
# Score one model on the test set. Exactly one `y_hatt` line should be active;
# uncomment a different one to evaluate another model.
# y_hatt = np.sum(y_ht, axis=0) > 0  * 1.
y_hatt = nn.predict(X_test)
# y_hatt = svc.predict(X_test)
# y_hatt = model.predict(X_test)
# Take the sample count from the labels being scored instead of a hard-coded
# number, so the percentages stay correct when the test set changes.
n_test = len(y_test)
print(accuracy_score(y_test, y_hatt))
# NOTE(review): unpacking into four values presumably requires exactly two
# classes across y_test and y_hatt — confirm the labels are only 0 and 1.
tn, fp, fn, tp = confusion_matrix(y_test, y_hatt).ravel()
# Confusion-matrix cells as a percentage of all test samples, then as raw counts.
print(tn*100/n_test, fp*100/n_test, fn*100/n_test, tp*100/n_test)
print(tn, fp, fn, tp)

### Save models

In [None]:
# Persist the fitted SVC and LightGBM models.
# The context managers close each file as soon as its dump finishes, so the
# data is flushed to disk and no file handle is left open in the kernel.
with open('fontsize_svc.pkl', 'wb') as svc_file:
  pickle.dump(svc, svc_file)
with open('fontsize_lightgbm.pkl', 'wb') as lightgbm_file:
  pickle.dump(model, lightgbm_file)