In [163]:
import pandas as pd

## Load Data

In [164]:
disney = pd.read_csv('analysis/disney_pics.csv').drop(['Unnamed: 0', 'name'], axis = 1)
disney.head()

Unnamed: 0,width,height,mean_hue,mean_saturation,mean_value,mean_entropy,edge_sum
0,67,98,0.272307,0.148943,0.831144,3.778824,0.819642
1,99,99,0.346913,0.50067,0.275656,4.424552,0.585586
2,99,99,0.453854,0.168975,0.484728,4.181699,0.481131
3,99,99,0.351184,0.66926,0.497502,4.214874,0.554411
4,99,99,0.136235,0.926444,0.331088,3.405405,0.365691


In [165]:
hayao = pd.read_csv('analysis/hayao_pics.csv').drop(['Unnamed: 0', 'name'], axis = 1)
hayao.head()

Unnamed: 0,width,height,mean_hue,mean_saturation,mean_value,mean_entropy,edge_sum
0,67,98,0.281277,0.530815,0.231668,3.895495,0.583236
1,99,99,0.273528,0.405424,0.722306,4.257138,0.662596
2,147,100,0.600145,0.325997,0.71184,3.021263,0.27085
3,147,100,0.507668,0.685993,0.552678,3.737993,0.201171
4,147,100,0.154216,0.728086,0.574216,4.672342,0.49849


In [191]:
# assign labels to each df for classification
label_disney = ['D'] * disney.shape[0]
disney = disney.assign(label=label_disney)
label_hayao = ['H'] * hayao.shape[0]
hayao = hayao.assign(label = label_hayao)

In [192]:
# combine two dfs into one
data = pd.concat([disney,hayao])

# shuffle data
data = data.sample(frac = 1, replace = False).reset_index(drop = True)
data

Unnamed: 0,width,height,mean_hue,mean_saturation,mean_value,mean_entropy,edge_sum,label
0,99,99,0.392431,0.413879,0.557107,5.066878,0.767575,H
1,99,99,0.258040,0.174180,0.446801,4.491106,0.490120,D
2,147,100,0.224277,0.514485,0.514048,5.362808,0.700043,H
3,99,99,0.551152,0.412819,0.503934,3.126946,0.262209,D
4,99,99,0.285075,0.072632,0.648570,4.007925,0.485902,D
...,...,...,...,...,...,...,...,...
496,99,99,0.021120,0.009484,0.062846,2.396829,0.404760,D
497,99,99,0.540731,0.457793,0.229726,5.306859,0.581612,H
498,99,99,0.366496,0.374171,0.781674,4.660333,0.638039,D
499,99,99,0.380891,0.305587,0.452497,4.573495,0.452182,D


## Data Preprocessing

In [168]:
data.describe()

Unnamed: 0,width,height,mean_hue,mean_saturation,mean_value,mean_entropy,edge_sum,label
count,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0
mean,102.033932,99.007984,0.365262,0.436018,0.547438,4.36657,0.537332,0.355289
std,23.699301,0.58304,0.165117,0.18413,0.182753,0.764381,0.183716,0.479079
min,67.0,98.0,0.001462,6.8e-05,0.046154,0.801167,0.110002,0.0
25%,99.0,99.0,0.24694,0.316454,0.415992,4.026364,0.404776,0.0
50%,99.0,99.0,0.365365,0.443637,0.547617,4.482826,0.513117,0.0
75%,99.0,99.0,0.491058,0.567867,0.678632,4.907562,0.648178,1.0
max,147.0,100.0,0.884856,0.926444,0.968928,5.656472,1.262019,1.0


In [169]:
X = data.drop('label', axis = 1)
y = data.label

In [170]:
# scale edge_sum with MinMaxScaler
mms = MinMaxScaler(feature_range = (0,1))
edge = X[['edge_sum']]
new_edge = mms.fit_transform(edge)
X[['edge_sum']] = new_edge

In [171]:
# scale all other features except width and height by StandardScaler
scaler = StandardScaler()
scale_feature = X[['mean_hue','mean_saturation','mean_value','mean_entropy']]
fet = scaler.fit_transform(scale_feature)
X[['mean_hue','mean_saturation','mean_value','mean_entropy']] = fet

In [172]:
X.head()

Unnamed: 0,width,height,mean_hue,mean_saturation,mean_value,mean_entropy,edge_sum
0,67,98,1.299032,1.967042,-0.225692,1.15205,0.517515
1,99,99,-0.552831,0.768781,-0.036652,0.788203,0.475518
2,67,98,0.87122,0.670034,-0.462382,0.692678,0.496309
3,99,99,1.696704,2.231982,-1.759079,-0.873642,0.123486
4,99,99,0.971886,0.626029,0.746958,0.91077,0.372942


## Model Fitting

In [173]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [183]:
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.mixture import GaussianMixture
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import ConstantKernel, RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [185]:
# Classifiers
classifier_name = [
    
    'SVM',
    'Linear SVC',
    'Random Forest Classifier',
    'AdaBoost Classifier',
    'Gaussian Mixture Model',
    'Decision Tree Classifier',
    'K-Nearest Neighbors Model'
    
]

classifiers = {
    
    'SVM' : SVC(random_state = 42),
    'Linear SVC' : LinearSVC(random_state = 42),
    'Random Forest Classifier' : RandomForestClassifier(random_state = 42),
    'AdaBoost Classifier' : AdaBoostClassifier(random_state = 42),
    'Gaussian Mixture Model' : GaussianMixture(random_state = 42),
    'Decision Tree Classifier' : DecisionTreeClassifier(random_state = 42),
    'K-Nearest Neighbors Model' : KNeighborsClassifier()
    
}

In [186]:
# hyperparameters
hyperparameters = {
    
    'SVM' : {
        'kernel' : ['rbf', 'linear'],
        'gamma' : ['auto' ,'scale'],
        'C' : [0.0001, 0.001, 0.01, 0.1, 1]
    },
    
    'Linear SVC' : {
        'penalty' : ['l2', 'l1'],
        'C' : [0.0001, 0.001, 0.01, 0.1, 1]
    },
    
    'Random Forest Classifier' : {
        'n_estimators': [40, 60, 80, 100],
        'max_depth': [5, 10, 15, 20, 25, 30]
    },
    
    'AdaBoost Classifier' : {
        'n_estimators': [50, 100, 120, 150]
    },
    
    'Gaussian Mixture Model' : {
        'n_components': [1, 2, 3, 4, 5],
        'n_init': [1, 2, 3, 4, 5]
    },
    
    'Decision Tree Classifier' : {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [3, 4, 5, 7],
        'min_samples_split': [2, 3],
        'max_features': ['max_features', 'log2', 'sqrt']
    },
    
    'K-Nearest Neighbors Model' : {
        'n_neighbors': [3, 5, 7, 9]
    }
    
}

In [184]:
# helper function to print out best hyperparameters set
def grid_search_best_hyperparameters(grid, parameters = None):
    
    best_score = grid.best_score_
    best_parameter = grid.best_params_
    print('Model Best Score:', best_score)
    
    
    if parameters != None:
        print('Best Hyperparameters Set:')
        print('    {')
        for parameter_name in sorted(parameters.keys()):
            print('     ' + parameter_name + ':', best_parameter[parameter_name])
        print('    }')
        
    else:
        return

In [187]:
# fit models
def model_results(classifiers, hyperparameters, cv, scoring, X, y):
    
    for classifier in classifier_name:
        
        classifier_model = classifiers[classifier]
        params = hyperparameters[classifier]
        grid_classifier = GridSearchCV(classifier_model, params, cv = cv, scoring = scoring)
        grid_classifier.fit(X, y)
        print('\033[1m' + classifier + '\033[0m' + ':')
        grid_search_best_hyperparameters(grid_classifier, params)
        print('--------------------------------------------')
        

In [188]:
import warnings
warnings.filterwarnings('ignore')

In [189]:
model_results(classifiers, hyperparameters, 5, 'accuracy', X, y)

[1mSVM[0m:
Model Best Score: 0.8184158415841584
Best Hyperparameters Set:
    {
     C: 0.0001
     gamma: auto
     kernel: linear
    }
--------------------------------------------
[1mLinear SVC[0m:
Model Best Score: 0.8184158415841584
Best Hyperparameters Set:
    {
     C: 0.0001
     penalty: l2
    }
--------------------------------------------
[1mRandom Forest Classifier[0m:
Model Best Score: 0.8164158415841584
Best Hyperparameters Set:
    {
     max_depth: 5
     n_estimators: 40
    }
--------------------------------------------
[1mAdaBoost Classifier[0m:
Model Best Score: 0.7864158415841584
Best Hyperparameters Set:
    {
     n_estimators: 50
    }
--------------------------------------------
[1mGaussian Mixture Model[0m:
Model Best Score: 0.8183564356435644
Best Hyperparameters Set:
    {
     n_components: 2
     n_init: 5
    }
--------------------------------------------
[1mDecision Tree Classifier[0m:
Model Best Score: 0.8064554455445545
Best Hyperparameter

## Reference

* https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
* https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn-svm-linearsvc
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier
* https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html
* https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html#sklearn.gaussian_process.GaussianProcessClassifier

* https://scikitlearn.org/stable/auto_examples/classification/plot_classifier_comparison.html#sphx-glr-download-auto-examples-classification-plot-classifier-comparison-py
* https://scikit-learn.org/stable/modules/preprocessing.html