In [None]:
from PIL import Image
import numpy as np
from numpy import asarray
import os
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.linalg import eigh
from sklearn.metrics import accuracy_score

In [None]:
def get_train_test_splits(folder, train_size=8):
    """Load a folder-per-class image dataset and split it into train/test sets.

    ``folder`` is expected to contain one sub-directory per class. The class
    label is parsed from the directory name by dropping its first character
    and converting the rest to ``int`` (e.g. ``s12`` -> ``12``). Inside each
    class directory the files are visited in sorted name order, so the split
    is reproducible across machines and runs: the first ``train_size`` images
    go to the training set and the remaining ones to the test set. Every
    image is flattened into a single row of pixel values.

    Hidden entries (names starting with ``.``, such as ``.ipynb_checkpoints``
    or ``.DS_Store``), non-directories at the class level and non-files at
    the image level are skipped.

    Parameters
    ----------
    folder : str or path-like
        Root directory of the dataset.
    train_size : int, default 8
        Number of images per class assigned to the training set.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        2-D arrays with one flattened image per row. A split that received
        no images is returned as an array with zero rows instead of raising.
    y_train, y_test : list of int
        Class labels aligned with the rows of ``X_train`` / ``X_test``.

    Raises
    ------
    ValueError
        If a class directory name does not end in an integer label, or if
        the images do not all flatten to the same number of pixels.
    """
    train_rows, test_rows = [], []
    y_train, y_test = [], []

    # Sort because os.listdir() returns entries in arbitrary order, which
    # would otherwise make the train/test membership platform dependent.
    class_dirs = sorted(
        name for name in os.listdir(folder)
        if not name.startswith('.')
        and os.path.isdir(os.path.join(folder, name))
    )

    for class_dir in class_dirs:
        class_path = os.path.join(folder, class_dir)
        label = int(class_dir[1:])
        file_names = sorted(
            name for name in os.listdir(class_path)
            if not name.startswith('.')
            and os.path.isfile(os.path.join(class_path, name))
        )

        for index, file_name in enumerate(file_names):
            # The context manager closes the underlying file handle as soon
            # as the pixel data has been copied into the array.
            with Image.open(os.path.join(class_path, file_name)) as image:
                pixels = np.asarray(image).reshape(-1)

            if index < train_size:
                train_rows.append(pixels)
                y_train.append(label)
            else:
                test_rows.append(pixels)
                y_test.append(label)

    # Any loaded row tells us the feature count / dtype to use for an empty
    # split; with no images at all we fall back to a 0 x 0 uint8 matrix.
    reference = (train_rows or test_rows or [np.empty(0, dtype=np.uint8)])[0]

    def _stack(rows):
        # Stack once at the end instead of calling np.vstack per image,
        # which re-copies the whole matrix on every iteration.
        if rows:
            return np.vstack(rows)
        return np.empty((0, reference.shape[0]), dtype=reference.dtype)

    return _stack(train_rows), _stack(test_rows), y_train, y_test
        

In [None]:
# Load the images under 'pics': per class sub-folder, the first 8 files
# visited go to the training set and the remaining files to the test set.
X_train, X_test, y_train, y_test = get_train_test_splits(folder='pics', train_size=8)

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [None]:
# Sanity check before PCA: (number of training images, pixels per flattened image)
X_train.shape

In [None]:
# Sanity check before PCA: (number of test images, pixels per flattened image)
X_test.shape

In [None]:
# Reduce the flattened pixel features to 150 principal components.
# The PCA is fitted on the training rows only and the same projection is then
# applied to the test rows, so the test set does not influence the fit.
# random_state is pinned because scikit-learn may select its randomized SVD
# solver for large inputs, which would make the components vary between runs.
# NOTE: this cell overwrites X_train / X_test with their projections; re-run
# the data-loading cell before re-running this one.
pca = PCA(n_components=150, random_state=42)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [None]:
# After PCA: (number of training images, number of retained components)
X_train.shape

In [None]:
# Cumulative explained variance of the fitted PCA as a function of how many
# components are kept.
# Set rc options before the figure is created so they reliably apply to it.
plt.rc('font', size=16)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Start the x axis at 1: the k-th point is the variance explained by the first
# k components (plotting against the 0-based index shifts the curve by one).
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(component_counts, cumulative_variance, linewidth=3)
ax.set_ylim(0.0, 1.1)
ax.set_xlabel('number of components', fontsize=21)
ax.set_ylabel('cumulative explained variance', fontsize=21)
ax.set_title('Scree Plot using PCA', fontsize=24)
ax.grid()
plt.show()

In [None]:
# Hyper-parameter grid for the random-forest grid search.
# 2 * 4 * 3 * 3 * 4 = 288 candidate combinations; widen the ranges below
# (deeper trees, more estimators) for a broader but slower search.
param = dict(
    bootstrap=[True, False],
    max_depth=list(range(10, 50, 10)),
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_estimators=list(range(200, 1000, 200)),
)

In [None]:
# Baseline random forest on the PCA features with hand-picked hyper-parameters.
# random_state is fixed so the bootstrap samples and feature sub-sampling, and
# therefore the reported metrics, are the same on every Restart & Run All.
rft = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=400,
    random_state=42,
)
rft.fit(X_train, y_train)
y_pred = rft.predict(X_test)

In [None]:
# Shape of the test matrix passed to rft.predict: (test rows, features)
X_test.shape

In [None]:
# Shape of the predictions returned by rft.predict(X_test); expected to hold
# one label per test row, i.e. match X_test.shape[0].
y_pred.shape

In [None]:
# Per-class metrics of the baseline forest on the held-out test images.
# print() is used so the report's line breaks render as a table.
print(classification_report(y_test, y_pred))

In [None]:
# Cross-validated search over the `param` grid on the PCA training features.
# The estimator is seeded so that candidates are compared on identical
# randomness and the selected parameters are reproducible between runs.
rf = GridSearchCV(RandomForestClassifier(random_state=42), param, verbose=10)
rf.fit(X_train, y_train)

# Show the outcome of the search rather than the fitted estimator's repr.
rf.best_params_, rf.best_score_