Mounting Drive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Dependencies

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from PIL import Image

In [41]:
import warnings
warnings.filterwarnings("ignore")

Uploading Data

In [4]:
img_data = []
min = 10**10
path = '/content/gdrive/MyDrive/ML Mini Project/Data_Images/Data/F'
for i in range(15890):
  path1 = path + str(i+1) + '.jpg'
  img = Image.open(path1, 'r')
  img_arr = np.array(img)
  img_data.append(img_arr)

In [43]:
min = img_data[0].size
for i in range(len(img_data)):
  if(min > (img_data[i].size)):
    min = img_data[i].size
  img_data[i] = img_data[i].reshape(img_data[i].size)

In [44]:
for i in range(len(img_data)):
  img_data[i] = img_data[i][-min:]

In [45]:
img_data_1 = np.array(img_data)
print(img_data_1.shape)
cols = []
for i in range(min):
  cols.append('Feature : ' + str(i+1))
data = pd.DataFrame(img_data_1, columns = cols)
print(data)

(15890, 3510)
       Feature : 1  Feature : 2  ...  Feature : 3509  Feature : 3510
0               84           53  ...              30              32
1              181          173  ...              78             107
2              183          195  ...              32              44
3               26           21  ...              25              48
4              255          237  ...             137             129
...            ...          ...  ...             ...             ...
15885          239          240  ...              59              51
15886           46           80  ...              81             139
15887          248          201  ...              19              19
15888           97           75  ...             122              96
15889          121           68  ...              79              51

[15890 rows x 3510 columns]


In [46]:
label = pd.read_csv('/content/gdrive/MyDrive/ML Mini Project/Data_Images/label.csv')
print(label)

       label
0          1
1          1
2          1
3          1
4          1
...      ...
15885      0
15886      0
15887      0
15888      0
15889      0

[15890 rows x 1 columns]


In [47]:
x_train, x_test_1, y_train, y_test_1 = train_test_split(data, label, test_size = 0.4, random_state = 0)
x_val, x_test, y_val, y_test = train_test_split(x_test_1, y_test_1, test_size = 0.25, random_state = 0)

Preprocessing

In [48]:
std = StandardScaler()
x_train_1 = std.fit_transform(x_train)
x_val_1 = std.transform(x_val)
x_test_1 = std.transform(x_test)

In [49]:
lda = LinearDiscriminantAnalysis()
x_train_2 = lda.fit_transform(x_train_1, y_train)
x_val_2 = lda.transform(x_val_1)
x_test_2 = lda.transform(x_test_1)

In [50]:
skb = SelectKBest(k = 100)
x_train_3 = skb.fit_transform(x_train_1, y_train)
x_val_3 = skb.transform(x_val_1)
x_test_3 = skb.transform(x_test_1)

Decision Tree Classifier

In [51]:
model_dt_1 = DecisionTreeClassifier(max_depth = 7)
model_dt_1.fit(x_train_1, y_train)
print('Training accuracy : ', model_dt_1.score(x_train_1, y_train))
print('Validation accuracy : ', model_dt_1.score(x_val_1, y_val))
print('Testing accuracy : ', model_dt_1.score(x_test_1, y_test))
print('Cross validation score : ', cross_val_score(model_dt_1, x_train_1, y_train))

Training accuracy :  0.9873085798195931
Validation accuracy :  0.973778057478498
Testing accuracy :  0.9723096286972939
Cross validation score :  [0.97325642 0.97115889 0.96958574 0.97587834 0.97166842]


In [52]:
model_dt_2 = DecisionTreeClassifier(max_depth = 7)
model_dt_2.fit(x_train_2, y_train)
print('Training accuracy : ', model_dt_2.score(x_train_2, y_train))
print('Validation accuracy : ', model_dt_2.score(x_val_2, y_val))
print('Testing accuracy : ', model_dt_2.score(x_test_2, y_test))
print('Cross validation score : ', cross_val_score(model_dt_2, x_train_2, y_train))

Training accuracy :  1.0
Validation accuracy :  0.9618208516886931
Testing accuracy :  0.9622404027690371
Cross validation score :  [0.99895123 0.99947562 1.         1.         1.        ]


In [53]:
model_dt_3 = DecisionTreeClassifier(max_depth = 7)
model_dt_3.fit(x_train_3, y_train)
print('Training accuracy : ', model_dt_3.score(x_train_3, y_train))
print('Validation accuracy : ', model_dt_3.score(x_val_3, y_val))
print('Testing accuracy : ', model_dt_3.score(x_test_3, y_test))
print('Cross validation score : ', cross_val_score(model_dt_3, x_train_3, y_train))

Training accuracy :  0.9840570589469267
Validation accuracy :  0.9767149150409062
Testing accuracy :  0.974826935179358
Cross validation score :  [0.97220766 0.97797588 0.97482958 0.97430519 0.97481637]


AdaBoost Classifier

In [54]:
model_b_1 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 10)
model_b_1.fit(x_train_1, y_train)
print('Training accuracy : ', model_b_1.score(x_train_1, y_train))
print('Validation accuracy : ', model_b_1.score(x_val_1, y_val))
print('Testing accuracy : ', model_b_1.score(x_test_1, y_test))
print('Cross validation score : ', cross_val_score(model_b_1, x_train_1, y_train))

Training accuracy :  0.9856303754982169
Validation accuracy :  0.9775540172015943
Testing accuracy :  0.9786028949024543
Cross validation score :  [0.97535396 0.97482958 0.97378081 0.97378081 0.97219307]


In [55]:
model_b_2 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 10)
model_b_2.fit(x_train_2, y_train)
print('Training accuracy : ', model_b_2.score(x_train_2, y_train))
print('Validation accuracy : ', model_b_2.score(x_val_2, y_val))
print('Testing accuracy : ', model_b_2.score(x_test_2, y_test))
print('Cross validation score : ', cross_val_score(model_b_2, x_train_2, y_train))

Training accuracy :  1.0
Validation accuracy :  0.9618208516886931
Testing accuracy :  0.9622404027690371
Cross validation score :  [0.99895123 0.99947562 1.         1.         1.        ]


In [56]:
model_b_3 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 10)
model_b_3.fit(x_train_3, y_train)
print('Training accuracy : ', model_b_3.score(x_train_3, y_train))
print('Validation accuracy : ', model_b_3.score(x_val_3, y_val))
print('Testing accuracy : ', model_b_3.score(x_test_3, y_test))
print('Cross validation score : ', cross_val_score(model_b_3, x_train_3, y_train))

Training accuracy :  0.9825886301657227
Validation accuracy :  0.9781833438221104
Testing accuracy :  0.9773442416614223
Cross validation score :  [0.97850026 0.97535396 0.97535396 0.97115889 0.97376705]


Naive Bayes Classifier

In [57]:
model_nb_1 = GaussianNB()
model_nb_1.fit(x_train_1, y_train)
print('Training accuracy : ', model_nb_1.score(x_train_1, y_train))
print('Validation accuracy : ', model_nb_1.score(x_val_1, y_val))
print('Testing accuracy : ', model_nb_1.score(x_test_1, y_test))
print('Cross validation score : ', cross_val_score(model_nb_1, x_train_1, y_train))

Training accuracy :  0.7243549402139711
Validation accuracy :  0.7396685546465283
Testing accuracy :  0.7142857142857143
Cross validation score :  [0.75353959 0.7304667  0.71893026 0.69847929 0.73452256]


In [58]:
model_nb_2 = GaussianNB()
model_nb_2.fit(x_train_2, y_train)
print('Training accuracy : ', model_nb_2.score(x_train_2, y_train))
print('Validation accuracy : ', model_nb_2.score(x_val_2, y_val))
print('Testing accuracy : ', model_nb_2.score(x_test_2, y_test))
print('Cross validation score : ', cross_val_score(model_nb_2, x_train_2, y_train))

Training accuracy :  0.9993706733794839
Validation accuracy :  0.9702118732955738
Testing accuracy :  0.9660163624921334
Cross validation score :  [0.99947562 0.99947562 0.99790246 1.         0.99895068]


In [59]:
model_nb_3 = GaussianNB()
model_nb_3.fit(x_train_3, y_train)
print('Training accuracy : ', model_nb_3.score(x_train_3, y_train))
print('Validation accuracy : ', model_nb_3.score(x_val_3, y_val))
print('Testing accuracy : ', model_nb_3.score(x_test_3, y_test))
print('Cross validation score : ', cross_val_score(model_nb_3, x_train_3, y_train))

Training accuracy :  0.7463813719320327
Validation accuracy :  0.749108453954269
Testing accuracy :  0.7312775330396476
Cross validation score :  [0.73518616 0.75773466 0.73990561 0.73990561 0.7570829 ]
