In [11]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import classification_report
from ipynb.fs.full.SIFT_to_Features import SIFT_nparray_to_Features
%matplotlib inline

In [13]:
# Directory holding the cropped face images.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this folder exists.
mypath = 'C:/Users/Yinzi/Desktop/ML PROJECT/Part 3/dataset/crop_part1'

# Names of the regular files (sub-directories are skipped) found in the dataset folder.
filenames = np.array([entry for entry in listdir(mypath) if isfile(join(mypath, entry))])

# Only names containing exactly three underscores are kept; their first three
# '_'-separated fields become the age / gender / race labels and the full path is appended.
labelled_names = (name for name in filenames if name.count('_') == 3)
splitcolumns = [name.split('_')[:3] + [mypath + '/' + name] for name in labelled_names]

# One row per image: three label columns (still strings here) plus the image path.
filecolumns = ['age', 'gender', 'race', 'file']
df = pd.DataFrame(splitcolumns, columns=filecolumns)

In [14]:
# The labels were parsed out of file names as strings; convert each label column to integers.
for label_column in ('age', 'gender', 'race'):
    df[label_column] = df[label_column].astype('int')

In [15]:
# Read every image listed in df['file'] into memory, preserving the row order of df
# so that images[i] corresponds to the labels in df row i.
images = [plt.imread(path) for path in df['file']]

In [20]:
# Sanity check: number of images that were loaded.
len(images)

9778

### 1. Gender + sift

In [16]:
# Hold out 25% of the images (scikit-learn's default size, made explicit here) for testing
# the gender model. The split is seeded so the gender results are reproducible, matching
# the seeded race and age splits further down.
# NOTE: the outputs recorded below came from an unseeded run, so re-running may give
# slightly different numbers than those shown.
X_train, X_test, y_train, y_test = train_test_split(
    images, df['gender'], test_size=0.25, random_state=42
)

In [18]:
# Convert the raw train/test images into feature matrices with the project's SIFT helper.
# NOTE(review): the helper is imported from the SIFT_to_Features notebook; presumably it
# builds its feature vocabulary from the train images only — confirm there.
X_train_sift, X_test_sift = SIFT_nparray_to_Features(X_train, X_test)

  "        for j in bin_assignment:\n",


In [21]:
# Gradient-boosted tree ensemble; this one estimator object is re-fitted for each of the
# three experiments (gender, race, age) below.
gbc = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_features=5,
    random_state=100,
)

In [22]:
# Train the classifier on the SIFT features of the training images with gender as target.
gbc.fit(X_train_sift, y_train)

GradientBoostingClassifier(learning_rate=0.05, max_features=5, n_estimators=500,
                           random_state=100)

In [23]:
# Predict gender for the held-out images. (The stray `pred` alias from the original
# chained assignment is dropped; nothing in the notebook reads it.)
y_pred_sift = gbc.predict(X_test_sift)

In [24]:
# Fraction of held-out images whose predicted gender equals the label, to two decimals.
print("GBC accuracy is %2.2f" % accuracy_score(y_test, y_pred_sift))

GBC accuracy is 0.65


In [25]:
# Confusion matrix for gender: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred_sift))

[[604 489]
 [364 988]]


In [26]:
# Per-class precision / recall / F1 and support for the gender predictions.
print(classification_report(y_test, y_pred_sift))

              precision    recall  f1-score   support

           0       0.62      0.55      0.59      1093
           1       0.67      0.73      0.70      1352

    accuracy                           0.65      2445
   macro avg       0.65      0.64      0.64      2445
weighted avg       0.65      0.65      0.65      2445



### 2. Race + sift

In [28]:
# Seeded 70/30 train/test split of the images with race as the target.
X2_train, X2_test, y2_train, y2_test = train_test_split(images, df['race'], test_size =0.3, random_state = 42)

In [29]:
# Recompute SIFT-based features for this split (the train/test membership differs from
# the gender experiment, so the earlier feature matrices cannot be reused).
# NOTE(review): helper comes from the SIFT_to_Features notebook — confirm its behaviour there.
X2_train_sift, X2_test_sift = SIFT_nparray_to_Features(X2_train, X2_test)

  "        for j in bin_assignment:\n",


In [30]:
# Re-fit the same `gbc` object on race labels. The estimator was constructed without
# warm_start, so this replaces the previously fitted gender model rather than extending it.
gbc.fit(X2_train_sift, y2_train)

GradientBoostingClassifier(learning_rate=0.05, max_features=5, n_estimators=500,
                           random_state=100)

In [31]:
# Predict race for the held-out images.
y2_pred_sift = gbc.predict(X2_test_sift)

In [32]:
# Fraction of held-out images whose predicted race equals the label, to two decimals.
print("GBC accuracy is %2.2f" % accuracy_score(y2_test, y2_pred_sift))

GBC accuracy is 0.59


In [33]:
# Confusion matrix for race: rows are true labels, columns are predicted labels.
print(confusion_matrix(y2_test, y2_pred_sift))

[[1462    5   53   53    4]
 [  88   11    0   16    0]
 [ 280    4  157   23    6]
 [ 323    2   23   89   12]
 [ 262    5   18   28   10]]


In [34]:
# Per-class precision / recall / F1 and support for the race predictions.
print(classification_report(y2_test, y2_pred_sift))

              precision    recall  f1-score   support

           0       0.61      0.93      0.73      1577
           1       0.41      0.10      0.15       115
           2       0.63      0.33      0.44       470
           3       0.43      0.20      0.27       449
           4       0.31      0.03      0.06       323

    accuracy                           0.59      2934
   macro avg       0.48      0.32      0.33      2934
weighted avg       0.54      0.59      0.52      2934



### 3. Age + sift

In [35]:
# put the age in different interval
df['age']=np.where(((df.age>=1) & (df.age<=2)), 1, df.age)
df['age']=np.where(((df.age>=3) & (df.age<6)), 2, df.age)
df['age']=np.where(((df.age>=6) & (df.age<9)), 3, df.age)
df['age']=np.where(((df.age>=9) & (df.age<12)), 4, df.age)
df['age']=np.where(((df.age>=12) & (df.age<21)), 5, df.age)
df['age']=np.where(((df.age>=21) & (df.age<36)), 6, df.age)
df['age']=np.where(((df.age>=36) & (df.age<51)), 7, df.age)
df['age']=np.where(((df.age>=51) & (df.age<80)), 8, df.age)
df['age']=np.where((df.age>=80), 9, df.age)

In [36]:
# Seeded 70/30 train/test split of the images; the target is df['age'], which at this
# point holds the age-group codes produced by the binning cell rather than raw ages.
X3_train, X3_test, y3_train, y3_test = train_test_split(images, df['age'], test_size =0.3, random_state = 42)

In [37]:
# Compute SIFT-based features for the age split via the project helper.
# NOTE(review): helper comes from the SIFT_to_Features notebook — confirm its behaviour there.
X3_train_sift, X3_test_sift = SIFT_nparray_to_Features(X3_train, X3_test)

  "        for j in bin_assignment:\n",


In [38]:
# Re-fit the same `gbc` object on the age-group labels. The estimator was constructed
# without warm_start, so this replaces the previously fitted race model.
gbc.fit(X3_train_sift, y3_train)

GradientBoostingClassifier(learning_rate=0.05, max_features=5, n_estimators=500,
                           random_state=100)

In [39]:
# Predict the age group for the held-out images. (The stray `pred` alias from the
# original chained assignment is dropped; nothing in the notebook reads it.)
y3_pred_sift = gbc.predict(X3_test_sift)

In [40]:
# Fraction of held-out images whose predicted age group equals the label, to two decimals.
print("GBC accuracy is %2.2f" % accuracy_score(y3_test, y3_pred_sift))

GBC accuracy is 0.37


In [41]:
# Confusion matrix for age groups: rows are true labels, columns are predicted labels.
print(confusion_matrix(y3_test, y3_pred_sift))

[[349  21   1   0  11  52  10  34   2]
 [ 85   7   2   2  14  52   8  46   0]
 [ 34  13   4   1   9  49  10  37   0]
 [ 21   4   3   2  15  25   5  42   1]
 [ 34   7   5   1  40 124  18 109   3]
 [ 36  10   1   1  31 352  58 127   0]
 [ 29   5   2   1  14 139  36 126   2]
 [ 44  10   3   0  23 119  38 288  12]
 [  4   0   0   0   3   5   3  79  21]]


In [42]:
# Per-class precision / recall / F1 and support for the age-group predictions.
print(classification_report(y3_test, y3_pred_sift))

              precision    recall  f1-score   support

           1       0.55      0.73      0.63       480
           2       0.09      0.03      0.05       216
           3       0.19      0.03      0.04       157
           4       0.25      0.02      0.03       118
           5       0.25      0.12      0.16       341
           6       0.38      0.57      0.46       616
           7       0.19      0.10      0.13       354
           8       0.32      0.54      0.40       537
           9       0.51      0.18      0.27       115

    accuracy                           0.37      2934
   macro avg       0.30      0.26      0.24      2934
weighted avg       0.33      0.37      0.33      2934

