In [None]:
#connect your google drive to google colab for easier access under "Files" on the left sidebar
# The Excel workbooks and output images used by the cells below all live under
# /content/drive/MyDrive, so this cell has to run before any of them.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#import the necessary libraries
import cv2
import numpy as np
from skimage import io
import pandas as pd 
from skimage.color import rgb2gray
from skimage.feature import ORB, match_descriptors
from sklearn.cluster import KMeans
from scipy.spatial import distance
import json
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [None]:
#Extract image features (ORB keypoints + descriptors) from all images
descriptors_lst=[]
imgs_features=[]
keypoints_lst=[]
#reading data (from excel) using pandas
df=pd.read_excel("/content/drive/MyDrive/Qualtrics_Images_url.xlsx",sheet_name="Qualtrics MRT images",header=[0])
#iterating through each MRT image
# Iterate over the column values directly: the previous df.iloc[i] used the index *label*
# from iterrows() as a *position*, which only works for a default 0..n-1 index.
for image_url in df["Image_url"]:
  img = io.imread('{}'.format(image_url))
  if img.ndim == 3:
    # keep only the R, G, B channels so an image with an alpha channel (RGBA) can still be converted
    img = rgb2gray(img[..., :3])
  # a 2-D array is already single-channel and is handed to ORB unchanged
  #extract image features
  detector_extractor = ORB()
  detector_extractor.detect_and_extract(img)
  descriptors=detector_extractor.descriptors
  keypoints=detector_extractor.keypoints
  # descriptors_lst length= all the descriptors that can be identified in all the images, each as an element.
  descriptors_lst.extend(descriptors)
  # img_features length=220(220 images). within each element, there is a list of the descriptors belonging to each image
  imgs_features.append(descriptors)
  # keypoints_lst  length=220 (220 images). within each element, there is a list of the keypoints belonging to each image
  # (same order as the descriptors, so keypoints_lst[n][m] belongs to imgs_features[n][m])
  keypoints_lst.append(keypoints)

#References
#https://scikit-image.org/docs/dev/auto_examples/color_exposure/plot_rgb_to_gray.html

In [None]:
#using k-means clustering to extract 500 visual words that are most representative from all MRT images (image features)

def kmeans(k, descriptor_list, random_state=None):
    """Cluster descriptors with k-means and return the cluster centres (visual words).

    Parameters
    ----------
    k : int
        Number of clusters, i.e. the size of the visual vocabulary.
    descriptor_list : sequence of 1-D descriptors
        Descriptors pooled over all images, one descriptor per element.
    random_state : int or None, optional
        Seed forwarded to KMeans. The default (None) keeps the previous,
        unseeded behaviour for existing callers.

    Returns
    -------
    numpy.ndarray
        ``cluster_centers_`` of the fitted model, one row per visual word.
    """
    # local name differs from the function name so the function is not shadowed inside its own body
    model = KMeans(n_clusters = k, n_init=10, random_state=random_state)
    model.fit(descriptor_list)
    return model.cluster_centers_

# Takes the central points which is visual words
# A fixed seed makes the vocabulary (and therefore every histogram, correlation and
# model score derived from it) reproducible across kernel restarts.
visual_words = kmeans(500, descriptors_lst, random_state=1)


#References
#https://medium.com/@aybukeyalcinerr/bag-of-visual-words-bovw-db9500331b2f

In [None]:
# computing the visual words histogram (For each MRT image (220), for each VW (500))


imgs_hist=[]

def find_index(image, center):
    """Return the index of the centre closest (Euclidean distance) to one descriptor.

    Parameters
    ----------
    image : array-like, shape (d,)
        A single ORB descriptor (despite the parameter name, not a whole image).
    center : array-like, shape (k, d)
        The central points found by k-means (the visual words).

    Returns
    -------
    int
        Index of the nearest centre. The lowest index wins ties, and 0 is
        returned when ``center`` is empty.
    """
    centers = np.asarray(center, dtype=float)
    if centers.shape[0] == 0:
        return 0
    # dtype=float also covers boolean descriptors, which cannot be subtracted directly
    offsets = centers - np.asarray(image, dtype=float)
    # One vectorised pass instead of one scipy call per centre. Squared distances
    # rank exactly like distances, so the square root is not needed for the argmin.
    return int(np.argmin((offsets ** 2).sum(axis=1)))

# One histogram per image: count how many of the image's descriptors are
# assigned to each visual word, then collect the histograms in image order.
for image_descriptors in imgs_features:
  word_counts = np.zeros(len(visual_words))
  for descriptor in image_descriptors:
    nearest_word = find_index(descriptor, visual_words)
    word_counts[nearest_word] += 1
  imgs_hist.append(word_counts)

  #References
#https://github.com/AybukeYALCINER/gabor_sift_bovw/blob/master/assignment1.py

In [None]:
#saving the 500 visual words histogram to an excel sheet
# column headers VW0 ... VW499, one per visual word
name_lst=[f"VW{x}" for x in range(500)]

df2 = pd.DataFrame.from_records(imgs_hist, columns=name_lst)
# NOTE: opening with 'wb' recreates the workbook file, so any sheets it already
# contained are discarded when this cell runs.
with open("/content/drive/MyDrive/imgBOVW.xlsx",'wb') as f:
  df2.to_excel(f, sheet_name="500VW")

  #References
#https://datascience.stackexchange.com/questions/26333/convert-a-list-of-lists-into-a-pandas-dataframe
#https://stackoverflow.com/questions/20638006/convert-list-of-dictionaries-to-a-pandas-dataframe 

In [None]:
# running correlation on the BOVW histogram

#reading data (from excel) using pandas
# NOTE(review): no cell in this notebook writes the "4corr" sheet; presumably it is
# added to the workbook by hand before this cell is run — confirm.
df=pd.read_excel("/content/drive/MyDrive/imgBOVW.xlsx",sheet_name="4corr",header=[0],index_col=[0])
#using pearson's correlation
df2=df.corr(method ='pearson')
#export the correlation result to excel sheet
# engine='openpyxl': append mode is only available with openpyxl, so it is pinned
#   instead of relying on whichever default writer pandas picks.
# if_sheet_exists='replace': re-running the cell overwrites the previous result
#   instead of failing because 'BOVW_corr' is already in the workbook.
with pd.ExcelWriter('/content/drive/MyDrive/imgBOVW.xlsx',mode='a',engine='openpyxl',if_sheet_exists='replace') as writer:
  df2.to_excel(writer, sheet_name='BOVW_corr')

In [None]:
# computing the image features keypoints histogram (For each MRT image (220), for each VW (500))

keypoints_hist=[]

def find_index(image, center):
    """Return the index of the centre closest (Euclidean distance) to one descriptor.

    Parameters
    ----------
    image : array-like, shape (d,)
        A single ORB descriptor (despite the parameter name, not a whole image).
    center : array-like, shape (k, d)
        The central points found by k-means (the visual words).

    Returns
    -------
    int
        Index of the nearest centre. The lowest index wins ties, and 0 is
        returned when ``center`` is empty.
    """
    centers = np.asarray(center, dtype=float)
    if centers.shape[0] == 0:
        return 0
    # dtype=float also covers boolean descriptors, which cannot be subtracted directly
    offsets = centers - np.asarray(image, dtype=float)
    # One vectorised pass instead of one scipy call per centre. Squared distances
    # rank exactly like distances, so the square root is not needed for the argmin.
    return int(np.argmin((offsets ** 2).sum(axis=1)))

# Each image gets a dict "visual word index -> keypoints assigned to that word".
# A word that received no keypoint keeps the placeholder value 0.
default_value = 0
lst=list(range(500))
keypoints_bin = dict.fromkeys(lst, default_value)


for img_idx, img in enumerate(imgs_features):
  for each_feature, descriptor in enumerate(img):
    ind = find_index(descriptor, visual_words)

    # keypoint stored at the same position as the descriptor, converted to a plain list
    ky_lst = keypoints_lst[img_idx][each_feature].tolist()
    if keypoints_bin[ind] != 0:
      keypoints_bin[ind].append(ky_lst)
    else:
      # first keypoint for this word: swap the 0 placeholder for a list
      keypoints_bin[ind] = [ky_lst]
  keypoints_hist.append(keypoints_bin)
  # start the next image from a brand-new, all-zero set of bins
  keypoints_bin = dict.fromkeys(lst, 0)

  #References
#https://github.com/AybukeYALCINER/gabor_sift_bovw/blob/master/assignment1.py
    #https://www.journaldev.com/32797/python-convert-numpy-array-to-list

In [None]:
#saving the image features keypoints histogram to an excel sheet

# NOTE: name_lst is rebuilt here but not applied to the frame below; the sheet is
# written with the dict keys (0..499) as column headers.
name_lst=["VW"+str(x) for x in range(500)]
df2 = pd.DataFrame.from_dict(keypoints_hist, orient='columns')
# engine='openpyxl': append mode is only available with openpyxl, so it is pinned
#   instead of relying on whichever default writer pandas picks.
# if_sheet_exists='replace': re-running the cell overwrites the previous sheet
#   instead of failing because 'keypoints_500VW' is already in the workbook.
with pd.ExcelWriter('/content/drive/MyDrive/imgBOVW.xlsx',mode='a',engine='openpyxl',if_sheet_exists='replace') as writer:
  df2.to_excel(writer, sheet_name='keypoints_500VW')

  #References
#https://datascience.stackexchange.com/questions/26333/convert-a-list-of-lists-into-a-pandas-dataframe
#https://stackoverflow.com/questions/20638006/convert-list-of-dictionaries-to-a-pandas-dataframe

In [None]:
#  Identify visual words (for each image quality of Beautiful,Safe and Welcome) on each MRT images

#function for drawing the visual words identified in the MRT images
def draw_keypoints(img_name,img_url, keypoints,metrics, color = (0, 255, 255)):
  """Mark keypoints on an image and save it in the folder of one image quality.

  Parameters
  ----------
  img_name : str
      File name of the output image (cv2.imwrite picks the format from its extension).
  img_url : numpy.ndarray
      The image itself, loaded as RGB (despite the name, not a URL). It is drawn on in place.
  keypoints : iterable of (row, col) pairs
      Keypoint coordinates in scikit-image order, which is how this notebook stores
      the ORB keypoints.
  metrics : str
      Sub-folder of BOVW_images2 to write into (e.g. "Beautiful").
  color : tuple of int, optional
      Circle colour, interpreted in the channel order of ``img_url`` (RGB).
  """
  for kp in keypoints:
    row, col = kp
    # cv2 takes the circle centre as (x, y) = (col, row), the reverse of the stored (row, col)
    cv2.circle(img_url, (int(col), int(row)), 2, color)

  #Note because i used skimage to load image, which will be loaded as RGB.
  #but since im using opencv functions on this image, they assume that the image is loaded as BGR
  #so i will need to convert the image to BGR before writing, otherwise the saved colours are swapped
  # Written once, after all circles are drawn: the write used to sit inside the loop,
  # so the file was rewritten for every keypoint and never written for an image without keypoints.
  cv2.imwrite("/content/drive/MyDrive/BOVW_images2/{}/{}".format(metrics,img_name),cv2.cvtColor(img_url,cv2.COLOR_RGB2BGR))


img_metrics=["Beautiful","Safe","Welcome"]
name_lst=["VW"+str(x) for x in range(500)]

#reading data (from excel) using pandas
# path fixed: the "/" between "MyDrive" and "imgBOVW.xlsx" was missing, so the file could not be found
df2=pd.read_excel("/content/drive/MyDrive/imgBOVW.xlsx",sheet_name="keypoints_500VW",usecols="B:SG",header=[0])
# B:SG is 500 columns, one per visual word; rename them VW0..VW499 so they can be
# selected with the column names of the per-quality sheets below
df2.columns=name_lst
df3=pd.read_excel("/content/drive/MyDrive/Qualtrics_Images_url.xlsx",sheet_name="Qualtrics MRT images",header=[0])

#drawing visual words identified in each MRT image for each image quality (Beautiful, Safe, Welcome)
for i in img_metrics:
  # NOTE(review): the "Beautiful"/"Safe"/"Welcome" sheets are not written by any cell in this
  # notebook; presumably they hold the visual words selected for each quality — confirm.
  df=pd.read_excel("/content/drive/MyDrive/imgBOVW.xlsx",sheet_name="{}".format(i), usecols="B:DL",header=[0])
  for x,vws in df2[df.columns].iterrows():
    keypoints2_lst=[]
    for vw in vws:
      # a cell is either the placeholder 0 (no keypoint for that word) or a list of
      # keypoints that Excel stored as text; only the text cells can be parsed
      if isinstance(vw, str):
        keypoints2_lst.extend(json.loads(vw))
    draw_keypoints("{}".format(df3.iloc[x]["Image_name"]), io.imread('{}'.format(df3.iloc[x]["Image_url"])),keypoints2_lst,i)

#References
#https://stackoverflow.com/questions/39316447/opencv-giving-wrong-color-to-colored-images-on-loading
#https://stackoverflow.com/questions/42406338/why-cv2-imwrite-changes-the-color-of-pics
#https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
#https://www.geeksforgeeks.org/python-convert-a-string-representation-of-list-into-list/

In [None]:
#Building a classification model to determine if an MRT image would be deemed as Beautiful/Safe/Welcome or not.

#reading data (from excel) using pandas
img_metrics=["Beautiful","Safe","Welcome"]
for i in img_metrics:
  X=pd.read_excel("/content/drive/MyDrive/imgBOVW.xlsx",sheet_name="{}".format(i), usecols="B:DL",header=[0])
  Y=pd.read_excel("/content/drive/MyDrive/imgBOVW.xlsx",sheet_name="{}".format(i), usecols="A",header=[0])

  #setting classification rules: a rating above 3 becomes class 1, anything up to 3 becomes class 0
  Y.loc[Y[i] <= 3, i] = 0
  Y.loc[Y[i] > 3, i] = 1

  # plain arrays: the fold indices below are positional, and the estimator expects a 1-D target
  features=X.to_numpy(dtype=float)
  labels=Y[i].to_numpy()

  #SVM, a supervised learning classification algorithm is used
  clf=LinearSVC(max_iter=80000,random_state=1)
  skf = StratifiedKFold(n_splits=5)
  eva_dict={"accuracy score":[],"precision score":[],"recall score":[],"F1 score":[]}
  for train_index, test_index in skf.split(features, labels):
    #scaling of data
    # The scaler is fitted on the training fold only and then applied to the test fold.
    # Fitting it on all rows before splitting (as before) lets test-fold statistics
    # leak into training and makes the scores optimistic.
    stdslr=StandardScaler()
    X_train=stdslr.fit_transform(features[train_index])
    X_test=stdslr.transform(features[test_index])
    Y_train,Y_test=labels[train_index],labels[test_index]
    clf.fit(X_train,Y_train)
    Y_pred=clf.predict(X_test)

    #evaluation of classification model
    eva_dict["accuracy score"].append(metrics.accuracy_score(Y_test, Y_pred))
    eva_dict["precision score"].append(metrics.precision_score(Y_test, Y_pred))
    eva_dict["recall score"].append(metrics.recall_score(Y_test, Y_pred))
    eva_dict["F1 score"].append(metrics.f1_score(Y_test, Y_pred))
  #export the evaluation results  to excel sheet (one row per fold, one sheet for each image quality)
  df2=pd.DataFrame.from_dict(eva_dict,orient="columns")
  # engine='openpyxl': append mode is only available with openpyxl, so it is pinned.
  # if_sheet_exists='replace': re-running the cell overwrites the previous sheet
  #   instead of failing because it already exists.
  with pd.ExcelWriter('/content/drive/MyDrive/imgBOVW.xlsx',mode='a',engine='openpyxl',if_sheet_exists='replace') as writer:
    df2.to_excel(writer, sheet_name='{}_modeleval'.format(i))
  #calculating the overall  mean and standard deviation of the evaluation result, for each image quality
  print("mean:",df2.mean())
  print("std:",df2.std())

  #References
#https://stackoverflow.com/questions/54762004/converting-continuous-column-to-binary-in-python-based-on-some-condition
#https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html
#https://stackoverflow.com/questions/64492004/stratified-kfold-vs-train-test-split-what-training-data-is-used
#https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
#https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python
#https://www.analyseup.com/python-machine-learning/stratified-kfold.html