## 1. Image Classification with Adaboost

## Image pre-processing

In [1]:
# Flatten each image and append sliding-window sub-image features to it.

def obtain_dataset(folder_name):
    """Load every image under ``folder_name/<class>/`` into a feature matrix.

    Each image contributes the flattened pixels of every 32x32 sliding
    window (stride 30) cut from it, followed by the flattened full image.
    The class of an image is the name of the directory that contains it;
    class names are encoded as integers following their sorted order.

    Parameters
    ----------
    folder_name : str
        Root directory holding one sub-directory per class.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` has one row per image, ``y`` the integer class index per row.

    Raises
    ------
    ValueError
        If no readable image is found under ``folder_name``.

    Notes
    -----
    All images are assumed to share one size (128x128 in the original
    dataset); rows of differing length cannot be stacked into one matrix.
    """
    import glob
    import os

    import cv2
    import numpy as np

    step_size = 30
    w_width, w_height = 32, 32  # window size

    rows, labels = [], []
    for fullpath in glob.iglob(f'{folder_name}/*/*'):
        image = cv2.imread(fullpath)  # decode each file only once
        if image is None:
            # cv2.imread yields None for files it cannot read (e.g. stray
            # non-image files inside a class directory); skip those.
            continue

        height, width = image.shape[:2]
        # Keep EVERY window of the image, not just the last one visited.
        pieces = [
            image[i:i + w_width, j:j + w_height].ravel()
            for i in range(0, height - w_width, step_size)
            for j in range(0, width - w_height, step_size)
        ]
        pieces.append(image.ravel())
        rows.append(np.concatenate(pieces))

        # The label is the parent directory name; this also works when
        # folder_name itself contains path separators.
        labels.append(os.path.basename(os.path.dirname(fullpath)))

    if not rows:
        raise ValueError(f'no readable images found under {folder_name!r}')

    X = np.array(rows)
    # Map class names to 0..n_classes-1 following their sorted order.
    y = np.unique(np.array(labels), return_inverse=True)[1]
    return (X, y)

In [2]:
# Optional function for those who want to include pre-processing for train data in obtain dataset
def obtain_dataset_train_test(folder_name_train, folder_name_test):
    """Build train and test features by calling ``obtain_dataset`` on each folder.

    Returns the 4-tuple ``(X_train, y_train, X_test, y_test)``.
    """
    # The train folder is processed first, then the test folder.
    return (*obtain_dataset(folder_name_train), *obtain_dataset(folder_name_test))

## AdaBoost Algorithm

In [3]:
class BoostingClassifier:
    '''
    Multi-class AdaBoost ensemble (SAMME / SAMME.R variants).

    Parameters
    -----------
    base_estimator: object
        The base model from which the boosted ensemble is built.  It must
        provide ``fit(X, y, sample_weight=...)``, ``predict`` and
        ``predict_proba``; ``set_params`` is needed when ``random_state``
        is given.

    n_estimators: integer, optional(default=50)
        The maximum number of estimators

    learning_rate: float, optional(default=1)

    algorithm: {'SAMME','SAMME.R'}, optional(default='SAMME.R')
        SAMME.R uses predicted probabilities to update weights, while SAMME
        uses class error rate

    random_state: int or None, optional(default=None)
        When not None it is forwarded to every fitted base estimator.

    '''

    # Constructor keywords; each is stored on the instance as ``<name>_``.
    _PARAM_NAMES = ('base_estimator', 'n_estimators', 'learning_rate',
                    'algorithm', 'random_state')

    def __init__(self, *args, **kwargs):
        """Validate the keyword-only configuration and create an empty ensemble.

        Raises ValueError for positional arguments, unknown keywords, or a
        missing/None ``base_estimator``.
        """
        if args:
            raise ValueError(
                '''AdaBoostClassifier can only be called with keyword
                   arguments for the following keywords: base_estimator ,n_estimators,
                    learning_rate,algorithm,random_state''')
        for keyword in kwargs:
            if keyword not in self._PARAM_NAMES:
                raise ValueError(keyword + ":  Wrong keyword used --- check spelling")
        if kwargs.get('base_estimator') is None:
            raise ValueError('''base_estimator can not be None''')

        self.base_estimator_ = kwargs['base_estimator']
        self.n_estimators_ = kwargs.get('n_estimators', 50)
        self.learning_rate_ = kwargs.get('learning_rate', 1)
        self.algorithm_ = kwargs.get('algorithm', 'SAMME.R')
        self.random_state_ = kwargs.get('random_state', None)
        self._reset_ensemble()

    def _reset_ensemble(self):
        """Drop fitted estimators and size the bookkeeping arrays to n_estimators_."""
        import numpy as np
        self.estimators_ = list()
        self.estimator_weights_ = np.zeros(self.n_estimators_)
        self.estimator_errors_ = np.ones(self.n_estimators_)

    def _check_fitted(self):
        """Raise ValueError when the ensemble holds no estimator."""
        if not self.estimators_:
            raise ValueError('the ensemble holds no fitted estimator; call fit() '
                             'first (boosting may also have stopped immediately)')

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is unused)."""
        return {name: getattr(self, name + '_') for name in self._PARAM_NAMES}

    def set_params(self, **parameters):
        """Update constructor parameters by keyword and return ``self``.

        Raises ValueError for a keyword the constructor would reject.  A new
        ``n_estimators`` takes effect on the next call to ``fit``.
        """
        for parameter, value in parameters.items():
            if parameter not in self._PARAM_NAMES:
                raise ValueError(parameter + ":  Wrong keyword used --- check spelling")
            setattr(self, parameter + '_', value)
        return self

    def _samme_proba(self, estimator, n_classes, X):
        """Return the SAMME.R per-class decision contribution of one estimator."""
        import numpy as np

        proba = estimator.predict_proba(X)

        # Displace zero probabilities so the log is defined.
        # Also fix negative elements which may occur with
        # negative sample weights.
        proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
        log_proba = np.log(proba)

        return (n_classes - 1) * (log_proba - (1. / n_classes)
                                  * log_proba.sum(axis=1)[:, np.newaxis])

    def fit(self, X, y):
        """Fit up to ``n_estimators`` boosted copies of the base estimator.

        Boosting stops early when a round reports no usable estimator or
        reaches zero training error.  Any previously fitted ensemble is
        discarded first.  Returns ``self``.
        """
        import numpy as np
        y = np.asarray(y)
        self.n_samples = X.shape[0]
        # Classes are kept sorted; predict() maps column indices back
        # through this array.
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        self._reset_ensemble()

        # Start from uniform sample weights.
        sample_weight = np.ones(self.n_samples) / self.n_samples
        for iboost in range(self.n_estimators_):
            sample_weight, estimator_weight, estimator_error = self.boost(X, y, sample_weight)

            # early stop
            if estimator_error is None:
                break

            # record error and weight of this round
            self.estimator_errors_[iboost] = estimator_error
            self.estimator_weights_[iboost] = estimator_weight

            if estimator_error <= 0:
                break

        return self

    def boost(self, X, y, sample_weight):
        """Run one boosting round with the configured algorithm.

        Returns ``(sample_weight, estimator_weight, estimator_error)``, or
        three ``None`` values when boosting should stop.  Raises ValueError
        for an unknown algorithm name.
        """
        if self.algorithm_ == 'SAMME':
            return self.discrete_boost(X, y, sample_weight)
        elif self.algorithm_ == 'SAMME.R':
            return self.real_boost(X, y, sample_weight)
        raise ValueError("algorithm must be 'SAMME' or 'SAMME.R', got %r"
                         % (self.algorithm_,))

    def _fit_weak_learner(self, X, y, sample_weight):
        """Fit a copy of the base estimator; return it, its miss mask and weighted error."""
        import numpy as np
        from copy import deepcopy
        estimator = deepcopy(self.base_estimator_)
        if self.random_state_ is not None:
            # Forward the configured seed itself (0 is a valid seed).
            estimator.set_params(random_state=self.random_state_)

        estimator.fit(X, y, sample_weight=sample_weight)

        incorrect = estimator.predict(X) != y
        estimator_error = np.dot(incorrect, sample_weight) / np.sum(sample_weight, axis=0)
        return estimator, incorrect, estimator_error

    def real_boost(self, X, y, sample_weight):
        """One SAMME.R round: reweight samples from predicted class probabilities."""
        import numpy as np
        estimator, _, estimator_error = self._fit_weak_learner(X, y, sample_weight)

        # if worse than random guess, stop boosting
        if estimator_error >= 1.0 - 1 / self.n_classes_:
            return None, None, None

        y_predict_proba = estimator.predict_proba(X)
        # replace zeros so the log below is defined
        eps = np.finfo(y_predict_proba.dtype).eps
        y_predict_proba[y_predict_proba < eps] = eps

        # Code each label as 1 for the true class and -1/(K-1) elsewhere.
        y_codes = np.array([-1. / (self.n_classes_ - 1), 1.])
        y_coding = y_codes.take(self.classes_ == y[:, np.newaxis])

        # Row-wise dot product of the coding with the log-probabilities
        # (plain NumPy instead of the private inner1d helper).
        row_dot = (y_coding * np.log(y_predict_proba)).sum(axis=1)
        intermediate_variable = (-1. * self.learning_rate_
                                 * ((self.n_classes_ - 1) / self.n_classes_)
                                 * row_dot)

        # update sample weight
        sample_weight *= np.exp(intermediate_variable)

        sample_weight_sum = np.sum(sample_weight, axis=0)
        if sample_weight_sum <= 0:
            return None, None, None

        # normalize sample weight
        sample_weight /= sample_weight_sum

        # append the estimator
        self.estimators_.append(estimator)

        return sample_weight, 1, estimator_error

    def discrete_boost(self, X, y, sample_weight):
        """One SAMME round: reweight samples from the weighted error rate."""
        import numpy as np
        estimator, incorrect, estimator_error = self._fit_weak_learner(X, y, sample_weight)

        # if worse than random guess, stop boosting
        if estimator_error >= 1 - 1 / self.n_classes_:
            return None, None, None

        if estimator_error <= 0:
            # Perfect fit: log((1 - e) / e) would be infinite and poison the
            # weights with inf/NaN.  Keep the estimator with unit weight;
            # fit() stops boosting on a zero error.
            self.estimators_.append(estimator)
            return sample_weight, 1.0, 0.0

        # update estimator_weight
        # NOTE(review): the learning rate scales only the first log term here;
        # scikit-learn's SAMME appears to scale both terms - confirm intent.
        estimator_weight = self.learning_rate_ * np.log((1 - estimator_error) / estimator_error) + np.log(
            self.n_classes_ - 1)

        if estimator_weight <= 0:
            return None, None, None

        # update sample weight
        sample_weight *= np.exp(estimator_weight * incorrect)

        sample_weight_sum = np.sum(sample_weight, axis=0)
        if sample_weight_sum <= 0:
            return None, None, None

        # normalize sample weight
        sample_weight /= sample_weight_sum

        # append the estimator
        self.estimators_.append(estimator)

        return sample_weight, estimator_weight, estimator_error

    def predict(self, X):
        """Return the predicted class label for every row of ``X``.

        Raises ValueError when the ensemble holds no estimator.
        """
        import numpy as np
        self._check_fitted()
        n_classes = self.n_classes_
        classes = self.classes_[:, np.newaxis]

        if self.algorithm_ == 'SAMME.R':
            # The weights are all 1. for SAMME.R
            pred = sum(self._samme_proba(estimator, n_classes, X) for estimator in self.estimators_)
        else:  # self.algorithm == "SAMME"
            pred = sum((estimator.predict(X) == classes).T * w
                       for estimator, w in zip(self.estimators_,
                                               self.estimator_weights_))

        pred /= self.estimator_weights_.sum()
        if n_classes == 2:
            # Binary case: collapse the two columns into one signed score.
            pred[:, 0] *= -1
            pred = pred.sum(axis=1)
            return self.classes_.take(pred > 0, axis=0)

        return self.classes_.take(np.argmax(pred, axis=1), axis=0)

    def predict_proba(self, X):
        """Return class probabilities with one column per entry of ``classes_``.

        Raises ValueError when the ensemble holds no estimator.
        """
        import numpy as np
        self._check_fitted()
        n_classes = self.n_classes_
        if self.algorithm_ == 'SAMME.R':
            # The weights are all 1. for SAMME.R
            proba = sum(self._samme_proba(estimator, n_classes, X)
                        for estimator in self.estimators_)
        else:  # self.algorithm == "SAMME"
            proba = sum(estimator.predict_proba(X) * w
                        for estimator, w in zip(self.estimators_,
                                                self.estimator_weights_))

        proba /= self.estimator_weights_.sum()
        proba = np.exp((1. / (n_classes - 1)) * proba)
        normalizer = proba.sum(axis=1)[:, np.newaxis]
        normalizer[normalizer == 0.0] = 1.0  # avoid dividing by zero
        proba /= normalizer

        return proba

### Adaboost Accuracy
#### Using optimised hyperparameters

In [4]:
def test_func_boosting_image(image_dataset_train, image_dataset_test):
    """Train the boosted-stump image classifier and return its test accuracy."""
    from sklearn.metrics import accuracy_score
    from sklearn.tree import DecisionTreeClassifier

    # A single call builds both the train and the test feature matrices.
    train_features, train_labels, test_features, test_labels = obtain_dataset_train_test(
        image_dataset_train, image_dataset_test)

    stump = DecisionTreeClassifier(max_depth=1)
    booster = BoostingClassifier(base_estimator=stump, n_estimators=50,
                                 learning_rate=0.25, algorithm='SAMME')
    booster.fit(train_features, train_labels)
    predicted = booster.predict(test_features)
    return accuracy_score(test_labels, predicted)

## 2. Image Classification with SVM

## SVM Model

In [5]:
class SVMClassifier:  # NOTE sklearn Classifier is used instead so left this cell blank
    """Placeholder SVM wrapper; every fit/predict method is a no-op returning None."""

    def __init__(self):
        # placeholder attribute set at initialisation
        self.some_paramter = 1

    def fit_image(self, X, y):
        """Train the SVM on image features (separate image kernels); not implemented."""
        return None

    def fit_text(self, X, y):
        """Train the SVM on text features (separate text kernels); not implemented."""
        return None

    def predict_image(self, X):
        """Prediction routine for image features; not implemented."""
        return None

    def predict_text(self, X):
        """Prediction routine for text features; not implemented."""
        return None

### Custom Kernels

In [6]:
# 5 - Custom: Cauchy...

def cauchy(i, j, sigma=None):
    '''
    Cauchy kernel,
        K(x, y) = 1 / (1 + ||x - y||^2 / s ^ 2)
    where:
        s = sigma (defaults to the number of columns of ``i``)

    ``i`` and ``j`` are 2-D arrays with one sample per row; the result has
    one row per sample of ``i`` and one column per sample of ``j``.
    '''
    from scipy.spatial import distance

    if sigma is None:
        sigma = float(i.shape[1])
    # Computed for every sigma (not only the default one), and squared as
    # the formula requires.
    dists_sq = distance.cdist(i, j, 'sqeuclidean')

    return 1 / (1 + dists_sq / sigma**2)

In [7]:
# 6 - Custom: Cosine kernel...

def cosine(i, j):
    """
    Cosine kernel,
        K(x, y) = <x . y> / (||x|| . ||y||)

    ``i`` and ``j`` are 2-D arrays with one sample per row; the result has
    one row per sample of ``i`` and one column per sample of ``j``.  Pairs
    involving an all-zero sample get similarity 0.
    """
    import numpy as np

    # Per-sample norms (axis=1); without an axis, norm() would collapse the
    # whole matrix into a single Frobenius norm.
    norm_i = np.linalg.norm(i, axis=1)[:, np.newaxis]
    norm_j = np.linalg.norm(j, axis=1)[np.newaxis, :]
    denom = norm_i * norm_j
    # A zero norm means a zero dot product too; dividing by 1 yields 0
    # instead of NaN.
    denom[denom == 0] = 1.0
    return i.dot(j.T) / denom

In [8]:
# 7 - Custom: Multi-quadric kernel...

def multiquadric(i, j, c=0.5):
    """
    Multiquadric kernel,
        K(x, y) = sqrt(||x-y||^2 + c^2)
    where:
        c > 0

    ``i`` and ``j`` are 2-D arrays with one sample per row; the result has
    one row per sample of ``i`` and one column per sample of ``j``.
    """
    import numpy as np
    from scipy.spatial import distance

    # 'sqeuclidean' gives ||x-y||^2 as the formula requires; plain
    # 'euclidean' would give the unsquared distance.
    dists_sq = distance.cdist(i, j, 'sqeuclidean')
    return np.sqrt(dists_sq + c**2)


In [9]:
# 8 - Custom: Inverse Multi-quadric kernel...

def inv_multiquadric(i, j, c=0.5):
    """
    Inverse multiquadric kernel,
        K(x, y) = 1/ sqrt(||x-y||^2 + c^2)
    where:
        c > 0

    ``i`` and ``j`` are 2-D arrays with one sample per row; the result has
    one row per sample of ``i`` and one column per sample of ``j``.
    """
    import numpy as np
    from scipy.spatial import distance

    # 'sqeuclidean' gives ||x-y||^2 as the formula requires; plain
    # 'euclidean' would give the unsquared distance.
    dists_sq = distance.cdist(i, j, 'sqeuclidean')
    return 1 / np.sqrt(dists_sq + c**2)

In [10]:
# 9 - Custom: T-Student kernel...
def t_student(d=50):
    """Return a T-Student kernel function of degree ``d``."""
    def t_student_kernel(i, j):
        """
        T-Student kernel,
            K(x, y) = 1 / (1 + ||x - y||^d)
        where:
            d = degree

        ``i`` and ``j`` are 2-D arrays with one sample per row.
        """
        from scipy.spatial import distance

        # cdist('euclidean') already returns ||x - y||; taking another
        # square root would silently halve the exponent.
        dist = distance.cdist(i, j, 'euclidean')
        return 1 / (1 + dist ** d)
    return t_student_kernel

### SVM Accuracy 
#### Using optimised hyperparameters

In [1]:
def test_func_svm_image(image_dataset_train, image_dataset_test):
    """Train the polynomial-kernel SVM on the image features and return its test accuracy."""
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    # A single call builds both the train and the test feature matrices.
    train_features, train_labels, test_features, test_labels = obtain_dataset_train_test(
        image_dataset_train, image_dataset_test)

    svm = SVC(kernel='poly', decision_function_shape='ovr', C=100, gamma=1)
    svm.fit(train_features, train_labels)
    predicted = svm.predict(test_features)
    return accuracy_score(test_labels, predicted)

## 3. Text Classification/ Sentiment Analysis with Adaboost

## Text data pre-processing

In [12]:
def extract_bag_of_words(train_file):
    """Read a review CSV and return stemmed token lists plus sentiment labels.

    The CSV must provide a ``review`` and a ``sentiment`` column.  Each
    review is lower-cased, stripped of punctuation, tokenised, filtered for
    English stop-words and Porter-stemmed.

    Parameters
    ----------
    train_file : str
        Path of the CSV file to read.

    Returns
    -------
    (X, y) : tuple
        ``X`` is a pandas Series holding one list of stemmed tokens per
        review; ``y`` is a numpy array of the ``sentiment`` column.
    """
    import numpy as np
    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.tokenize import word_tokenize

    df = pd.read_csv(train_file, index_col=False)

    # Missing reviews become empty strings so the tokenizer never sees NaN.
    reviews = df['review'].fillna('').str.lower()

    # Remove punctuation.  regex=True is required: newer pandas versions
    # treat the pattern as a literal string by default and would strip
    # nothing.
    reviews = reviews.str.replace(r'[^\w\s]+', '', regex=True)

    # Tokenise words...
    df['review'] = reviews.apply(word_tokenize)

    # Remove stop-words...
    stop_words = set(stopwords.words('english'))
    df['review_clean'] = df['review'].apply(
        lambda words: [w for w in words if w not in stop_words])

    # Stem what is left...
    ps = PorterStemmer()
    df['review_stemmed'] = df['review_clean'].apply(
        lambda words: [ps.stem(word) for word in words])

    X = df.review_stemmed
    y = np.array(df.sentiment)

    return (X, y)

In [13]:
def extract_bag_of_words_train_test(train_file, test_file):
    """Build TF-IDF + truncated-SVD (LSA) features for the train and test CSV files.

    The vectoriser and the SVD are fitted on the training reviews only and
    then applied unchanged to the test reviews, so no test data leaks into
    the feature extraction.

    Returns
    -------
    (X_train, y_train, X_test, y_test) : tuple
        500-component feature matrices and the matching label arrays.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD

    # Each call yields a Series of stemmed token lists plus the label array.
    x_train, y_train = extract_bag_of_words(train_file)
    x_test, y_test = extract_bag_of_words(test_file)

    print("Train shapes : X = {}, y = {}".format(x_train.shape,y_train.shape))
    print("Test shapes : X = {}, y = {}".format(x_test.shape,y_test.shape))

    # Bag of words via TfidfVectorizer: fit on train, only transform test.
    tfidf = TfidfVectorizer(max_features=20000)
    # The matrices stay sparse; TruncatedSVD accepts sparse input, so
    # densifying them would only cost memory.
    X_train_tfidf = tfidf.fit_transform(x_train.map(' '.join))
    X_test_tfidf = tfidf.transform(x_test.map(' '.join))

    svd = TruncatedSVD(n_components=500, random_state=42)

    X_train = svd.fit_transform(X_train_tfidf)
    X_test = svd.transform(X_test_tfidf)

    print('LSA output shape:', X_train.shape)

    print("X_train.shape = ",X_train.shape)
    print("X_test.shape = ",X_test.shape)
    print("y_train.shape = ",y_train.shape)
    print("y_test.shape = ",y_test.shape)

    return (X_train,y_train,X_test,y_test)

## Adaboost Accuracy
#### Using optimised hyperparameters

In [14]:
def test_func_boosting_text(text_dataset_train, text_dataset_test):
    """Train the boosted-stump text classifier and return its test accuracy."""
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    # A single call builds both the train and the test feature matrices.
    (X_train, Y_train, X_test, Y_test) = extract_bag_of_words_train_test(text_dataset_train, text_dataset_test)
    bc = BoostingClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=150, learning_rate=1, algorithm='SAMME')
    bc.fit(X_train, Y_train)
    y_pred = bc.predict(X_test)
    acc = accuracy_score(Y_test, y_pred)
    return acc

## 4. Text Classification/ Sentiment Analysis with SVM

### Custom Kernels for SVM

In [15]:
# 5 - Custom: Cauchy...

def cauchy(i, j, sigma=None):
    '''
    Cauchy kernel,
        K(x, y) = 1 / (1 + ||x - y||^2 / s ^ 2)
    where:
        s = sigma (defaults to the number of columns of ``i``)

    ``i`` and ``j`` are 2-D arrays with one sample per row; the result has
    one row per sample of ``i`` and one column per sample of ``j``.
    '''
    from scipy.spatial import distance

    if sigma is None:
        sigma = float(i.shape[1])
    # Computed for every sigma (not only the default one), and squared as
    # the formula requires.
    dists_sq = distance.cdist(i, j, 'sqeuclidean')

    return 1 / (1 + dists_sq / sigma**2)

In [16]:
# 6 - Custom: Cosine kernel...

def cosine(i, j):
    """
    Cosine kernel,
        K(x, y) = <x . y> / (||x|| . ||y||)

    ``i`` and ``j`` are 2-D arrays with one sample per row; the result has
    one row per sample of ``i`` and one column per sample of ``j``.  Pairs
    involving an all-zero sample get similarity 0.
    """
    import numpy as np

    # Per-sample norms (axis=1); without an axis, norm() would collapse the
    # whole matrix into a single Frobenius norm.
    norm_i = np.linalg.norm(i, axis=1)[:, np.newaxis]
    norm_j = np.linalg.norm(j, axis=1)[np.newaxis, :]
    denom = norm_i * norm_j
    # A zero norm means a zero dot product too; dividing by 1 yields 0
    # instead of NaN.
    denom[denom == 0] = 1.0
    return i.dot(j.T) / denom

In [17]:
# 7 - Custom: Multi-quadric kernel...

def multiquadric(i, j, c=0.5):
    """
    Multiquadric kernel,
        K(x, y) = sqrt(||x-y||^2 + c^2)
    where:
        c > 0

    ``i`` and ``j`` are 2-D arrays with one sample per row; the result has
    one row per sample of ``i`` and one column per sample of ``j``.
    """
    import numpy as np
    from scipy.spatial import distance

    # 'sqeuclidean' gives ||x-y||^2 as the formula requires; plain
    # 'euclidean' would give the unsquared distance.
    dists_sq = distance.cdist(i, j, 'sqeuclidean')
    return np.sqrt(dists_sq + c**2)


In [18]:
# 8 - Custom: Inverse Multi-quadric kernel...

def inv_multiquadric(i, j, c=0.5):
    """
    Inverse multiquadric kernel,
        K(x, y) = 1/ sqrt(||x-y||^2 + c^2)
    where:
        c > 0

    ``i`` and ``j`` are 2-D arrays with one sample per row; the result has
    one row per sample of ``i`` and one column per sample of ``j``.
    """
    import numpy as np
    from scipy.spatial import distance

    # 'sqeuclidean' gives ||x-y||^2 as the formula requires; plain
    # 'euclidean' would give the unsquared distance.
    dists_sq = distance.cdist(i, j, 'sqeuclidean')
    return 1 / np.sqrt(dists_sq + c**2)

In [19]:
# 9 - Custom: T-Student kernel...
def t_student(d=50):
    """Return a T-Student kernel function of degree ``d``."""
    def t_student_kernel(i, j):
        """
        T-Student kernel,
            K(x, y) = 1 / (1 + ||x - y||^d)
        where:
            d = degree

        ``i`` and ``j`` are 2-D arrays with one sample per row.
        """
        from scipy.spatial import distance

        # cdist('euclidean') already returns ||x - y||; taking another
        # square root would silently halve the exponent.
        dist = distance.cdist(i, j, 'euclidean')
        return 1 / (1 + dist ** d)
    return t_student_kernel

## SVM Accuracy

#### Using optimised hyperparameters

In [20]:
def test_func_svm_text(text_dataset_train, text_dataset_test):
    """Train the RBF-kernel SVM on the text features and return its test accuracy."""
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    # A single call builds both the train and the test feature matrices.
    train_features, train_labels, test_features, test_labels = extract_bag_of_words_train_test(
        text_dataset_train, text_dataset_test)

    svm = SVC(kernel='rbf', decision_function_shape='ovr', C=10, gamma=1.0)
    svm.fit(train_features, train_labels)
    predicted = svm.predict(test_features)
    return accuracy_score(test_labels, predicted)