# ML4622 Machine Learning - Project

## Pre-processing Data

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn import metrics

data_train = pd.read_csv('data/train.csv')
data_valid = pd.read_csv('data/valid.csv')
data_test = pd.read_csv('data/test.csv')

### Inspecting Data

In [2]:
data_train.shape

(28520, 772)

In [3]:
data_valid.shape

(750, 772)

In [4]:
data_test.shape

(744, 769)

In [5]:
data_test.head()

Unnamed: 0,ID,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768
0,1,0.15316,0.112289,0.040273,0.006054,-0.082956,0.010444,0.045114,-0.165139,0.102368,...,0.106029,0.062651,0.038253,-0.156998,-0.048017,0.068254,0.047534,0.160371,0.076477,0.093875
1,2,0.067325,0.076545,0.105324,0.028182,-0.103412,-0.089486,0.141884,0.046473,0.00303,...,-0.005264,0.084812,0.160644,-0.075722,-0.054241,0.049832,0.003579,-0.017871,-0.059488,-0.102072
2,3,0.010169,0.116066,-0.012554,0.001083,-0.175814,0.014485,0.170519,-0.127064,-0.038071,...,0.037776,0.063297,0.079459,-0.382852,-0.099809,0.013234,-0.020953,-0.021539,-0.04473,-0.123528
3,4,0.162924,-0.003632,0.232407,0.127371,0.021365,-0.093776,0.128503,-0.119398,-0.04316,...,0.06536,0.152132,0.036445,-0.046519,-0.153139,0.018204,-0.112533,0.095837,0.091668,-0.111523
4,5,-0.030397,0.020162,-0.048599,0.008827,-0.118521,-0.02377,0.018864,-0.026707,-0.020216,...,-0.008395,0.060719,0.021331,-0.154071,-0.027794,0.018429,0.019384,-0.02869,0.020569,-0.052916


### Handling Data

In [6]:
LABELS = [F'label_{i}' for i in range(1, 5)]
FEATURES = [F'feature_{i}' for i in range(1, 769)]

X_train, X_valid, X_test, y_train, y_valid, y_predict, y_test_1, y_test_2 = [{} for _ in range(8)]

for label in LABELS:
    if label == 'label_2':
        df_train = data_train[data_train['label_2'].notna()]
        df_valid = data_valid[data_valid['label_2'].notna()]
    else:
        df_train = data_train
        df_valid = data_valid
    df_test = data_test

    X_train[label] = df_train.drop(LABELS, axis=1)
    y_train[label] = df_train[label]
    X_valid[label] = df_valid.drop(LABELS, axis=1)
    y_valid[label] = df_valid[label]
    X_test[label] = df_test.iloc[:, 1:]


## Feature Engineering & Dimensionality Reduction

### Principal Component Analysis (PCA)

In [7]:
def pca_reduce(X, pca, first_fit=False):
  if first_fit:
    X_train_pca = pca.fit_transform(X)
  else:
    X_train_pca = pca.transform(X)
  return pd.DataFrame(data=X_train_pca, columns=[X.columns[i] for i in range(X_train_pca.shape[1])])

## Hyper-parameter Tuning

In [8]:
C = [0.1, 1, 10, 100]
svc_model = SVC(kernel='rbf')
param_grid = {
    'C':C
}
grid = HalvingGridSearchCV(svc_model, param_grid, cv=5, verbose=1)

## Predicting labels

In [9]:
def svm(X, y):
  model = SVC(kernel='rbf', random_state=40, C=100)
  model.fit(X, y)
  return model

def predict(X, model):
  y_pred = model.predict(X)
  return pd.Series(y_pred)

def show_metrics(y_true, y_pred):
  print(metrics.confusion_matrix(y_true, y_pred))
  print(metrics.classification_report(y_true, y_pred))
  print(metrics.accuracy_score(y_true, y_pred))
  print(metrics.precision_score(y_true, y_pred, average='weighted'))
  print(metrics.recall_score(y_true, y_pred, average='weighted'))
  print(metrics.f1_score(y_true, y_pred, average='weighted'))

y_pred = {}

### Label 1 - Speaker

In [10]:
pca = PCA(n_components=0.97, svd_solver='full')
X_train_red = pca_reduce(X_train['label_1'], pca, first_fit=True)
X_train_red.shape
X_valid_red = pca_reduce(X_valid['label_1'], pca)
X_test_red = pca_reduce(X_test['label_1'], pca)
X_test_red.shape

grid.fit(X_train_red, y_train['label_1'])
print(grid.best_params_)

svm_model = svm(X_train_red, y_train['label_1'])
y1_pred_X_valid = predict(X_valid_red, svm_model)

show_metrics(y_valid['label_1'], y1_pred_X_valid)

y1_pred_X_test = predict(X_test_red, svm_model)
y_pred[0] = y1_pred_X_test

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 9506
max_resources_: 28520
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 9506
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 28518
Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'C': 100}
[[13  0  0 ...  0  0  0]
 [ 0  9  0 ...  0  0  0]
 [ 0  0 12 ...  0  0  0]
 ...
 [ 0  0  0 ... 20  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         9
           3       1.00      1.00      1.00        12
           4       0.89      1.00      0.94        16
           5       0.94      0.94      0.94        18
           6       1.00      1.00      1.00         9
           7       1.00      0.94      0.97        17
           8       1.00 

### Label 2 - Age

In [11]:
pca = PCA(n_components=0.98, svd_solver='full')
X_train_red = pca_reduce(X_train['label_2'], pca, first_fit=True)
X_train_red.shape
X_valid_red = pca_reduce(X_valid['label_2'], pca)
X_test_red = pca_reduce(X_test['label_2'], pca)
X_test_red.shape

grid.fit(X_train_red, y_train['label_2'])
print(grid.best_params_)

svm_model = svm(X_train_red, y_train['label_2'])
y2_pred_X_valid = predict(X_valid_red, svm_model)

show_metrics(y_valid['label_2'], y2_pred_X_valid)

y2_pred_X_test = predict(X_test_red, svm_model)
y_pred[1] = y2_pred_X_test.astype(int)

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 9346
max_resources_: 28040
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 9346
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 28038
Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'C': 10}
[[ 35   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  68   1   1   1   0   0   0   0   0   0   0   0   0   0   0   0]
 [  2   1  42   0   0   1   0   0   0   0   0   0   0   0   0   0   0]
 [  0   2   0  74   1   0   0   0   0   2   0   0   0   0   0   0   0]
 [  0   1   1   0 113   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   1   0   2   1  73   0   1   0   2   0   0   1   0   0   0   0]
 [  0   0   1   0   0   2  43   0   0   0   0   0   0   0   0   0   0]
 [  1   1   0   0   0   0   0  43   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   1   0   0  47   0   0 

### Label 3 - Gender

In [12]:
pca = PCA(n_components=0.95, svd_solver='full')
X_train_red = pca_reduce(X_train['label_3'], pca, first_fit=True)
X_train_red.shape
X_valid_red = pca_reduce(X_valid['label_3'], pca)
X_test_red = pca_reduce(X_test['label_3'], pca)
X_test_red.shape

grid.fit(X_train_red, y_train['label_3'])
print(grid.best_params_)

svm_model = svm(X_train_red, y_train['label_3'])
y3_pred_X_valid = predict(X_valid_red, svm_model)

show_metrics(y_valid['label_3'], y3_pred_X_valid)

y3_pred_X_test = predict(X_test_red, svm_model)
y_pred[2] = y3_pred_X_test

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 9506
max_resources_: 28520
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 9506
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 28518
Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'C': 100}
[[142   0]
 [  1 607]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       142
           1       1.00      1.00      1.00       608

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750

0.9986666666666667
0.9986759906759908
0.9986666666666667
0.9986684571511081


### Label 4 - Accent

In [13]:
pca = PCA(n_components=0.97, svd_solver='full')
X_train_red = pca_reduce(X_train['label_4'], pca, first_fit=True)
X_train_red.shape
X_valid_red = pca_reduce(X_valid['label_4'], pca)
X_test_red = pca_reduce(X_test['label_4'], pca)
X_test_red.shape

grid.fit(X_train_red, y_train['label_4'])
print(grid.best_params_)

svm_model = svm(X_train_red, y_train['label_4'])
y4_pred_X_valid = predict(X_valid_red, svm_model)

show_metrics(y_valid['label_4'], y4_pred_X_valid)

y4_pred_X_test = predict(X_test_red, svm_model)
y_pred[3] = y4_pred_X_test

n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 9506
max_resources_: 28520
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 9506
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 28518
Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'C': 10}
[[ 21   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  10   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0  27   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   8   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  14   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0  10   1   0   0   0   0   0   0   0]
 [  0   0   1   0   0   1 530   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   2  30   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   0  18   0   0   0   0   0]
 [  0   0   0   0   0   0   2   0   0  15   0   0   0   0]
 [  0   0   0   0   0   

## Formatting Output

In [14]:
data = {
  'ID' : list(range(1, y_pred[0].shape[0] + 1)),
  'label_1': y_pred[0],
  'label_2': y_pred[1],
  'label_3': y_pred[2],
  'label_4': y_pred[3]
}
df = pd.DataFrame(data)

df.to_csv('output11.csv', index=False)