In [5]:
import numpy as np
import pandas as pd
from feature_scaling import pretreat
from performance_metrics import calculate_metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.cross_decomposition import PLSRegression

In [7]:

def plsnipals(X, Y, A):
    # Initialize variables
    n, p = X.shape
    m = Y.shape[1]
    W = np.zeros((p, A))
    T = np.zeros((n, A))
    P = np.zeros((p, A))
    Q = np.zeros((m, A))

    varX = np.sum(np.sum(X**2))
    varY = np.sum(np.sum(Y**2))

    R2X = np.zeros(A)
    R2Y = np.zeros(A)

    # NIPALS algorithm
    for i in range(A):
        error = 1
        u = Y[:, 0]
        niter = 0
        while error > 1e-8 and niter < 1000:
            w = np.dot(X.T, u) / np.dot(u.T, u)
            w = w / np.linalg.norm(w)
            t = np.dot(X, w)
            q = np.dot(Y.T, t) / np.dot(t.T, t)
            u1 = np.dot(Y, q) / np.dot(q.T, q)
            error = np.linalg.norm(u1 - u) / np.linalg.norm(u)
            u = u1
            niter += 1

        p = np.dot(X.T, t) / np.dot(t.T, t)
        X = X - np.outer(t, p)
        Y = Y - np.outer(t, q)

        # Store variables
        W[:, i] = w
        T[:, i] = t
        P[:, i] = p
        Q[:, i] = q

    R2X = np.diag(np.dot(np.dot(T.T, T), np.dot(P.T, P))) / varX
    R2Y = np.diag(np.dot(np.dot(T.T, T), np.dot(Q.T, Q))) / varY

    # Calculate B and Wstar
    Wstar = np.dot(W, np.linalg.inv(np.dot(P.T, W)))
    B = np.dot(Wstar, Q.T)
    Q = Q.T

    return B, Wstar, T, P, Q, W, R2X, R2Y


In [10]:
heart_data = pd.read_csv('heart.csv')
iris_data = pd.read_csv('iris.csv')
wine_data = pd.read_csv('winequality-combined.csv')
sonar_data = pd.read_csv('sonar.csv')

iris_data.rename(columns={'Species': 'target'}, inplace=True)
wine_data.rename(columns={'quality': 'target'}, inplace=True)
sonar_data.rename(columns={'Class': 'target'}, inplace=True)

heart_data.name = 'heart data'
iris_data.name = 'iris data'
wine_data.name = 'wine data'

iris_data.drop(columns=['Id'], inplace=True)
iris_data['target'] = iris_data['target'].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}).astype('float64')
wine_data['target'] = wine_data['target'].map({3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5}).astype('float64')

heart_data.dropna(inplace=True)
iris_data.dropna(inplace=True)
wine_data.dropna(inplace=True)


In [19]:
X = wine_data.drop(columns=['target']).values
y = wine_data['target'].values

# X = heart_data.drop(columns=['target']).values
# y = heart_data['target'].values

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)
X_train = X
y_train = y

# X_train = pretreat(X_train)

In [278]:
X_train.shape

(1025, 13)

In [20]:
scores=[]
kFold=KFold(n_splits=10,random_state=42,shuffle=True)
for train_index,test_index in kFold.split(X_train):
    # print("Train Index: ", train_index, "\n")
    # print("Test Index: ", test_index)
    X_train1, X_test1, y_train1, y_test1 = X_train[train_index], X_train[test_index], y_train[train_index], y_train[test_index]
    dt = DecisionTreeClassifier()
    dt.fit(X_train1, y_train1)
    y_pred = dt.predict(X_test1)
    # print(np.mean(y_pred == y_test1))
    scores.append(np.mean(y_pred == y_test1))
    print(calculate_metrics(y_test1, y_pred))
print(np.mean(scores))

{'accuracy': 0.6153846153846154, 'precision': 0.4484014360168321, 'recall': 0.44998646084472954, 'f1': 0.44919255020195675}
{'accuracy': 0.6292307692307693, 'precision': 0.40367409214761246, 'recall': 0.40618261785119686, 'f1': 0.40492446992867065}
{'accuracy': 0.5731895223420647, 'precision': 0.4254882154882155, 'recall': 0.4081517227065172, 'f1': 0.41663970303272657}
{'accuracy': 0.6086286594761171, 'precision': 0.36517604880525495, 'recall': 0.36129665696551666, 'f1': 0.36322599483550544}
{'accuracy': 0.6563944530046225, 'precision': 0.41683644419155597, 'recall': 0.41446851948657404, 'f1': 0.41564910938993177}
{'accuracy': 0.6317411402157165, 'precision': 0.4358221017492791, 'recall': 0.4345829370484668, 'f1': 0.43520163732151396}
{'accuracy': 0.6721044045676998, 'precision': 0.4672840946034403, 'recall': nan, 'f1': nan}
{'accuracy': 0.6302003081664098, 'precision': 0.3994636841939223, 'recall': 0.399704220515702, 'f1': 0.3995839161560848}
{'accuracy': 0.6097560975609756, 'precisio

  metrics['recall'] = np.sum(np.diag(matrix) / np.sum(matrix, axis = 1)) / n
  metrics['recall'] = np.sum(np.diag(matrix) / np.sum(matrix, axis = 1)) / n


In [13]:
X_train

array([[52.,  1.,  0., ...,  2.,  2.,  3.],
       [53.,  1.,  0., ...,  0.,  0.,  3.],
       [70.,  1.,  0., ...,  0.,  0.,  3.],
       ...,
       [47.,  1.,  0., ...,  1.,  1.,  2.],
       [50.,  0.,  0., ...,  2.,  0.,  2.],
       [54.,  1.,  0., ...,  1.,  1.,  3.]])

In [14]:
y_train_matrix = y_train.reshape(-1, 1)
X_train_pretreated = pretreat(X_train)
B, Wstar, T, P, Q, W, R2X, R2Y = plsnipals(X_train_pretreated, y_train_matrix, 5)

X_train_transformed = X_train_pretreated @ Wstar

In [15]:
scores=[]
kFold=KFold(n_splits=10,random_state=2,shuffle=True)
for train_index,test_index in kFold.split(X_train_transformed):
    # print("Train Index: ", train_index, "\n")
    # print("Test Index: ", test_index)
    X_train1, X_test1, y_train1, y_test1 = X_train_transformed[train_index], X_train_transformed[test_index], y_train[train_index], y_train[test_index]
    dt = DecisionTreeClassifier()
    dt.fit(X_train1, y_train1)
    y_pred = dt.predict(X_test1)
    print(calculate_metrics(y_test1, y_pred))
    scores.append(np.mean(y_pred == y_test1))
print(np.mean(scores))

{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
1.0


In [16]:
pls_model = PLSRegression(n_components=5)
pls_model.fit(X_train_pretreated, y_train_matrix)

In [17]:
X_train_transformed

array([[-0.05798808, -2.15664761, -0.505248  ,  0.07921756,  0.55377305],
       [-2.52460387,  0.37994622,  0.89401679,  0.21718348, -0.50536957],
       [-2.88094294,  0.8762405 ,  1.57457817, -0.58677747,  0.10772471],
       ...,
       [-1.47471203, -0.6796297 ,  0.64483258, -0.63091696, -0.41078892],
       [ 1.63152755,  0.06831135,  0.1349232 ,  0.08712064,  0.0878363 ],
       [-1.20546253, -0.79297818,  0.74369507, -1.09735824, -1.03676134]])

In [18]:
pls_model.transform(X_train_pretreated)

array([[ 0.05795979,  2.15559533,  0.50500148,  0.07917891,  0.55350285],
       [ 2.52337205, -0.37976084, -0.89358058,  0.21707751, -0.50512299],
       [ 2.87953726, -0.87581296, -1.57380989, -0.58649117,  0.10767214],
       ...,
       [ 1.47399248,  0.6792981 , -0.64451795, -0.63060912, -0.41058849],
       [-1.63073149, -0.06827802, -0.13485737,  0.08707813,  0.08779345],
       [ 1.20487435,  0.79259126, -0.74333221, -1.09682282, -1.03625548]])

regularization is two type, L1, L2

why lasso can eliminated feature , bcz coefficient is zero

ridge don't have any feature selection capability

independent column (we try to choose), dependent column, dependent variable, independent variable

classification problem steps
data pre-process
normalizations process => advantage disadvantage

evalution matrix - F1 score ()
Precision = True positives/ (True positives + False positives)
Accuracy = (True positives + True Negatives)/ (True positives + True negatives + False positives + False negatives)
Recall = True positives/ (True positives + False negatives)

when to choose precision and recall/ accuracy scenerio

imbalanced dataset = what is imbalanced dataset, how to handle imbalanced dataset
ans : when one class is more than other class, then it is called imbalanced dataset
how to handle imbalanced dataset

1. resampling technique
2. SMOTE
3. Ensemble technique
4. Cost sensitive learning
5. Anomaly detection

PLS:

1. what is PLS
2. show that PLS show independent variable and dependent variable

- covariance, and variance, when we can tell this is a covariance and this is a variance

- when a transpose b covariance?
- objective of PLS.
- what is the difference between PLS and PCA
- data mean centering and scaling
  what is mean center? ans : mean center is a process where we subtract the mean from the data
