In [None]:
# default_exp learn


In [None]:
# all_func


In [None]:
# export

from peptide.imports import *
from peptide.utils import *
from peptide.data import ProteinDataset


In [None]:
# hide

from nbdev.showdoc import *


# Learn
> Learner class for supervised, unsupervised and semi-supervised learning with protein data.

#hide

- To suppress warnings in multithreaded runs, see https://github.com/scikit-learn/scikit-learn/issues/12939
    - Note that this workaround makes training slow.
- To revert the workaround, delete the following lines:
```python
from sklearn.utils import parallel_backend
..
    with parallel_backend("multiprocessing"):

```

In [None]:
# export

# from sklearn.utils import parallel_backend


class Learner:
    """Class for training and prediction.

    Holds a train/test split together with the preprocessing options, and
    exposes three workflows on top of them:

    - supervised: `train` (grid search) followed by `predict`
    - unsupervised: `pick_k` followed by `analyze_clusters`
    - semi supervised: `run_label_spreading`
    """

    def __init__(
        self,
        X_train: np.ndarray,  # X_train numpy ndarray
        y_train: np.ndarray,  # y_train numpy ndarray
        X_test: np.ndarray,  # X_test numpy ndarray
        y_test: np.ndarray,  # y_test numpy ndarray
        ohe: bool = False,  # to use one hot encoding or not
        scaler: bool = False,  # to use standard scaling or not
        pca: bool = True,  # to use principal component analysis or not
        pca_n_components: int = 50,  # PCA number of components
        param_grids: list = None,  # param_grid for grid search, if None - gets default grid from utils
    ):
        """Initialize learner for training and prediction."""
        # Display names only: they are paired positionally with `param_grids`
        # in `train` and with `grid_list` in `predict`.
        self.classifiers = ["LogisticRegression", "LinearSVC", "XGBClassifier"]
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test
        self.ohe, self.scaler, self.pca = ohe, scaler, pca
        self.pca_n_components = pca_n_components
        # Must come after the preprocessing flags are set: create_pipeline reads them.
        self.pipeline = self.create_pipeline()
        self.param_grids = (
            get_default_param_grid() if param_grids is None else param_grids
        )

        # Filled in by `train` and `predict` respectively.
        self.grid_list, self.train_results = [], []
        self.predict_results = None

    def create_pipeline(self) -> Pipeline:
        """Create and return pipeline.

        The enabled preprocessing steps are added in the order one hot
        encoding, scaling, PCA. The final "classifier" step is a "passthrough"
        placeholder.
        """
        steps = []
        if self.ohe:
            steps.append(("ohe", OneHotEncoder(handle_unknown="ignore", sparse=False)))
        if self.scaler:
            steps.append(("scaler", StandardScaler()))
        if self.pca:
            steps.append(("pca", PCA(n_components=self.pca_n_components)))
        # NOTE(review): presumably each param grid supplies the actual estimator
        # for this step - confirm against get_default_param_grid in utils.
        steps.append(("classifier", "passthrough"))

        return Pipeline(steps)

    def train(
        self,
        scoring: str = "accuracy",  # must be one of https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        cv: int = 5,  # defaults to 5-fold CV
        n_jobs: int = -1,  #  defaults to -1 to use all cores
    ) -> tuple[list, list]:
        """Run GridSearchCV for all models on X_train and y_train of dataset.
        Returns:
            train_results: list of grid search results
            grid_list: list of trained grid objects
        """
        # Collected locally and stored on self only after every search has
        # finished, so a failing fit leaves previously stored results untouched.
        result_list, grid_list = [], []

        # zip stops at the shorter of the two lists, so a classifier name
        # without a matching param grid (or vice versa) is skipped.
        for classifier, param_grid in zip(self.classifiers, self.param_grids):
            print(f"Starting grid search for {classifier}")
            grid = GridSearchCV(
                estimator=self.pipeline,
                param_grid=param_grid,
                n_jobs=n_jobs,
                cv=cv,
                scoring=scoring,
                verbose=1,
            )

            # with parallel_backend("multiprocessing"):
            # Silence every warning raised in this process while fitting.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                grid.fit(self.X_train, self.y_train)

            result_list.append(pd.DataFrame.from_dict(grid.cv_results_))
            grid_list.append(grid)

        self.train_results = result_list
        self.grid_list = grid_list

        return self.train_results, self.grid_list

    def get_top_5_train_results(self) -> list:
        "Return top 5 results for each grid"
        # One DataFrame per grid search, best rank first.
        return [
            result.sort_values("rank_test_score")[:5]
            for result in self.train_results
        ]

    def predict(self) -> pd.DataFrame:
        """Get predictions on the dataset's X_test from best estimators of GridSearchCV.

        Returns a DataFrame with one row per trained grid holding the
        classifier name, its best params and the accuracy, recall, precision
        and f1 scores on the test set. The same frame is stored in
        `self.predict_results`.
        """
        if not self.grid_list:
            # Nothing has been trained yet: keep returning the empty frame as
            # before, but tell the caller why it is empty.
            warnings.warn(
                "predict() was called before train(); returning empty results."
            )

        results = []
        for classifier, grid in zip(self.classifiers, self.grid_list):
            preds = grid.predict(self.X_test)
            # The metric functions are called with their default settings.
            results.append(
                [
                    classifier,
                    grid.best_params_,
                    accuracy_score(self.y_test, preds),
                    recall_score(self.y_test, preds),
                    precision_score(self.y_test, preds),
                    f1_score(self.y_test, preds),
                ]
            )

        self.predict_results = pd.DataFrame(
            results,
            columns=[
                "classifier",
                "best_params",
                "accuracy",
                "recall",
                "precision",
                "f1",
            ],
        )
        return self.predict_results

    def _transform_all_features(
        self,
        pca_n_components: int,  # number of components to reduce to in PCA
    ) -> np.ndarray:  # preprocessed X (train rows first, then test rows)
        """Concatenate X_train and X_test and apply the enabled preprocessing steps."""
        X = np.concatenate((self.X_train, self.X_test), axis=0)
        assert (self.X_train.shape[0] + self.X_test.shape[0]) == X.shape[0]

        # Fresh transformers are fitted on the combined data each time; the
        # supervised `self.pipeline` is not reused here.
        if self.ohe:  # One Hot Encode X
            X = OneHotEncoder(handle_unknown="ignore", sparse=False).fit_transform(X)

        if self.scaler:  # Scale
            X = StandardScaler().fit_transform(X)

        if self.pca:  # Dim Reduce X
            X = PCA(n_components=pca_n_components).fit_transform(X)

        return X

    ## Unsupervised Learning ##

    def pick_k(
        self,
        max_clusters: int = 10,  # max number of clusters to try out
        pca_n_components: int = 50,  # number of components to reduce to in PCA
    ) -> np.ndarray:  # preprocessed X (PCA reduced when `pca` is enabled)
        """Plot elbow and silhouette curves & print silhouette scores to help determine the ideal 'k' for KMeans."""
        # `pca_n_components` here is independent of the value given to the
        # constructor; only this argument is used.
        X = self._transform_all_features(pca_n_components)

        # visualize elbow plot
        visualize_elbow(X, np.arange(2, max_clusters))
        # visualize silhouette scores and plot
        plot_silhouette_scores(max_clusters=max_clusters, X=X)

        return X

    def analyze_clusters(
        self,
        X_pca: np.ndarray,  # dim reduced X numpy ndarray
        k: int,  # the chosen value of k for KMeans
        random_state: int = 10,  # random state for KMeans
    ) -> None:
        """Perform KMeans clustering, print cluster counts and plot clusters from the result."""
        km = KMeans(n_clusters=k, random_state=random_state).fit(X_pca)
        print(f"Cluster counts: {Counter(km.labels_)}")
        visualize_clusters(km.labels_, X_pca)

    ## Semi Supervised Learning ##

    def run_label_spreading(
        self,
        pca_n_components: int = 50,  # number of components to reduce to in PCA
    ) -> None:
        """Run Label Spreading and print classification report."""
        # `pca_n_components` here is independent of the value given to the
        # constructor; only this argument is used.
        X = self._transform_all_features(pca_n_components)

        # concat y: the test rows get the label -1 so that only the training
        # labels are given to LabelSpreading.
        y = np.concatenate((self.y_train, np.full(self.y_test.shape, -1)), axis=0)
        assert (self.y_train.shape[0] + self.y_test.shape[0]) == y.shape[0]

        # Run LabelSpreading
        lbl_spread = LabelSpreading(kernel="knn", alpha=0.01)
        lbl_spread.fit(X, y)
        # Train rows come first in X, so everything after them belongs to the test set.
        semi_sup_preds = lbl_spread.transduction_[self.X_train.shape[0] :]
        assert semi_sup_preds.shape[0] == self.X_test.shape[0]

        # print result
        print(classification_report(self.y_test, semi_sup_preds))


In [None]:
show_doc(Learner, show_all_docments=True)

<h2 id="Learner" class="doc_header"><code>class</code> <code>Learner</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>Learner</code>(**`X_train`**:`ndarray`, **`y_train`**:`ndarray`, **`X_test`**:`ndarray`, **`y_test`**:`ndarray`, **`ohe`**:`bool`=*`False`*, **`scaler`**:`bool`=*`False`*, **`pca`**:`bool`=*`True`*, **`pca_n_components`**:`int`=*`50`*, **`param_grids`**:`list`=*`None`*)

```
Class for training and prediction.
```

||Type|Default|Details|
|---|---|---|---|
|**`X_train`**|`ndarray`||X_train numpy ndarray|
|**`y_train`**|`ndarray`||y_train numpy ndarray|
|**`X_test`**|`ndarray`||X_test numpy ndarray|
|**`y_test`**|`ndarray`||y_test numpy ndarray|
|**`ohe`**|`bool`|`False`|to use one hot encoding or not|
|**`scaler`**|`bool`|`False`|to use standard scaling or not|
|**`pca`**|`bool`|`True`|to use principal component analysis or not|
|**`pca_n_components`**|`int`|`50`|PCA number of components|
|**`param_grids`**|`list`|`None`|param_grid for grid search, if None - gets default grid from utils|


In [None]:
show_doc(Learner.create_pipeline, show_all_docments=True)

<h4 id="Learner.create_pipeline" class="doc_header"><code>Learner.create_pipeline</code><a href="__main__.py#L35" class="source_link" style="float:right">[source]</a></h4>

> <code>Learner.create_pipeline</code>()

```
Create and return pipeline
```



In [None]:
show_doc(Learner.train, show_all_docments=True)

<h4 id="Learner.train" class="doc_header"><code>Learner.train</code><a href="__main__.py#L51" class="source_link" style="float:right">[source]</a></h4>

> <code>Learner.train</code>(**`scoring`**:`str`=*`'accuracy'`*, **`cv`**:`int`=*`5`*, **`n_jobs`**:`int`=*`-1`*)

```
Run GridSearchCV for all models on X_train and y_train of dataset.
Returns:
    train_results: list of grid search results
    grid_list: list of trained grid objects
```

||Type|Default|Details|
|---|---|---|---|
|**`scoring`**|`str`|`accuracy`|must be one of https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter|
|**`cv`**|`int`|`5`|defaults to 5-fold CV|
|**`n_jobs`**|`int`|`-1`|defaults to -1 to use all cores|


In [None]:
show_doc(Learner.get_top_5_train_results, show_all_docments=True)

<h4 id="Learner.get_top_5_train_results" class="doc_header"><code>Learner.get_top_5_train_results</code><a href="__main__.py#L90" class="source_link" style="float:right">[source]</a></h4>

> <code>Learner.get_top_5_train_results</code>()

```
Return top 5 results for each grid
```



In [None]:
show_doc(Learner.predict, show_all_docments=True)

<h4 id="Learner.predict" class="doc_header"><code>Learner.predict</code><a href="__main__.py#L97" class="source_link" style="float:right">[source]</a></h4>

> <code>Learner.predict</code>()

```
Get predictions on the dataset's X_test from best estimators of GridSearchCV.
```



In [None]:
show_doc(Learner.pick_k, show_all_docments=True)

<h4 id="Learner.pick_k" class="doc_header"><code>Learner.pick_k</code><a href="__main__.py#L127" class="source_link" style="float:right">[source]</a></h4>

> <code>Learner.pick_k</code>(**`max_clusters`**:`int`=*`10`*, **`pca_n_components`**:`int`=*`50`*)

```
Plot elbow and silohutte curves & print silohutte scores to help determine the ideal 'k' for Kmeans.
```

||Type|Default|Details|
|---|---|---|---|
|**`max_clusters`**|`int`|`10`|max number of clusters to try out|
|**`pca_n_components`**|`int`|`50`|number of components to reduce to in PCA|


 The `pick_k` method does the following to help determine the ideal k for KMeans: 
 - It first concatenates X_train and X_test of this dataset into a single ndarray 'X'
 - then encodes X using OneHotEncoder (only if `ohe=True`)
 - then scales X using StandardScaler (only if `scaler=True`)
 - then reduces the dimensionality of X using PCA (only if `pca=True`)
 - then plots elbow & silhouette plots for X and prints silhouette scores, and returns the preprocessed X.

In [None]:
show_doc(Learner.analyze_clusters, show_all_docments=True)

<h4 id="Learner.analyze_clusters" class="doc_header"><code>Learner.analyze_clusters</code><a href="__main__.py#L158" class="source_link" style="float:right">[source]</a></h4>

> <code>Learner.analyze_clusters</code>(**`X_pca`**:`ndarray`, **`k`**:`int`, **`random_state`**:`int`=*`10`*)

```
Perform KMeans clustering, print cluster counts and plot clusters from the result.
```

||Type|Default|Details|
|---|---|---|---|
|**`X_pca`**|`ndarray`||dim reduced X numpy ndarray|
|**`k`**|`int`||the chosen value of k for KMeans|
|**`random_state`**|`int`|`10`|random state for KMeans|


In [None]:
show_doc(Learner.run_label_spreading, show_all_docments=True)

<h4 id="Learner.run_label_spreading" class="doc_header"><code>Learner.run_label_spreading</code><a href="__main__.py#L172" class="source_link" style="float:right">[source]</a></h4>

> <code>Learner.run_label_spreading</code>(**`pca_n_components`**:`int`=*`50`*)

```
Run Lanel Spreading and print classification report.
```

||Type|Default|Details|
|---|---|---|---|
|**`pca_n_components`**|`int`|`50`|number of components to reduce to in PCA|


## Export -

In [None]:
# hide
from nbdev.export import *

notebook2script()


Converted 00_basics.ipynb.
Converted 01_data.ipynb.
Converted 02_learn.ipynb.
Converted 03_onehot.ipynb.
Converted 04_lstm.ipynb.
Converted 05_transformer.ipynb.
Converted 99_utils.ipynb.
Converted index.ipynb.
