In [None]:
import pandas as pd
import numpy as np
import os

Preprocessing
1. Import Data 
2. Subset data (reduce runtime, improve mafft performance)
3. Align sequences (mafft)
4. Encode categorical values (optional: amino acids, gene annotations)
5. Collapse amino acids into functional groups (optional)

In [None]:
# Align sequences (mafft).
# Step 1: concatenate both inputs into one FASTA file.
os.system("cat in1.fasta in2.fasta > in-merged.fasta")
# Step 2: align the merged file. The input must be the file written by the
# `cat` step above (`in-merged.fasta`); the previous `in.fasta` is never
# produced by this cell, so the merged sequences were not being aligned.
os.system("mafft --auto --quiet in-merged.fasta > out.aln")

# convert to dataframe, add Y (gene annotation)
df = pd.read_csv('out.aln', sep="\t")  # example

Profile/Probabilistic Methods

Statistical Methods
1. ANOVA

In [None]:
# 1. ANOVA
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms

def OLS_sm(df,
           dependant_var='price',
           numeric_features=None,
           categorical_features=None,
           verbose=False,
           show_summary=True,
           show_plots=True,
           target_is_dollar=True):
    """
    ### Uses formula based statsmodels regression for OLS. ###

    Optionally displays a statsmodels summary of the OLS analysis and two
    residual diagnostic plots.
    Returns a statsmodels.regression.linear_model.RegressionResultsWrapper which can be used to access other options available.

    Parameters:
    ===========
        df = pandas.DataFrame; no default.
                Input dataset to use for OLS.
        dependant_var = str; default: 'price'.
                Dependent variable (left-hand side of the formula).
        numeric_features = list or None; default: None.
                Numeric features, used as-is in the formula.
                `None` is treated as an empty list.
        categorical_features = list or None; default: None.
                Categorical features, each wrapped as `C(name)` in the formula.
                `None` is treated as an empty list.
        verbose = boolean; default: False.
                `True` forces both `show_summary` and `show_plots` to True.
        show_summary = boolean; default: True.
                `True` displays the regression summary.
        show_plots = boolean; default: True.
                `True` draws a Q-Q plot of the residuals and a
                predicted-vs-residuals scatter plot.
        target_is_dollar = boolean; default: True.
                `True` applies `format_number` as the y-axis tick formatter
                of the residual scatter plot.

    Returns:
    ========
        The fitted model returned by `smf.ols(...).fit()`.

    Under-The-Hood:
    =============
    --{Major Steps}--

        ## Regression
        formula = '<dependant_var> ~ <numeric terms> + C(<categorical terms>)'
        If no feature is given at all, an intercept-only model
        ('<dependant_var> ~ 1') is fitted.

        ## plots
        # plot on the left:  Q-Q plot of `resid` against a normal distribution
        # plot on the right: `fittedvalues` vs `resid` scatter

    Note:
    =====
        Make sure that every column in the DataFrame has the correct dtype.
        Numeric values stored as str (i.e, object) will make statsmodels assume that those are categorical variables.
        If errors occur, check df to see if the passed feature is available in the DataFrame.
        The formula is always printed, regardless of `verbose`.
        NOTE(review): this function relies on `plt`, `stats`, `format_number`
        and `display` already existing in the notebook namespace (presumably
        matplotlib.pyplot, scipy.stats, a tick-formatter helper and IPython's
        display) - confirm they are imported/defined before calling.

    Issues:
    =======
        - Output control is not clear.

    Changelog:
    ==========
        - changed `resid`, was using `resid_pearson`.
        - feature lists default to None instead of a shared mutable `[]`.
        - formula no longer contains a dangling ' + ' / empty right-hand side
          when one or both feature lists are empty.
        - figure title size is passed through `fontsize`.

    -- ver: 1.3 --
    """
    # Build the right-hand side from whichever terms were actually supplied,
    # numeric terms first, so no stray '+' ends up in the formula.
    terms = [f'{x}' for x in (numeric_features or [])]
    terms += [f'C({x})' for x in (categorical_features or [])]
    # With no predictors, fall back to an explicit intercept-only model.
    rhs = ' + '.join(terms) if terms else '1'
    formula = f'{dependant_var} ~ {rhs}'
    print('Formula for the OLS model: ', formula)
    # OLS regressor
    multiple_regression = smf.ols(formula=formula, data=df).fit()

    if verbose:
        show_summary = True
        show_plots = True

    if show_summary:
        display(multiple_regression.summary())
    if show_plots:
        # plotting
        # plot 1: Q-Q plot of the residuals (left, wider panel)
        fig, (ax1,
              ax2) = plt.subplots(ncols=2,
                                  figsize=(10, 5),
                                  gridspec_kw={'width_ratios': [0.6, 0.4]})
        sm.qqplot(multiple_regression.resid,
                  dist=stats.norm,
                  line='45',
                  fit=True,
                  ax=ax1)
        ax1.set_title('Q-Q Plot', fontdict={"size": 15})
        # plot 2: predicted values vs residuals (right panel)
        # uses The predicted values for the original (unwhitened) design.
        ax2.scatter(x=multiple_regression.fittedvalues,
                    y=multiple_regression.resid,
                    s=4,
                    color='gold')
        if target_is_dollar:
            ax2.yaxis.set_major_formatter(format_number)
        ax2.set(xlabel='Predicted', ylabel='Residuals')
        # Reference line at zero residual.
        ax2.axhline(y=0, c='r', lw=4, ls='--')
        ax2.set_title('Predicted VS Residuals', fontdict={"size": 15})
        # `suptitle` documents `fontsize`, not `fontdict`, for the title size.
        plt.suptitle('Visual Check of Residuals for Homoscedasticity',
                     ha='center',
                     va='bottom',
                     fontsize=25)
        plt.tight_layout()
    # When plots are shown without the summary, still report the fit quality.
    # (`verbose` needs no separate check: it forces `show_summary` to True.)
    if show_plots and not show_summary:
        print('r_sq:', round(multiple_regression.rsquared, 4))
    return multiple_regression

# Univariate ANOVA
# Fit one OLS model per column (`annot ~ C(column)`) and collect summary
# statistics for each into `df_stat`, indexed by column name.
# The formula API handles categorical variables directly via `C(...)`, so no
# manual encoding is needed.

regression_target = 'annot'
stat_list = []
for column in df.columns:
    # The target must not be used as its own predictor.
    if column == regression_target:
        continue
    # A column with fewer than two observed levels contributes no coefficient
    # besides the intercept, so the `iloc[1]` lookups below would fail.
    if df[column].nunique() < 2:
        continue
    # Build the formula for THIS column only (previously an inner loop
    # overwrote it so that every iteration fitted the last column).
    formula = f'{regression_target} ~ C({column})'
    model = smf.ols(formula=formula, data=df).fit()
    temp_dict = {
        'name': column,
        'r_sq': model.rsquared,
        # `.iloc` makes the positional access explicit on the label-indexed
        # parameter Series.
        'intercept': model.params.iloc[0],
        # First non-intercept coefficient and its p-value only; a column with
        # more than two levels has further coefficients not recorded here.
        'beta': model.params.iloc[1],
        'p_val': model.pvalues.iloc[1],
        # First element of the Jarque-Bera result on the residuals.
        'Jarque-Bera': sms.jarque_bera(model.resid)[0],
    }
    stat_list.append(temp_dict)
df_stat = pd.DataFrame(stat_list).set_index('name')

# Multivariate ANOVA
# Regress the annotation column on all other columns at once, treating every
# predictor as categorical. The target is 'annot' (the same target as the
# univariate analysis); 'price' is not a column produced by this notebook.
OLS_sm(df=df,
       numeric_features=[],
       dependant_var='annot',
       # Exclude the target by name rather than assuming it is the last column.
       categorical_features=[col for col in df.columns if col != 'annot'],
       show_summary=False,
       # The target is not a monetary value, so skip the dollar axis formatter.
       target_is_dollar=False)

ML-based methods (Wrapper?)