diff --git a/causalml/dataset/classification.py b/causalml/dataset/classification.py index 5f61c55d..bb77ed59 100644 --- a/causalml/dataset/classification.py +++ b/causalml/dataset/classification.py @@ -2,9 +2,9 @@ import numpy as np import pandas as pd from sklearn.datasets import make_classification +from scipy.interpolate import UnivariateSpline from scipy.optimize import fsolve -from scipy.special import expit -from scipy.special import logit +from scipy.special import expit, logit # ------ Define a list of functions for feature transformation @@ -119,8 +119,9 @@ def _standardize(x): def _fixed_transformation(fs, x, f_index=0): """ Transform and standardize a vector by a transformation function. - If the given index is within the function list f_index < len(fs), then use fs[f_index] as the transformation function - otherwise, randomly choose a function from the function list. + If the given index is within the function list f_index < len(fs), then use fs[f_index] as the transformation + function. Otherwise, randomly choose a function from the function list. + Parameters ---------- fs : list @@ -160,7 +161,8 @@ def _random_transformation(fs, x): # @staticmethod def _softmax(z, p, xb): """ - Softmax function. This function is used to reversely solve the constant root value in the linear part to make the softmax function output mean to be a given value. + Softmax function. This function is used to reversely solve the constant root value in the linear part to make the + softmax function output mean to be a given value. Parameters ---------- @@ -201,7 +203,8 @@ def make_uplift_classification_logistic( n_samples : int, optional (default=1000) The number of samples to be generated for each treatment group. treatment_name: list, optional (default = ['control','treatment1','treatment2','treatment3']) - The list of treatment names. The first element must be 'control' as control group, and the rest are treated as treatment groups. + The list of treatment names. The first element must be 'control' as control group, and the rest are treated as + treatment groups. y_name: string, optional (default = 'conversion') The name of the outcome variable to be used as a column in the output dataframe. n_classification_features: int, optional (default = 10) @@ -218,7 +221,8 @@ def make_uplift_classification_logistic( n_mix_informative_uplift_dict: dictionary, optional (default: {'treatment1': 1, 'treatment2': 1, 'treatment3': 1}) Number of mix features for each treatment. The mix feature is defined as a linear combination of a randomly selected informative classification feature and a randomly selected uplift feature. - The mixture is made by a weighted sum (p*feature1 + (1-p)*feature2), where the weight p is drawn from a uniform distribution between 0 and 1. + The mixture is made by a weighted sum (p*feature1 + (1-p)*feature2), where the weight p is drawn from a uniform + distribution between 0 and 1. delta_uplift_dict: dictionary, optional (default: {'treatment1': .02, 'treatment2': .05, 'treatment3': -.05}) Treatment effect (delta), can be positive or negative. Dictionary of {treatment_key: delta}. @@ -227,14 +231,18 @@ def make_uplift_classification_logistic( random_seed : int, optional (default = 20200101) The random seed to be used in the data generation process. feature_association_list : list, optional (default = ['linear','quadratic','cubic','relu','sin','cos']) - List of uplift feature association patterns to the treatment effect. 
For example, if the feature pattern is 'quadratic', then the treatment effect will increase or decrease quadratically with the feature. - The values in the list must be one of ('linear','quadratic','cubic','relu','sin','cos'). However, the same value can appear multiple times in the list. + List of uplift feature association patterns to the treatment effect. For example, if the feature pattern is + 'quadratic', then the treatment effect will increase or decrease quadratically with the feature. + The values in the list must be one of ('linear','quadratic','cubic','relu','sin','cos'). However, the same + value can appear multiple times in the list. random_select_association : boolean, optional (default = True) - How the feature patterns are selected from the feature_association_list to be applied in the data generation process. - If random_select_association = True, then for every uplift feature, a random feature association pattern is selected from the list. - If random_select_association = False, then the feature association pattern is selected from the list in turns to be applied to each feature one by one. + How the feature patterns are selected from the feature_association_list to be applied in the data generation + process. If random_select_association = True, then for every uplift feature, a random feature association + pattern is selected from the list. If random_select_association = False, then the feature association pattern + is selected from the list in turns to be applied to each feature one by one. error_std : float, optional (default = 0.05) - Standard deviation to be used in the error term of the logistic regression. The error is drawn from a normal distribution with mean 0 and standard deviation specified in this argument. + Standard deviation to be used in the error term of the logistic regression. The error is drawn from a normal + distribution with mean 0 and standard deviation specified in this argument. Returns ------- @@ -273,7 +281,6 @@ def make_uplift_classification_logistic( f_list.append(feature_association_pattern_dict[fi]) # generate treatment key ------------------------------------------------# - n_all = n * len(treatment_name) treatment_list = [] for ti in treatment_name: treatment_list += [ti] * n @@ -518,14 +525,16 @@ def make_uplift_classification( delta_uplift_decrease_dict: dictionary, optional (default: {'treatment1': 0., 'treatment2': 0., 'treatment3': 0.}) Negative treatment effect created by the negative uplift features on the base classification label. Dictionary of {treatment_key: increase_delta}. - n_uplift_increase_mix_informative_dict: dictionary, optional (default: {'treatment1': 1, 'treatment2': 1, 'treatment3': 1}) + n_uplift_increase_mix_informative_dict: dictionary, optional Number of positive mix features for each treatment. The positive mix feature is defined as a linear combination of a randomly selected informative classification feature and a randomly selected positive uplift feature. The linear combination is made by two coefficients sampled from a uniform distribution between -1 and 1. - n_uplift_decrease_mix_informative_dict: dictionary, optional (default: {'treatment1': 0, 'treatment2': 0, 'treatment3': 0}) + default: {'treatment1': 1, 'treatment2': 1, 'treatment3': 1} + n_uplift_decrease_mix_informative_dict: dictionary, optional Number of negative mix features for each treatment. 
The negative mix feature is defined as a linear combination of a randomly selected informative classification feature and a randomly selected negative uplift feature. The linear combination is made by two coefficients sampled from a uniform distribution between -1 and 1. + default: {'treatment1': 0, 'treatment2': 0, 'treatment3': 0} positive_class_proportion: float, optional (default = 0.5) The proportion of positive label (1) in the control group. random_seed : int, optional (default = 20190101) diff --git a/causalml/dataset/synthetic.py b/causalml/dataset/synthetic.py index 1104f232..4fca03f1 100644 --- a/causalml/dataset/synthetic.py +++ b/causalml/dataset/synthetic.py @@ -1,6 +1,3 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function from matplotlib import pyplot as plt import numpy as np import pandas as pd diff --git a/causalml/feature_selection/filters.py b/causalml/feature_selection/filters.py index b5689915..f9249788 100644 --- a/causalml/feature_selection/filters.py +++ b/causalml/feature_selection/filters.py @@ -9,7 +9,6 @@ import statsmodels.api as sm from scipy import stats from sklearn.impute import SimpleImputer -import warnings class FilterSelect: @@ -25,10 +24,12 @@ def _filter_F_one_feature(data, treatment_indicator, feature_name, y_name, order Args: data (pd.Dataframe): DataFrame containing outcome, features, and experiment group - treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0) + treatment_indicator (string): the column name for binary indicator of treatment (1) or control (0) feature_name (string): feature name, as one column in the data DataFrame y_name (string): name of the outcome variable - order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature, + order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. + order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear + importance of the feature, order= 3 will calculate feature importance up to cubic forms. Returns: @@ -105,10 +106,12 @@ def filter_F(self, data, treatment_indicator, features, y_name, order=1): Args: data (pd.Dataframe): DataFrame containing outcome, features, and experiment group - treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0) + treatment_indicator (string): the column name for binary indicator of treatment (1) or control (0) features (list of string): list of feature names, that are columns in the data DataFrame y_name (string): name of the outcome variable - order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature, + order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. + order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear + importance of the feature, order= 3 will calculate feature importance up to cubic forms. 
        Returns:
@@ -143,10 +146,12 @@ def _filter_LR_one_feature(
        Args:
            data (pd.Dataframe): DataFrame containing outcome, features, and experiment group
-            treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0)
+            treatment_indicator (string): the column name for binary indicator of treatment (1) or control (0)
            feature_name (string): feature name, as one column in the data DataFrame
            y_name (string): name of the outcome variable
-            order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature,
+            order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3.
+                order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear
+                importance of the feature,
                order= 3 will calculate feature importance up to cubic forms.

        Returns:
@@ -222,10 +227,12 @@ def filter_LR(
        Args:
            data (pd.Dataframe): DataFrame containing outcome, features, and experiment group
-            treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0)
+            treatment_indicator (string): the column name for binary indicator of treatment (1) or control (0)
            feature_name (string): feature name, as one column in the data DataFrame
            y_name (string): name of the outcome variable
-            order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature,
+            order (int): the order of feature to be evaluated with the treatment effect, order takes 3 values: 1,2,3.
+                order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear
+                importance of the feature,
                order= 3 will calculate feature importance up to cubic forms.

        Returns:
@@ -261,7 +268,8 @@ def _GetNodeSummary(
        smooth=True,
    ):
        """
-        To count the conversions and get the probabilities by treatment groups. This function comes from the uplift tree algorithm, that is used for tree node split evaluation.
+        To count the conversions and get the probabilities by treatment groups. This function comes from the uplift
+        tree algorithm, where it is used for tree node split evaluation.

        Parameters
        ----------
@@ -420,18 +428,23 @@ def _filter_D_one_feature(
        Args:
            data (pd.Dataframe): DataFrame containing outcome, features, and experiment group
-            treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0)
+            treatment_indicator (string): the column name for binary indicator of treatment (1) or control (0)
            feature_name (string): feature name, as one column in the data DataFrame
            y_name (string): name of the outcome variable
            method (string, optional, default = 'KL'): taking one of the following values {'F', 'LR', 'KL', 'ED', 'Chi'}
-                The feature selection method to be used to rank the features.
-                'F' for F-test
-                'LR' for likelihood ratio test
-                'KL', 'ED', 'Chi' for bin-based uplift filter methods, KL divergence, Euclidean distance, Chi-Square respectively
-            experiment_group_column (string, optional, default = 'treatment_group_key'): the experiment column name in the DataFrame, which contains the treatment and control assignment label
-            control_group (string, optional, default = 'control'): name for control group, value in the experiment group column
+                The feature selection method to be used to rank the features.
+                'F' for F-test
+                'LR' for likelihood ratio test
+                'KL', 'ED', 'Chi' for bin-based uplift filter methods, KL divergence, Euclidean distance,
+                Chi-Square respectively
+            experiment_group_column (string, optional, default = 'treatment_group_key'): the experiment column name in
+                the DataFrame, which contains the treatment and control assignment label
+            control_group (string, optional, default = 'control'): name for control group, value in the experiment
+                group column
            n_bins (int, optional, default = 10): number of bins to be used for bin-based uplift filter methods
-            null_impute (str, optional, default=None): impute np.nan present in the data taking on of the following strategy values {'mean', 'median', 'most_frequent', None}. If Value is None and null is present then exception will be raised
+            null_impute (str, optional, default=None): impute np.nan present in the data taking one of the following
+                strategy values {'mean', 'median', 'most_frequent', None}. If value is None and null is present then
+                exception will be raised

        Returns:
            D_result : pd.DataFrame
@@ -455,9 +468,8 @@ def _filter_D_one_feature(
            ).fit_transform(data[feature_name].values.reshape(-1, 1))
        elif data[feature_name].isna().any():
            raise Exception(
-                "Null value(s) present in column '{}'. Please impute the null value or use null_impute parameter provided!!!".format(
-                    feature_name
-                )
+                "Null value(s) present in column '{}'. Please impute the null value or use the "
+                "null_impute parameter.".format(feature_name)
            )

        # drop duplicate edges in pq.cut result to avoid issues
@@ -515,18 +527,23 @@ def filter_D(
        Args:
            data (pd.Dataframe): DataFrame containing outcome, features, and experiment group
-            treatment_indicator (string): the column name for binary indicator of treatment (value 1) or control (value 0)
+            treatment_indicator (string): the column name for binary indicator of treatment (1) or control (0)
            features (list of string): list of feature names, that are columns in the data DataFrame
            y_name (string): name of the outcome variable
            method (string, optional, default = 'KL'): taking one of the following values {'F', 'LR', 'KL', 'ED', 'Chi'}
-                The feature selection method to be used to rank the features.
-                'F' for F-test
-                'LR' for likelihood ratio test
-                'KL', 'ED', 'Chi' for bin-based uplift filter methods, KL divergence, Euclidean distance, Chi-Square respectively
-            experiment_group_column (string, optional, default = 'treatment_group_key'): the experiment column name in the DataFrame, which contains the treatment and control assignment label
-            control_group (string, optional, default = 'control'): name for control group, value in the experiment group column
+                The feature selection method to be used to rank the features.
+                'F' for F-test
+                'LR' for likelihood ratio test
+                'KL', 'ED', 'Chi' for bin-based uplift filter methods, KL divergence, Euclidean distance, Chi-Square
+                respectively
+            experiment_group_column (string, optional, default = 'treatment_group_key'): the experiment column name in
+                the DataFrame, which contains the treatment and control assignment label
+            control_group (string, optional, default = 'control'): name for control group, value in the experiment
+                group column
            n_bins (int, optional, default = 10): number of bins to be used for bin-based uplift filter methods
-            null_impute (str, optional, default=None): impute np.nan present in the data taking on of the following strategy values {'mean', 'median', 'most_frequent', None}. If Value is None and null is present then exception will be raised
+            null_impute (str, optional, default=None): impute np.nan present in the data taking one of the following
+                strategy values {'mean', 'median', 'most_frequent', None}. If value is None and null is present then
+                exception will be raised

        Returns:
            all_result : pd.DataFrame
@@ -578,13 +595,19 @@ def get_importance(
                The feature selection method to be used to rank the features.
                'F' for F-test
                'LR' for likelihood ratio test
-                'KL', 'ED', 'Chi' for bin-based uplift filter methods, KL divergence, Euclidean distance, Chi-Square respectively
-            experiment_group_column (string): the experiment column name in the DataFrame, which contains the treatment and control assignment label
+                'KL', 'ED', 'Chi' for bin-based uplift filter methods, KL divergence, Euclidean distance, Chi-Square
+                respectively
+            experiment_group_column (string): the experiment column name in the DataFrame, which contains the treatment
+                and control assignment label
            control_group (string): name for control group, value in the experiment group column
            treatment_group (string): name for treatment group, value in the experiment group column
            n_bins (int, optional): number of bins to be used for bin-based uplift filter methods
-            null_impute (str, optional, default=None): impute np.nan present in the data taking on of the following strategy values {'mean', 'median', 'most_frequent', None}. If value is None and null is present then exception will be raised
-            order (int): the order of feature to be evaluated with the treatment effect for F filter and LR filter, order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2 corresponds to quadratic and linear importance of the feature,
+            null_impute (str, optional, default=None): impute np.nan present in the data taking one of the following
+                strategy values {'mean', 'median', 'most_frequent', None}. If value is None and null is present then
+                exception will be raised
+            order (int): the order of feature to be evaluated with the treatment effect for F filter and LR filter,
+                order takes 3 values: 1,2,3. order = 1 corresponds to linear importance of the feature, order=2
+                corresponds to quadratic and linear importance of the feature,
                order= 3 will calculate feature importance up to cubic forms.
            disp (bool): Set to True to print convergence messages for Logistic regression convergence in LR method.
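
For context, the FilterSelect API documented above can be exercised as in the following sketch (not part of the patch). It assumes the synthetic data and default column names produced by causalml's make_uplift_classification helper; argument defaults may differ between versions.

# Illustrative sketch only: F-test and KL filters via FilterSelect.get_importance.
# Column names ('treatment_group_key', 'conversion', 'control', 'treatment1') are
# the defaults of make_uplift_classification() and are assumptions here.
from causalml.dataset import make_uplift_classification
from causalml.feature_selection.filters import FilterSelect

# df contains a 'treatment_group_key' column and a binary 'conversion' outcome;
# x_names lists the generated feature columns.
df, x_names = make_uplift_classification()

fs = FilterSelect()

# F filter: ranks features by the significance of their interaction with the
# treatment indicator (order=1 tests the linear interaction only).
f_imp = fs.get_importance(
    data=df,
    features=x_names,
    y_name="conversion",
    method="F",
    experiment_group_column="treatment_group_key",
    control_group="control",
    treatment_group="treatment1",
    order=1,
)

# Bin-based KL filter: discretizes each feature into n_bins bins and compares
# the treatment and control outcome distributions per bin.
kl_imp = fs.get_importance(
    data=df,
    features=x_names,
    y_name="conversion",
    method="KL",
    experiment_group_column="treatment_group_key",
    control_group="control",
    treatment_group="treatment1",
    n_bins=10,
)

print(f_imp.head())
print(kl_imp.head())
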
diff --git a/causalml/inference/iv/drivlearner.py b/causalml/inference/iv/drivlearner.py
index 3f13bdd5..34a53de0 100644
--- a/causalml/inference/iv/drivlearner.py
+++ b/causalml/inference/iv/drivlearner.py
@@ -9,23 +9,24 @@
    check_p_conditions,
    convert_pd_to_np,
)
-from causalml.metrics import regression_metrics, classification_metrics
+from causalml.metrics import regression_metrics
from causalml.propensity import compute_propensity_score
from scipy.stats import norm
-from sklearn.model_selection import cross_val_predict, KFold
+from sklearn.model_selection import KFold
from tqdm import tqdm
from xgboost import XGBRegressor

logger = logging.getLogger("causalml")


-class BaseDRIVLearner(object):
+class BaseDRIVLearner:
    """A parent class for DRIV-learner regressor classes.

    A DRIV-learner estimates endogenous treatment effects for compliers with machine learning models.

-    Details of DR-learner are available at Kennedy (2020) (https://arxiv.org/abs/2004.14497).
-    The DR moment condition for LATE comes from Chernozhukov et al (2018) (https://academic.oup.com/ectj/article/21/1/C1/5056401).
+    Details of DR-learner are available at `Kennedy (2020) <https://arxiv.org/abs/2004.14497>`_.
+    The DR moment condition for LATE comes from
+    `Chernozhukov et al (2018) <https://academic.oup.com/ectj/article/21/1/C1/5056401>`_.
    """

    def __init__(
@@ -103,9 +104,9 @@ def fit(
            treatment (np.array or pd.Series): a treatment vector
            y (np.array or pd.Series): an outcome vector
            p (2-tuple of np.ndarray or pd.Series or dict, optional): The first (second) element corresponds to
-                unassigned (assigned) units. Each is an array of propensity scores of float (0,1) in the single-treatment
-                case; or, a dictionary of treatment groups that map to propensity vectors of float (0,1). If None will run
-                ElasticNetPropensityModel() to generate the propensity scores.
+                unassigned (assigned) units. Each is an array of propensity scores of float (0,1) in the
+                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of float
+                (0,1). If None will run ElasticNetPropensityModel() to generate the propensity scores.
            pZ (np.array or pd.Series, optional): an array of assignment probability of float (0,1); if None will run
                ElasticNetPropensityModel() to generate the assignment probability score.
            seed (int): random seed for cross-fitting
@@ -301,7 +302,6 @@ def predict(self, X, treatment=None, y=None, return_components=False, verbose=Tr
        if (y is not None) and (treatment is not None) and verbose:
            mask = (treatment == group) | (treatment == self.control_name)
            treatment_filt = treatment[mask]
-            X_filt = X[mask]
            y_filt = y[mask]

            w = (treatment_filt == group).astype(int)
@@ -344,9 +344,9 @@ def fit_predict(
            treatment (np.array or pd.Series): a treatment vector
            y (np.array or pd.Series): an outcome vector
            p (2-tuple of np.ndarray or pd.Series or dict, optional): The first (second) element corresponds to
-                unassigned (assigned) units. Each is an array of propensity scores of float (0,1) in the single-treatment
-                case; or, a dictionary of treatment groups that map to propensity vectors of float (0,1). If None will run
-                ElasticNetPropensityModel() to generate the propensity scores.
+                unassigned (assigned) units. Each is an array of propensity scores of float (0,1) in the
+                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of float
+                (0,1). If None will run ElasticNetPropensityModel() to generate the propensity scores.
            pZ (np.array or pd.Series, optional): an array of assignment probability of float (0,1); if None will run
                ElasticNetPropensityModel() to generate the assignment probability score.
            return_ci (bool): whether to return confidence intervals
@@ -453,9 +453,9 @@ def estimate_ate(
            treatment (np.array or pd.Series): a treatment vector
            y (np.array or pd.Series): an outcome vector
            p (2-tuple of np.ndarray or pd.Series or dict, optional): The first (second) element corresponds to
-                unassigned (assigned) units. Each is an array of propensity scores of float (0,1) in the single-treatment
-                case; or, a dictionary of treatment groups that map to propensity vectors of float (0,1). If None will run
-                ElasticNetPropensityModel() to generate the propensity scores.
+                unassigned (assigned) units. Each is an array of propensity scores of float (0,1) in the
+                single-treatment case; or, a dictionary of treatment groups that map to propensity vectors of float
+                (0,1). If None will run ElasticNetPropensityModel() to generate the propensity scores.
            pZ (np.array or pd.Series, optional): an array of assignment probability of float (0,1); if None will run
                ElasticNetPropensityModel() to generate the assignment probability score.
            bootstrap_ci (bool): whether run bootstrap for confidence intervals
diff --git a/causalml/inference/iv/iv_regression.py b/causalml/inference/iv/iv_regression.py
index 41957e13..612c8b8e 100644
--- a/causalml/inference/iv/iv_regression.py
+++ b/causalml/inference/iv/iv_regression.py
@@ -5,7 +5,7 @@
from statsmodels.sandbox.regression.gmm import IV2SLS


-class IVRegressor(object):
+class IVRegressor:
    """A wrapper class that uses IV2SLS from statsmodel

    A linear 2SLS model that estimates the average treatment effect with endogenous treatment variable.
diff --git a/causalml/inference/meta/drlearner.py b/causalml/inference/meta/drlearner.py
index 4e554403..2bdb93dc 100644
--- a/causalml/inference/meta/drlearner.py
+++ b/causalml/inference/meta/drlearner.py
@@ -3,7 +3,7 @@
import numpy as np
import pandas as pd
from scipy.stats import norm
-from sklearn.model_selection import cross_val_predict, KFold
+from sklearn.model_selection import KFold
from tqdm import tqdm
from xgboost import XGBRegressor

@@ -13,7 +13,7 @@
    check_p_conditions,
    convert_pd_to_np,
)
-from causalml.metrics import regression_metrics, classification_metrics
+from causalml.metrics import regression_metrics
from causalml.propensity import compute_propensity_score


@@ -25,7 +25,7 @@ class BaseDRLearner(BaseLearner):

    A DR-learner estimates treatment effects with machine learning models.

-    Details of DR-learner are available at Kennedy (2020) (https://arxiv.org/abs/2004.14497).
+    Details of DR-learner are available at `Kennedy (2020) <https://arxiv.org/abs/2004.14497>`_.
""" def __init__( @@ -235,7 +235,6 @@ def predict( if (y is not None) and (treatment is not None) and verbose: mask = (treatment == group) | (treatment == self.control_name) treatment_filt = treatment[mask] - X_filt = X[mask] y_filt = y[mask] w = (treatment_filt == group).astype(int) diff --git a/causalml/inference/meta/explainer.py b/causalml/inference/meta/explainer.py index 9db7aa9a..b92fee86 100644 --- a/causalml/inference/meta/explainer.py +++ b/causalml/inference/meta/explainer.py @@ -11,7 +11,7 @@ VALID_METHODS = ("auto", "permutation", "shapley") -class Explainer(object): +class Explainer: def __init__( self, method, @@ -205,7 +205,8 @@ def plot_importance(self, importance_dict=None, title_prefix="", figsize=(12, 8) Calculates and plots feature importances for each treatment group, based on specified method in __init__. Skips the calculation part if importance_dict is given. Args: - importance_dict (optional, dict): a dict of feature importance matrics. If None, importance_dict will be computed. + importance_dict (optional, dict): a dict of feature importance matrics. If None, importance_dict will be + computed. title_prefix (optional, str): a prefix to the title of the plot. figsize (optional, tuple): the size of the figure. """ diff --git a/causalml/inference/meta/rlearner.py b/causalml/inference/meta/rlearner.py index 2adf8eac..915efdbc 100644 --- a/causalml/inference/meta/rlearner.py +++ b/causalml/inference/meta/rlearner.py @@ -1,7 +1,6 @@ from copy import deepcopy import logging import numpy as np -import pandas as pd from tqdm import tqdm from scipy.stats import norm from sklearn.model_selection import cross_val_predict, KFold, train_test_split @@ -14,7 +13,7 @@ convert_pd_to_np, get_weighted_variance, ) -from causalml.propensity import compute_propensity_score, ElasticNetPropensityModel +from causalml.propensity import ElasticNetPropensityModel logger = logging.getLogger("causalml") @@ -25,7 +24,7 @@ class BaseRLearner(BaseLearner): An R-learner estimates treatment effects with two machine learning models and the propensity score. - Details of R-learner are available at Nie and Wager (2019) (https://arxiv.org/abs/1712.04912). + Details of R-learner are available at `Nie and Wager (2019) `_. """ def __init__( @@ -53,7 +52,8 @@ def __init__( control_name (str or int, optional): name of control group n_fold (int, optional): the number of cross validation folds for outcome_learner random_state (int or RandomState, optional): a seed (int) or random number generator (RandomState) - cv_n_jobs (int, optional): number of parallel jobs to run for cross_val_predict. -1 means using all processors + cv_n_jobs (int, optional): number of parallel jobs to run for cross_val_predict. -1 means using all + processors """ assert (learner is not None) or ( (outcome_learner is not None) and (effect_learner is not None) @@ -651,7 +651,6 @@ def fit(self, X, treatment, y, p=None, sample_weight=None, verbose=True): random_state=self.random_state, ) - weight = sample_weight_filt self.models_tau[group].fit( X=X_train_filt, y=(y_train_filt - yhat_train_filt) / (w_train - p_train_filt), diff --git a/causalml/inference/meta/slearner.py b/causalml/inference/meta/slearner.py index c1390499..a8a5a12f 100644 --- a/causalml/inference/meta/slearner.py +++ b/causalml/inference/meta/slearner.py @@ -47,7 +47,7 @@ def predict(self, X): class BaseSLearner(BaseLearner): """A parent class for S-learner classes. An S-learner estimates treatment effects with one machine learning model. 
-    Details of S-learner are available at Kunzel et al. (2018) (https://arxiv.org/abs/1706.03461).
+    Details of S-learner are available at `Kunzel et al. (2018) <https://arxiv.org/abs/1706.03461>`_.
    """

    def __init__(self, learner=None, ate_alpha=0.05, control_name=0):
diff --git a/causalml/inference/meta/tlearner.py b/causalml/inference/meta/tlearner.py
index a7a13ed8..7f0dff92 100644
--- a/causalml/inference/meta/tlearner.py
+++ b/causalml/inference/meta/tlearner.py
@@ -27,7 +27,7 @@ class BaseTLearner(BaseLearner):

    A T-learner estimates treatment effects with two machine learning models.

-    Details of T-learner are available at Kunzel et al. (2018) (https://arxiv.org/abs/1706.03461).
+    Details of T-learner are available at `Kunzel et al. (2018) <https://arxiv.org/abs/1706.03461>`_.
    """

    def __init__(
diff --git a/causalml/inference/meta/tmle.py b/causalml/inference/meta/tmle.py
index 46bcd2ca..44c5bb80 100644
--- a/causalml/inference/meta/tmle.py
+++ b/causalml/inference/meta/tmle.py
@@ -93,7 +93,7 @@ def simple_tmle(y, w, q0w, q1w, p, alpha=0.0001):
    return np.mean(q1star - q0star), np.sqrt(np.var(ic) / np.size(y))


-class TMLELearner(object):
+class TMLELearner:
    """Targeted maximum likelihood estimation.

    Ref: Gruber, S., & Van Der Laan, M. J. (2009). Targeted maximum likelihood estimation: A gentle introduction.
diff --git a/causalml/inference/meta/xlearner.py b/causalml/inference/meta/xlearner.py
index 1a556ead..1bd1c525 100644
--- a/causalml/inference/meta/xlearner.py
+++ b/causalml/inference/meta/xlearner.py
@@ -1,19 +1,15 @@
from copy import deepcopy
import logging
import numpy as np
-import pandas as pd
from tqdm import tqdm
from scipy.stats import norm

from causalml.inference.meta.base import BaseLearner
from causalml.inference.meta.utils import (
    check_treatment_vector,
-    check_p_conditions,
    convert_pd_to_np,
)
-from causalml.inference.meta.explainer import Explainer
from causalml.metrics import regression_metrics, classification_metrics
-from causalml.propensity import compute_propensity_score

logger = logging.getLogger("causalml")

@@ -23,7 +19,7 @@ class BaseXLearner(BaseLearner):

    An X-learner estimates treatment effects with four machine learning models.

-    Details of X-learner are available at Kunzel et al. (2018) (https://arxiv.org/abs/1706.03461).
+    Details of X-learner are available at `Kunzel et al. (2018) <https://arxiv.org/abs/1706.03461>`_.
    """

    def __init__(
diff --git a/causalml/inference/tree/causal/causaltree.py b/causalml/inference/tree/causal/causaltree.py
index 5e6c8180..f37941e9 100755
--- a/causalml/inference/tree/causal/causaltree.py
+++ b/causalml/inference/tree/causal/causaltree.py
@@ -1,5 +1,4 @@
import logging
-import sys
from typing import Union

import tqdm
@@ -23,7 +22,7 @@ class CausalTreeRegressor(RegressorMixin, BaseCausalDecisionTree):
    """A Causal Tree regressor class.
    The Causal Tree is a decision tree regressor with a split criteria for treatment effects.

-    Details are available at Athey and Imbens (2015) (https://arxiv.org/abs/1504.01132)
+    Details are available at `Athey and Imbens (2015) <https://arxiv.org/abs/1504.01132>`_.
""" def __init__( diff --git a/causalml/inference/tree/plot.py b/causalml/inference/tree/plot.py index 595af7cd..6e17eb8d 100644 --- a/causalml/inference/tree/plot.py +++ b/causalml/inference/tree/plot.py @@ -4,6 +4,7 @@ """ from collections import defaultdict +from typing import Union import matplotlib.pyplot as plt import numpy as np @@ -399,7 +400,7 @@ def __init__( self.treatment_groups = treatment_groups def node_to_str( - self, tree: _tree.Tree, node_id: int, criterion: str or object + self, tree: _tree.Tree, node_id: int, criterion: Union[str, object] ) -> str: """ Generate the node content string diff --git a/causalml/inference/tree/utils.py b/causalml/inference/tree/utils.py index a0c8ccd9..fbd22efe 100644 --- a/causalml/inference/tree/utils.py +++ b/causalml/inference/tree/utils.py @@ -348,8 +348,9 @@ def wrapped(*args, **kw): te = time.time() display_kw = {k: v for k, v in kw.items() if k not in exclude_kwargs} print( - f"Function: %s Kwargs:%r Elapsed time: %2.4f" - % (f.__name__, display_kw, te - ts) + "Function: {} Kwargs: {} Elapsed time: {:2.4f}".format( + f.__name__, display_kw, te - ts + ) ) return result diff --git a/causalml/match.py b/causalml/match.py index 23e901a0..b076a5f4 100644 --- a/causalml/match.py +++ b/causalml/match.py @@ -81,7 +81,7 @@ def create_table_one(data, treatment_col, features, with_std=True, with_counts=T return t1 -class NearestNeighborMatch(object): +class NearestNeighborMatch: """ Propensity score matching based on the nearest neighbor algorithm. @@ -233,7 +233,7 @@ def match_by_group(self, data, treatment_col, score_cols, groupby_col): return matched.reset_index(level=0, drop=True) -class MatchOptimizer(object): +class MatchOptimizer: def __init__( self, treatment_col="is_treatment", diff --git a/causalml/metrics/sensitivity.py b/causalml/metrics/sensitivity.py index 8a7686ed..3bd186d2 100644 --- a/causalml/metrics/sensitivity.py +++ b/causalml/metrics/sensitivity.py @@ -1,7 +1,6 @@ import logging import numpy as np import pandas as pd -from collections import defaultdict import matplotlib.pyplot as plt from importlib import import_module @@ -77,7 +76,7 @@ def alignment_att(alpha, p, treatment): return adj -class Sensitivity(object): +class Sensitivity: """A Sensitivity Check class to support Placebo Treatment, Irrelevant Additional Confounder and Subset validation refutation methods to verify causal inference. @@ -144,9 +143,6 @@ def get_ate_ci(self, X, p, treatment, y): (numpy.ndarray): Mean and confidence interval (LB, UB) of the ATE estimate. """ - learner = self.learner - from ..inference.meta.tlearner import BaseTLearner - try: ate, ate_lower, ate_upper = self.learner.estimate_ate( X=X, p=p, treatment=treatment, y=y, return_ci=True diff --git a/causalml/optimize/pns.py b/causalml/optimize/pns.py index 5d2b6098..6e62ce46 100644 --- a/causalml/optimize/pns.py +++ b/causalml/optimize/pns.py @@ -1,7 +1,3 @@ -import numpy as np -import pandas as pd - - def get_pns_bounds(data_exp, data_obs, T, Y, type="PNS"): """ Args @@ -14,26 +10,26 @@ def get_pns_bounds(data_exp, data_obs, T, Y, type="PNS"): Name of the binary treatment indicator y : str Name of the binary outcome indicator - 'type' : str + type : str Type of probability of causation desired. 
        Acceptable args are:

-        * 'PNS': Probability of necessary and sufficient causation
-        * 'PS': Probability of sufficient causation
-        * 'PN': Probability of necessary causation
+        - ``PNS``: Probability of necessary and sufficient causation
+        - ``PS``: Probability of sufficient causation
+        - ``PN``: Probability of necessary causation

    Notes
    -----
-    Based on Equation (24) in Tian and Pearl: https://ftp.cs.ucla.edu/pub/stat_ser/r271-A.pdf
+    Based on Equation (24) in `Tian and Pearl (2000) <https://ftp.cs.ucla.edu/pub/stat_ser/r271-A.pdf>`_.

-    To capture the counterfactual notation, we use `1' and `0' to indicate the actual and
-    counterfactual values of a variable, respectively, and we use `do' to indicate the effect
+    To capture the counterfactual notation, we use ``1`` and ``0`` to indicate the actual and
+    counterfactual values of a variable, respectively, and we use ``do`` to indicate the effect
    of an intervention.

    The experimental and observational data are either assumed to come to the same population,
    or from random samples of the population. If the data are from a sample, the bounds may
    be incorrectly calculated because the relevant quantities in the Tian-Pearl equations are
-    defined e.g. as P(YifT), not P(YifT \mid S) where S corresponds to sample selection.
-    Bareinboim and Pearl (https://www.pnas.org/doi/10.1073/pnas.1510507113) discuss conditions
-    under which P(YifT) can be recovered from P(YifT \mid S).
+    defined e.g. as :math:`P(Y|do(T))`, not :math:`P(Y|do(T), S)` where :math:`S` corresponds to sample selection.
+    `Bareinboim and Pearl (2016) <https://www.pnas.org/doi/10.1073/pnas.1510507113>`_ discuss conditions
+    under which :math:`P(Y|do(T))` can be recovered from :math:`P(Y|do(T), S)`.
    """

    # Probabilities calculated from observational data
diff --git a/causalml/optimize/policylearner.py b/causalml/optimize/policylearner.py
index 004b5eaa..48e003a3 100644
--- a/causalml/optimize/policylearner.py
+++ b/causalml/optimize/policylearner.py
@@ -3,19 +3,19 @@
import numpy as np
from causalml.propensity import compute_propensity_score
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
-from sklearn.model_selection import cross_val_predict, KFold
+from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

logger = logging.getLogger("causalml")


-class PolicyLearner(object):
+class PolicyLearner:
    """
    A Learner that learns a treatment assignment policy with observational data using doubly robust estimator
    of causal effect for binary treatment.

-    Details of the policy learner are available at Athey and Wager (2018) (https://arxiv.org/abs/1702.02896).
+    Details of the policy learner are available at `Athey and Wager (2018) <https://arxiv.org/abs/1702.02896>`_.
""" @@ -58,7 +58,7 @@ def __repr__(self): return ( "{}(model_mu={},\n" "\tmodel_w={},\n" - "\model_pi={})".format( + "\tmodel_pi={})".format( self.__class__.__name__, self.model_mu.__repr__(), self.model_w.__repr__(), @@ -73,7 +73,7 @@ def _outcome_estimate(self, X, w, y): for train_index, test_index in self.cv.split(y): X_train, X_test = X[train_index], X[test_index] w_train, w_test = w[train_index], w[test_index] - y_train, y_test = y[train_index], y[test_index] + y_train, _ = y[train_index], y[test_index] self.model_mu.fit( np.concatenate([X_train, w_train.reshape(-1, 1)], axis=1), y_train diff --git a/causalml/optimize/unit_selection.py b/causalml/optimize/unit_selection.py index 78977660..0d4232f2 100644 --- a/causalml/optimize/unit_selection.py +++ b/causalml/optimize/unit_selection.py @@ -1,5 +1,4 @@ import numpy as np -import pandas as pd from sklearn.base import clone diff --git a/causalml/optimize/value_optimization.py b/causalml/optimize/value_optimization.py index adba267f..a286e49d 100644 --- a/causalml/optimize/value_optimization.py +++ b/causalml/optimize/value_optimization.py @@ -27,10 +27,12 @@ class CounterfactualValueEstimator: Value of converting each unit. conversion_cost : shape = (num_samples, len(set(treatment))) - The cost of a treatment that is triggered if a unit converts after having been in the treatment, such as a promotion code. + The cost of a treatment that is triggered if a unit converts after having been in the treatment, such as a + promotion code. impression_cost : shape = (num_samples, len(set(treatment))) - The cost of a treatment that is the same for each unit whether or not they convert, such as a cost associated with a promotion channel. + The cost of a treatment that is the same for each unit whether or not they convert, such as a cost associated + with a promotion channel. 
Notes diff --git a/tests/test_sensitivity.py b/tests/test_sensitivity.py index 7690bbc7..7c0b0248 100644 --- a/tests/test_sensitivity.py +++ b/tests/test_sensitivity.py @@ -1,9 +1,16 @@ import pandas as pd +import pytest import numpy as np from sklearn.linear_model import LinearRegression from causalml.dataset import synthetic_data -from causalml.inference.meta import BaseXLearner +from causalml.inference.meta import ( + BaseSLearner, + BaseTLearner, + XGBTRegressor, + BaseXLearner, + BaseRLearner, +) from causalml.metrics.sensitivity import Sensitivity from causalml.metrics.sensitivity import ( SensitivityPlaceboTreatment, @@ -23,7 +30,17 @@ from .const import TREATMENT_COL, SCORE_COL, OUTCOME_COL, NUM_FEATURES -def test_Sensitivity(): +@pytest.mark.parametrize( + "learner", + [ + BaseSLearner(LinearRegression()), + BaseTLearner(LinearRegression()), + XGBTRegressor(), + BaseXLearner(LinearRegression()), + BaseRLearner(LinearRegression()), + ], +) +def test_Sensitivity(learner): y, X, treatment, tau, b, e = synthetic_data( mode=1, n=100000, p=NUM_FEATURES, sigma=1.0 ) @@ -36,7 +53,6 @@ def test_Sensitivity(): df[SCORE_COL] = e # calling the Base XLearner class and return the sensitivity analysis summary report - learner = BaseXLearner(LinearRegression()) sens = Sensitivity( df=df, inference_features=INFERENCE_FEATURES, diff --git a/tox.ini b/tox.ini index 9578554c..b8f42e80 100644 --- a/tox.ini +++ b/tox.ini @@ -8,17 +8,17 @@ commands = [flake8] max-line-length = 120 -ignore = E121, # Continuation line under-indented for hanging indent - E123, # Closing bracket does not match indentation of opening bracket's line - E126, # Continuation line over-indented for hanging indent - E128, # Continuation line under-indented for visual indent - E129, # Visually indented line with same indent as next logical line - E226, # Missing whitespace around arithmetic operator - E24, # - E704, # Multiple statements on one line (def) - E731, # Do not assign a lambda expression, use a def - E741, # Ambiguous variable name - W503 # Line break before binary operator - W504 # Line break after binary operator +ignore = E121, + E123, + E126, + E128, + E129, + E226, + E24, + E704, + E731, + E741, + W503, + W504 builtins = __builtins__
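
For context, the parametrized test above runs the same Sensitivity workflow once per learner. The following standalone sketch (not part of the patch) shows that workflow; the constructor arguments mirror tests/test_sensitivity.py, while the sensitivity_analysis call and its refutation method names are taken from the causalml documentation and may vary between versions.

# Illustrative sketch only: one Sensitivity run for a single learner, assuming
# the Sensitivity API as used in tests/test_sensitivity.py above.
import pandas as pd
from sklearn.linear_model import LinearRegression

from causalml.dataset import synthetic_data
from causalml.inference.meta import BaseTLearner
from causalml.metrics.sensitivity import Sensitivity

# synthetic_data() returns outcome, features, treatment, true effect,
# expected outcome, and the true propensity score e.
y, X, treatment, tau, b, e = synthetic_data(mode=1, n=10000, p=5, sigma=1.0)

feature_names = [f"feature_{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)
df["is_treated"] = treatment
df["outcome"] = y
df["pihat"] = e  # use the known propensity score as p_col, as the test does

sens = Sensitivity(
    df=df,
    inference_features=feature_names,
    p_col="pihat",
    treatment_col="is_treated",
    outcome_col="outcome",
    learner=BaseTLearner(LinearRegression()),
)

# Each refutation method re-estimates the ATE under a perturbation of the data
# (placebo treatment, an added random confounder, subsampling) and reports the
# resulting estimate so it can be compared against the original.
summary = sens.sensitivity_analysis(
    methods=["Placebo Treatment", "Random Cause", "Subset Data"],
    sample_size=0.5,
)
print(summary)
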