In [27]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler , OneHotEncoder
from feature_engine.outliers import Winsorizer
import joblib

In [28]:
# Load the drug regulatory classification dataset; the relative path is
# resolved against the notebook's current working directory.
datset = pd.read_csv(
    filepath_or_buffer=r"drug_regulatory_classification_dataset.csv",
)

In [29]:
# Preview the first five rows of the loaded frame.
datset.head(n=5)

Unnamed: 0,Dosage_mg,Price_Per_Unit,Production_Cost,Marketing_Spend,Clinical_Trial_Phase,Side_Effect_Severity_Score,Abuse_Potential_Score,Prescription_Rate,Hospital_Distribution_Percentage,Pharmacy_Distribution_Percentage,...,Manufacturing_Region,Requires_Cold_Storage,OTC_Flag,High_Risk_Substance,Insurance_Coverage_Percentage,Export_Percentage,Online_Sales_Percentage,Brand_Reputation_Score,Doctor_Recommendation_Rate,Target_Regulatory_Class
0,250,364.22,246.49,159132.53,3,4.26,2.52,0.74,57.93,42.07,...,South,No,No,No,64.46,41.88,41.15,6.56,0.47,Non-Regulated Drug
1,500,112.86,73.22,260595.45,2,8.12,1.8,0.48,48.0,52.0,...,East,Yes,Yes,No,93.37,10.3,50.05,4.01,0.44,Non-Regulated Drug
2,200,197.24,100.72,106818.45,3,1.58,1.47,0.93,54.48,45.52,...,North,Yes,Yes,No,64.53,34.21,46.42,5.95,0.16,Non-Regulated Drug
3,500,373.55,264.76,231304.59,4,1.72,1.61,0.54,47.2,52.8,...,North,No,Yes,No,95.21,71.04,49.52,9.65,0.76,Non-Regulated Drug
4,500,353.87,277.29,319403.02,2,9.64,0.45,0.5,86.96,13.04,...,North,No,Yes,No,98.05,70.28,44.9,1.79,0.36,Non-Regulated Drug


In [30]:
# Preview the last five rows of the loaded frame.
datset.tail(n=5)

Unnamed: 0,Dosage_mg,Price_Per_Unit,Production_Cost,Marketing_Spend,Clinical_Trial_Phase,Side_Effect_Severity_Score,Abuse_Potential_Score,Prescription_Rate,Hospital_Distribution_Percentage,Pharmacy_Distribution_Percentage,...,Manufacturing_Region,Requires_Cold_Storage,OTC_Flag,High_Risk_Substance,Insurance_Coverage_Percentage,Export_Percentage,Online_Sales_Percentage,Brand_Reputation_Score,Doctor_Recommendation_Rate,Target_Regulatory_Class
59995,200,260.68,167.61,337160.4,2,6.04,2.79,0.92,67.01,32.99,...,East,Yes,No,No,60.31,73.67,25.44,2.97,0.95,Regulated Drug
59996,200,249.67,139.19,457366.23,2,8.51,1.94,0.86,50.66,49.34,...,South,Yes,No,No,31.87,38.08,56.1,6.75,0.91,Regulated Drug
59997,100,155.15,78.35,367705.08,4,6.45,2.36,0.85,26.03,73.97,...,South,Yes,Yes,No,35.96,19.06,25.1,9.75,0.6,Non-Regulated Drug
59998,500,42.46,19.05,259954.0,1,6.98,2.29,0.16,39.96,60.04,...,North,No,Yes,No,67.17,57.8,28.23,3.0,0.39,Regulated Drug
59999,250,153.99,83.04,267876.28,1,1.51,1.15,0.51,52.75,47.25,...,North,Yes,Yes,No,82.2,10.26,22.85,5.38,0.87,Non-Regulated Drug


In [31]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes (used here to check for missing values and
# to see which columns are numeric).
datset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 30 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Dosage_mg                         60000 non-null  int64  
 1   Price_Per_Unit                    60000 non-null  float64
 2   Production_Cost                   60000 non-null  float64
 3   Marketing_Spend                   60000 non-null  float64
 4   Clinical_Trial_Phase              60000 non-null  int64  
 5   Side_Effect_Severity_Score        60000 non-null  float64
 6   Abuse_Potential_Score             60000 non-null  float64
 7   Prescription_Rate                 60000 non-null  float64
 8   Hospital_Distribution_Percentage  60000 non-null  float64
 9   Pharmacy_Distribution_Percentage  60000 non-null  float64
 10  Annual_Sales_Volume               60000 non-null  float64
 11  Regulatory_Risk_Score             60000 non-null  float64
 12  Appr

In [32]:
# Separate features and target.
# The label is the "Target_Regulatory_Class" column (the last column of the
# frame). It is selected by name rather than by position: column 0 is
# Dosage_mg, a predictor, so a positional split on column 0 would use the
# wrong target and leave the real label inside the feature matrix.
x = datset.drop(columns="Target_Regulatory_Class")
y = datset["Target_Regulatory_Class"]

In [36]:
# Partition the feature columns by dtype so each group can be routed to its
# own preprocessing branch: object-dtype columns are treated as categorical,
# every other column as numeric.
categorical_features = x.select_dtypes(include='object').columns
numeric_features = x.select_dtypes(exclude='object').columns

In [37]:
# Numeric branch: fill missing values with the column mean, cap outliers on
# both tails using the IQR rule (fold=1.5), then apply min-max scaling.
num_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy='mean')),
        ("winsorizer", Winsorizer(tail='both', capping_method='iqr', fold=1.5)),
        ("scale", MinMaxScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode, dropping the first level of each feature.
categ_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy='most_frequent')),
        ("encode", OneHotEncoder(drop='first')),
    ]
)

In [38]:
# Combine both branches into a single preprocessor: the numeric pipeline is
# applied to the numeric columns and the categorical pipeline to the
# object-dtype columns. ColumnTransformer is already imported in the
# notebook's import cell, so it is not re-imported here.
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ('numerical', num_pipeline, numeric_features),
        ('categorical', categ_pipeline, categorical_features),
    ]
)