In [2]:
import pandas as pd
import numpy as np

class Preprocessor:
    """Impute, scale, label-encode and split a CSV dataset.

    The target is the first non-numeric column of the file; every other
    column is treated as an input feature.

    Parameters
    ----------
    nan_handle : str
        ``'remove'`` drops rows containing NaN; ``'mean'`` / ``'median'``
        fill NaNs in numeric columns with the column mean / median.
    scaling : str
        ``'minmax'`` rescales each feature to [0, 1]; ``'standard'``
        subtracts the mean and divides by the sample standard deviation.
    training_fraction : float
        Fraction of rows (taken from the top of the file, in order) that
        goes into the training split.

    Attributes set by ``apply``
    ---------------------------
    processed_data : the imputed, scaled and label-encoded DataFrame.
    train_test : dict with keys ``X_train``, ``X_test``, ``y_train``, ``y_test``.
    """

    # Accepted configuration values, checked at the start of apply().
    _NAN_HANDLES = ('remove', 'mean', 'median')
    _SCALINGS = ('minmax', 'standard')

    def __init__(self,nan_handle,scaling,training_fraction):
        self.nan_handle = nan_handle
        self.scaling = scaling
        self.training_fraction = training_fraction

    def explain(self):
        """Print the configured NaN-handling and scaling strategies."""
        print("NaN values are handled with: ",self.nan_handle)
        print("Features are rescaled with: ",self.scaling)

    def _handle_nans(self, raw):
        """Return a new frame with NaNs dropped or imputed per ``nan_handle``."""
        if self.nan_handle == 'remove':
            return raw.dropna()

        # Same 'number' selector that apply() uses to find the target, so a
        # column counted as a numeric feature is always imputed as well.
        numeric_cols = raw.select_dtypes(include=['number']).columns
        if self.nan_handle == 'mean':
            fill_values = {col: raw[col].mean() for col in numeric_cols}
        else:
            fill_values = {col: raw[col].median() for col in numeric_cols}
        return raw.fillna(value=fill_values)

    def _scale(self, data, feature_cols):
        """Rescale ``feature_cols`` of ``data`` in place per ``scaling``."""
        for col in feature_cols:
            column = data[col]
            if self.scaling == 'minmax':
                offset = column.min()
                spread = column.max() - offset
            else:
                offset = column.mean()
                spread = column.std()

            if spread == 0:
                # A constant column carries no information; map it to 0.0
                # instead of dividing by zero and filling it with NaN.
                data[col] = 0.0
            else:
                data[col] = (column - offset) / spread

    def apply(self,data_file,save_file):
        """Run the full pipeline on ``data_file`` and write it to ``save_file``.

        Steps: read the CSV, handle NaNs, scale every feature column,
        integer-encode the target, save the processed frame as CSV, and
        store an ordered (unshuffled) train/test split on the instance.

        NOTE: scaling statistics are computed on the whole dataset before
        the split, so the training features are scaled using test rows too.

        Raises
        ------
        TypeError
            If ``nan_handle`` or ``scaling`` is not a supported value.
        IndexError
            If the file has no non-numeric column to use as the target.
        """
        # Fail fast on bad configuration before doing any file I/O.
        if self.nan_handle not in self._NAN_HANDLES:
            raise TypeError(f'{self.nan_handle} is an incorrect handle type')
        if self.scaling not in self._SCALINGS:
            raise TypeError(f'{self.scaling} is an incorrect scaling type')

        raw = pd.read_csv(data_file)
        # Explicit copy so the in-place column writes below never touch `raw`.
        data = self._handle_nans(raw).copy()

        non_num_cols = data.select_dtypes(exclude=['number']).columns.tolist()
        if not non_num_cols:
            raise IndexError('no non-numeric column found to use as the label')
        y_col = non_num_cols[0]
        X_cols = [c for c in data.columns if c != y_col]

        self._scale(data, X_cols)

        # Encode the detected target column (not a hard-coded name). Sorting
        # makes the class -> integer mapping reproducible across runs, which
        # iterating a set of strings is not.
        classes = sorted(data[y_col].unique(), key=str)
        values = {label: code for code, label in enumerate(classes)}
        data[y_col] = data[y_col].map(values)

        # The default to_csv also writes the row index as an unnamed first column.
        data.to_csv(save_file)
        self.processed_data = data

        y = data[y_col]
        X = data[X_cols]
        training_size = int(self.training_fraction * len(X))

        # Ordered split: the first `training_size` rows train, the rest test.
        self.train_test = {
            "X_train": X.iloc[:training_size],
            "X_test": X.iloc[training_size:],
            "y_train": y.iloc[:training_size],
            "y_test": y.iloc[training_size:],
        }

    
# Median-impute missing values, min-max scale the features, and keep the
# first 30% of the rows for training.
prep1 = Preprocessor(
    nan_handle='median',
    scaling='minmax',
    training_fraction=0.3,
)
prep1.explain()
prep1.apply(data_file="dataset_1.csv", save_file="dataset_3.csv")
# Preview the first ten rows of the training features.
prep1.train_test["X_train"].head(10)

NaN values are handled with:  median
Features are rescaled with:  minmax


Unnamed: 0,density,vacancy_content,melting_temperature,heat_conductivity,band_gap,crystallinity_index,thermal_expansion_coeff,young_modulus,hardness,lattice_parameter
0,0.374639,0.393798,0.373615,0.499581,1.0,0.092296,0.638277,0.609768,0.298909,0.329876
1,0.950982,0.473642,0.332872,0.746739,0.25,0.060666,0.459386,0.038759,0.492428,0.816754
2,0.732198,0.854963,0.176058,0.562599,0.5,0.60431,0.964702,0.612639,0.126348,0.994419
3,0.598823,0.340138,0.607323,0.083077,1.0,0.966372,0.21902,0.498852,0.180662,0.840781
4,0.156053,0.870073,0.476634,0.185389,0.75,0.502801,0.587978,0.710812,0.492428,0.346089
5,0.156029,0.08813,0.865849,0.513438,1.0,0.051424,0.700356,0.390253,0.242256,0.711253
6,0.058089,0.777171,0.031963,0.257696,1.0,0.221192,0.825737,0.83828,0.255455,0.533898
7,0.866419,0.847959,0.643937,0.565336,0.25,0.244775,0.407053,0.943209,0.45572,0.786529
8,0.60128,0.181864,0.76306,0.503878,1.0,0.337875,0.687065,0.417311,0.509579,0.153015
9,0.708269,0.430529,0.759597,0.202974,0.5,0.04351,0.303261,0.460795,0.308876,0.638124


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

class Classifier:
    """Train one of three scikit-learn classifiers and predict on a test set.

    Parameters
    ----------
    model_name : str
        ``'logistic'``, ``'random_forest'`` or ``'svm'``.
    random_state : int or None, optional
        Seed forwarded to the estimator. The default ``None`` leaves the
        estimator unseeded, so results may differ between runs.
    """

    def __init__(self,model_name,random_state=None):
        self.model_name = model_name
        self.random_state = random_state

    def _build_model(self):
        """Return a fresh, unfitted estimator for ``model_name``.

        Factories are lambdas so that only the requested estimator is built.
        """
        factories = {
            'logistic': lambda: LogisticRegression(
                max_iter=500, random_state=self.random_state
            ),
            'random_forest': lambda: RandomForestClassifier(
                n_estimators=200, random_state=self.random_state
            ),
            'svm': lambda: SVC(probability=True, random_state=self.random_state),
        }
        try:
            factory = factories[self.model_name]
        except (KeyError, TypeError):
            # TypeError covers an unhashable model_name.
            raise ValueError(
                f"{self.model_name} is not a supported classifier"
            ) from None
        return factory()

    def apply(self,X_train, y_train, X_test, y_test):
        """Fit the configured model and predict labels for ``X_test``.

        Inputs may be pandas objects, NumPy arrays or nested lists; they are
        converted with ``np.asarray``. The fitted estimator is stored on
        ``self.model``.

        Returns
        -------
        tuple
            ``(y_pred, y_test)`` where ``y_test`` is the array form of the
            argument, returned so it can be passed straight to an evaluator.

        Raises
        ------
        ValueError
            If ``model_name`` is not a supported classifier.
        """
        # Reject an unknown model before converting or fitting anything.
        model = self._build_model()

        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        X_test = np.asarray(X_test)
        y_test = np.asarray(y_test)

        self.model = model
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        print(f"Model '{self.model_name}' training finished.")
        return y_pred,y_test
 
        
# Fit a random forest on the training split and predict the held-out rows.
clf = Classifier(model_name='random_forest')
y_pred, y_test = clf.apply(
    X_train=prep1.train_test["X_train"],
    y_train=prep1.train_test["y_train"],
    X_test=prep1.train_test["X_test"],
    y_test=prep1.train_test["y_test"],
)

Model 'random_forest' training finished.


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    explained_variance_score,
    max_error,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)

class Evaluator:
    """Score predictions with one named metric from ``sklearn.metrics``.

    ``metrics`` is a short key: ``'evs'``, ``'me'``, ``'mae'``, ``'mse'``,
    ``'meae'`` or ``'r2'``.
    """

    def __init__(self,metrics):
        # Key of the metric to use; it is only looked up when apply() runs.
        self.metrics=metrics

    def explain(self):
        """Print the key of the configured metric."""
        print("Model performance is tested with: ",self.metrics)

    def apply(self,test_data, pred_data):
        """Return the configured metric for ``test_data`` vs ``pred_data``.

        Arguments are forwarded as ``(test_data, pred_data)`` to the selected
        scoring function. An unknown key raises ``KeyError``.
        """
        scorers = dict(
            evs=explained_variance_score,
            me=max_error,
            mae=mean_absolute_error,
            mse=mean_squared_error,
            meae=median_absolute_error,
            r2=r2_score,
        )
        return scorers[self.metrics](test_data, pred_data)
        
eva1 = Evaluator(metrics='meae')
eva1.explain()
# Compute the metric once; a bare mid-cell call would discard its result.
# NOTE(review): with integer-encoded class labels, the median absolute error
# is 0.0 whenever more than half of the predictions are correct, so it says
# little about classifier quality — confirm this is the intended metric.
print(eva1.apply(y_test,y_pred))

Model performance is tested with:  meae
0.0
