In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib

data = pd.read_csv('prediction_model/datasets/train.csv')

# Example preprocessing and feature engineering steps
X = data.drop('L', axis=1)  # Replace 'target_column' with your target variable
y = data['target_column']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print(f'Model Accuracy: {accuracy}')

model_filename = 'classification_v2.pkl'
joblib.dump(model, model_filename)



KeyError: "['target_column'] not found in axis"

In [3]:
# Imprt libraries
import pathlib 
import os
import prediction_model 

PACKAGE_ROOT = pathlib.Path(prediction_model.__file__).resolve().parent

DATAPATH = os.path.join(PACKAGE_ROOT,'datasets')
SAVED_MODEL_PATH = os.path.join(PACKAGE_ROOT,'trained_models')

TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

TARGET = 'Loan_Status'

#Features to keep
FEATURES=['Gender','Married','Dependents',
    'Education','Self_Employed','ApplicantIncome',
    'CoapplicantIncome','LoanAmount','Loan_Amount_Term',
    'Credit_History','Property_Area'] # Final feature to keep in data

NUMERICAL_FEATURES=['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term'] 

CATEGORICAL_FEATURES=['Gender','Married','Dependents',
'Education','Self_Employed','Credit_History','Property_Area'] #Categorical

FEATURES_TO_ENCODE=['Gender','Married','Dependents',
'Education','Self_Employed','Credit_History','Property_Area'] #Features to Encode

TEMPORAL_FEATURES=['ApplicantIncome']
TEMPORAL_ADDITION='CoapplicantIncome'
LOG_FEATURES=['ApplicantIncome', 'LoanAmount'] #Features for Log Transformation
DROP_FEATURES=['CoapplicantIncome'] #Features to Drop

In [4]:
# Import libraries
import os 
import pandas as pd
import joblib

# Import other files/modules 
from prediction_model.config import config
# Imports all the LOCAL and GLOBAL paths and variables 

def load_dataset(file_name):
    '''Read data'''
    file_path = os.path.join(config.DATAPATH,file_name) # DATAPATH is dataset dir 
    _data = pd.read_csv(file_path)
    return _data 

def save_pipeline(pipeline_to_save):
    """ Store output of pipeline 
        Exporting pickle file of trained model
    """
    save_file_name = 'classification_v1.pkl'
    save_path = os.path.join(config.SAVED_MODEL_PATH,save_file_name)
    joblib.dump(pipeline_to_save,save_path)
    print("Saved pipeline :",save_file_name)
    
def load_pipeline(pipeline_to_load):
    '''Importing the pickle file'''
    save_path = os.path.join(config.SAVED_MODEL_PATH,pipeline_to_load)
    trained_model = joblib.load(save_path)
    return trained_model
    
    
    


In [5]:
#Import libraries
import numpy as np 
import pandas as pd 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

# Import other files/modules 
from prediction_model.config import config

# Numeric Imputer
class NumericalImputer(BaseEstimator,TransformerMixin):
    "Numerical Data Missing Value Imputer"
    def __init__(self,variables=None):
        self.variables = variables
    
    def fit(self,X,y=None):
        self.imputer_dict = {}
        for feature in self.variables:
            self.imputer_dict[feature] = X[feature].mean()
        return self
    
    def transform(self,X):
        X = X.copy()
        for feature in self.variables:
            X[feature].fillna(self.imputer_dict[feature],inplace=True)
        return X
    
    
#Categorical Imputer
class CategoricalImputer(BaseEstimator,TransformerMixin):
    """Categorical Data Missing Value Imputer"""
    def __init__(self, variables=None):
        self.variables = variables
    
    def fit(self, X,y=None):
        self.imputer_dict_={}
        for feature in self.variables:
            self.imputer_dict_[feature] = X[feature].mode()[0]
        return self

    def transform(self, X):
        X=X.copy()
        for feature in self.variables:
            X[feature].fillna(self.imputer_dict_[feature],inplace=True)
        return X
    
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Categorical Data Encoder"""
    
    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y):
        self.encoder_dict_ = {}
        for var in self.variables:
            t = X[var].value_counts().sort_values(ascending=True).index
            self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}
        return self

    def transform(self, X):
        X = X.copy()
        for feature in self.variables:
            X[feature] = X[feature].map(self.encoder_dict_[feature])
        return X


class TemporalVariableEstimator(BaseEstimator, TransformerMixin):
    """Feature Engineering"""
    
    def __init__(self, variables=None, reference_variable=None):
        self.variables = variables
        self.reference_variable = reference_variable

    def fit(self, X, y=None):
        # No need to put anything, needed for Sklearn Pipeline
        return self

    def transform(self, X):
        X = X.copy()
        for var in self.variables:
            X[var] = X[var] + X[self.reference_variable]
        return X


class LogTransformation(BaseEstimator, TransformerMixin):
    """Transforming variables using Log Transformations"""
    
    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y):
        return self
    #Need to check in advance if the features are <= 0
    #If yes, needs to be transformed properly (E.g., np.log1p(X[var]))
    def transform(self, X):
        X = X.copy()
        for var in self.variables:
            X[var] = np.log(X[var])
        return X


class DropFeatures(BaseEstimator, TransformerMixin):
    """Dropping Features Which Are Less Significant"""
    
    def __init__(self, variables_to_drop=None):
        self.variables_to_drop = variables_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X = X.drop(self.variables_to_drop, axis=1)
        return X

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# Import other files/modules
from prediction_model.config import config
import prediction_model.processing.preprocessors as pp

loan_pipe = Pipeline([
    ('Numerical Imputer', pp.NumericalImputer(variables=config.NUMERICAL_FEATURES)),
    ('Categorical Imputer', pp.CategoricalImputer(variables=config.CATEGORICAL_FEATURES)),
    ('Temporal Features', pp.TemporalVariableEstimator(variables=config.TEMPORAL_FEATURES, reference_variable=config.TEMPORAL_ADDITION)),
    ('Categorical Encoder', pp.CategoricalEncoder(variables=config.FEATURES_TO_ENCODE)),
    ('Log Transform', pp.LogTransformation(variables=config.LOG_FEATURES)),
    ('Drop Features', pp.DropFeatures(variables_to_drop=config.DROP_FEATURES)),
    ('Scaler Transform', MinMaxScaler()),
    ('Linear Model', LogisticRegression(random_state=1))
])


In [11]:
import pandas as pd
import numpy as np
import joblib

# Import other files/modules
from prediction_model.config import config
from prediction_model.processing.data_management import load_pipeline

pipeline_file_name = 'classification_v1.pkl'

_loan_pipe = load_pipeline(pipeline_file_name)

def _make_prediction(input_data):
    '''Predict the output'''
    
    #Read Data 
    data = pd.DataFrame(input_data)
    
    # Prediction 
    prediction = _loan_pipe.predict(data[config.FEATURES])
    output = np.where(prediction==1,'Y','N').tolist()
    results = {'prediction':output}
    return results

In [10]:
# Import Libraries
import numpy as np 
import pandas as pd

# Import other files and modules
from prediction_model.config import config
from prediction_model.processing.data_management import load_dataset, save_pipeline
import prediction_model.processing.preprocessors as pp
import prediction_model.pipeline as pl
# from prediction_model.predict import _make_prediction

def run_training():
    '''Train the model'''
    train = load_dataset(config.TRAIN_FILE)
    
    # Separating Loan_Status in y
    y = train[config.TARGET].map({'N':0,'Y':1})
    pl.loan_pipe.fit(train[config.FEATURES],y)
    save_pipeline(pipeline_to_save=pl.loan_pipe)

if __name__=='__main__':
    run_training()
    


Saved pipeline : classification_v1.pkl


In [13]:
import prediction_model
from prediction_model import train_pipeline
from prediction_model.predict import make_prediction
import pandas as pd

train_pipeline.run_training() # Save the pickle object of the trained model

test_data = pd.read_csv("prediction_model/datasets/test.csv") # Load the data
result = make_prediction(test_data[0:1]) # Make prediction on the first row
print(result)


Saved pipeline : classification_v1.pkl
{'prediction': ['Y']}
