# Activate Auto Reload in Jupyter Notebook

In [1]:
%load_ext autoreload
%autoreload 2

**Library Import**

In [2]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib
import numpy as np
from sklearn.metrics import mean_squared_log_error

In [3]:
def data_path(main_path):
    """Read the project's training and inference CSV files.

    The project root is the ``dsa-dsp-Vadrama-Ndisang-Ngyibi`` folder located
    inside the grandparent of the current working directory.

    Args:
        main_path: Folder, relative to the project root, that holds
            ``train.csv`` and ``test.csv``.

    Returns:
        A tuple ``(df_master, inference_df, ROOT_DIR)``: the training frame,
        the inference frame (both indexed by the ``Id`` column) and the
        project root as a ``Path``.
    """
    grandparent_dir = Path('.').resolve().parents[1].absolute()
    project_root = grandparent_dir / 'dsa-dsp-Vadrama-Ndisang-Ngyibi'
    csv_folder = project_root / main_path

    # Same reader settings for both files: the `Id` column becomes the index.
    train_frame, test_frame = (
        pd.read_csv(csv_folder / csv_name, index_col='Id')
        for csv_name in ('train.csv', 'test.csv')
    )

    return train_frame, test_frame, project_root

In [4]:
def training_data_preprocessing(df_master):
    """Build the model-ready training frame.

    Numeric columns are kept for the rows that have no missing numeric value;
    every other column is passed through ``pd.get_dummies`` with its default
    settings. Both parts are joined column-wise on the surviving rows.

    Args:
        df_master: Raw training frame.

    Returns:
        A tuple ``(frame, column_names)`` with the combined frame and the list
        of its column names.
    """
    # Continuous data: drop every row with a missing numeric value
    numeric_part = df_master.select_dtypes(include='number').dropna()

    # Categorical data: one-hot encode using the pandas defaults
    dummies = pd.get_dummies(df_master.select_dtypes(exclude='number'))

    # Keep only the encoded rows whose index labels survived the dropna above
    dummies = dummies.loc[numeric_part.index.tolist()]

    combined = pd.concat([numeric_part, dummies], axis=1)
    return combined, list(combined.columns)

In [5]:
def testing_data_preprocessing(inference_df):
    # Handling Continous Data
    continuous_inference_df = inference_df.select_dtypes(include='number')
    continuous_inference_df = continuous_inference_df.dropna()
    
    # Handling Categorical Data
    categorical_inference_df = inference_df.select_dtypes(exclude='number')
    one_hot_encoder_get_dummies_categorical_inference = pd.get_dummies(categorical_inference_df, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
    

    # Get the indexes of the continuous variables and slice the corresponding rows from the cat Vars
    continuous_inference_indexes = list(continuous_inference_df.index)
    one_hot_encoder_get_dummies_categorical_inference = one_hot_encoder_get_dummies_categorical_inference.loc[continuous_inference_indexes]

    
    # Joining the Continuous and Categorical variables together
    df_final_inference_one_hot_encoder = pd.concat([continuous_inference_df,one_hot_encoder_get_dummies_categorical_inference] , axis=1)
    
    
    # Getting column names
    col_names_inference = list(df_final_inference_one_hot_encoder.columns)
    
    return df_final_inference_one_hot_encoder, col_names_inference

In [6]:
def col_names(col_names_train, col_names_inference):
    """Return the column names present in both the training and inference sets.

    The result follows the order of ``col_names_train`` so that the feature
    order is the same on every run. Building the list from a set intersection
    would yield the names in hash order, which changes between Python
    processes.

    Args:
        col_names_train: Column names of the training frame.
        col_names_inference: Column names of the inference frame.

    Returns:
        A list of the shared names without duplicates, in training order;
        an empty list when nothing is shared.
    """
    inference_lookup = set(col_names_inference)
    # dict.fromkeys removes repeated names while keeping first-seen order
    return [
        name for name in dict.fromkeys(col_names_train)
        if name in inference_lookup
    ]

In [7]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Root mean squared logarithmic error between targets and predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        precision: Number of decimals kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    squared_log_error = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(squared_log_error), precision)

In [8]:
# Read the train/inference frames from the `data` folder and point
# `model_path` at the `models` folder under the project root.
df_master, inference_df, base_path = data_path('data')
model_path = base_path.joinpath('models')

def training(df_master, inference_df, model_path, test_size=0.2, random_state=42):
    """Train a linear regression on the one-hot encoded data and save it.

    Only the features available in both the training and inference frames are
    used. The fitted model is written to ``car_price_Model__V1.pkl`` inside
    ``model_path`` and scored on a hold-out split.

    Args:
        df_master: Raw training frame; must contain the ``SalePrice`` column.
        inference_df: Raw inference frame, used only to find the shared columns.
        model_path: Folder where the model file is written. It is created
            when it does not exist yet.
        test_size: Fraction of the rows held out for scoring.
        random_state: Seed of the train/test split, fixed by default so the
            reported score is reproducible. Pass ``None`` for an unseeded split.

    Returns:
        A dict with ``"model_performance"`` (RMSLE on the hold-out rows) and
        ``"model_path"`` (the model folder as a string).
    """
    target_column = 'SalePrice'

    df_final_one_hot_encoder, col_names_train = training_data_preprocessing(df_master)
    # Only the inference column names are needed here, not the frame itself.
    _, col_names_inference = testing_data_preprocessing(inference_df)

    # The target must never be used as a feature, even if the inference
    # frame happens to carry a column of the same name.
    col_names_lst = [
        name for name in col_names(col_names_train, col_names_inference)
        if name != target_column
    ]

    # Splitting dependent (y) and independent (X) variables
    X_one_hot = df_final_one_hot_encoder[col_names_lst]
    y_one_hot = df_final_one_hot_encoder[target_column]

    # Splitting data for training and testing
    X_train_one_hot, X_test_one_hot, y_train_one_hot, y_test_one_hot = train_test_split(
        X_one_hot, y_one_hot, test_size=test_size, random_state=random_state
    )

    # Model training
    model_one_hot = LinearRegression()
    model_one_hot.fit(X_train_one_hot, y_train_one_hot)

    # Model saving: joblib.dump does not create missing folders
    model_dir = Path(model_path)
    model_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(model_one_hot, model_dir / "car_price_Model__V1.pkl")

    # Model performance: RMSLE is undefined for negative values, so negative
    # predictions are replaced with 0 before scoring.
    y_pred_one_hot = model_one_hot.predict(X_test_one_hot)
    y_pred_one_hot = np.where(y_pred_one_hot < 0, 0, y_pred_one_hot)
    model_pef = compute_rmsle(y_test_one_hot, y_pred_one_hot)

    return {"model_performance": model_pef, "model_path": str(model_path)}

In [9]:
# Load the data again so this cell does not rely on earlier kernel state,
# then train; the returned summary dict is shown as the cell output.
df_master, inference_df, base_path = data_path('data')
model_path = base_path.joinpath('models')

training(df_master=df_master, inference_df=inference_df, model_path=model_path)

{'model_performance': 0.83,
 'model_path': 'E:\\My Stuff\\Documents\\EPITA\\DSA  Spring Semester\\DSP\\dsa-dsp-Vadrama-Ndisang-Ngyibi\\models'}

In [12]:
# Frames and model folder used by the prediction step defined below.
df_master, inference_df, base_path = data_path('data')
model_path = base_path.joinpath('models')

def make_predictions(inference_df, model_path):
    """Predict the target for the rows of ``inference_df`` with the saved model.

    The frame goes through ``testing_data_preprocessing``, so rows with a
    missing numeric value are dropped and receive no prediction.

    Args:
        inference_df: Raw inference frame.
        model_path: Folder that contains ``car_price_Model__V1.pkl``.

    Returns:
        A ``pd.Series`` named ``SalePrice`` holding one prediction per kept
        row, indexed like the preprocessed frame.

    Raises:
        KeyError: If a column the model was fitted on is missing from the
            preprocessed inference frame.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute code;
    # only load model files that come from a trusted source.
    joblib_LR_model = joblib.load(Path(model_path) / "car_price_Model__V1.pkl")

    inference_features, _ = testing_data_preprocessing(inference_df)

    # Feed exactly the columns, in the order, the model was fitted on. The
    # attribute is only set when the model was fitted on a DataFrame.
    fitted_columns = getattr(joblib_LR_model, 'feature_names_in_', None)
    if fitted_columns is not None:
        inference_features = inference_features[list(fitted_columns)]

    predictions = joblib_LR_model.predict(inference_features)
    return pd.Series(predictions, index=inference_features.index, name='SalePrice')

In [10]:
def data_preprocessing(df_master, inference_df):    
    """Re-read ``train.csv`` and ``test.csv`` from the data folder.

    NOTE(review): this function looks unfinished. In the code visible here it
    returns nothing, and both arguments are overwritten by freshly read frames
    before they are ever used.
    """
    # Grandparent of the current working directory.
    ROOT_DIR = Path('.').resolve().parents[1].absolute()
    # NOTE(review): exact repeat of the previous statement; it has no effect.
    ROOT_DIR = Path('.').resolve().parents[1].absolute()
#     DATA_DIR = ROOT_DIR / 'dsa-dsp-Vadrama-Ndisang-Ngyibi' / 'data'
    # NOTE(review): `main_path` is neither a parameter nor assigned in this
    # function, so this line raises NameError unless a global of that name
    # exists in the kernel.
    DATA_DIR = ROOT_DIR / main_path

    # Assigned but not used in the visible part of the function.
    target_column = 'SalePrice'
    # The `Id` column of each CSV becomes the index. These assignments shadow
    # the two parameters of the same name.
    df_master = pd.read_csv(DATA_DIR / 'train.csv', index_col='Id')
    inference_df = pd.read_csv(DATA_DIR / 'test.csv', index_col='Id')
    
    
    