## Model Training

In [2]:
import pandas as pd
# Load the raw gemstone table and discard the 'id' column in the same expression
# (assumes 'id' is a plain row identifier with no predictive value — TODO confirm).
df2 = pd.read_csv('data/gemstone.csv').drop(columns='id')
df2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
## Split into the dependent (target) variable and the independent features
y = df2['price']
X = df2.drop(columns='price')
print(X.shape)
print(y.shape)

(193573, 9)
(193573,)


In [4]:
# Preview the feature frame; 'price' should no longer be among its columns.
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [5]:
# Preview the target Series (the 'price' column).
y.head()

0    13619
1    13387
2     2772
3      666
4    14453
Name: price, dtype: int64

In [6]:
## Separate the categorical (object-dtype) column names from the numerical ones
is_object_column = X.dtypes == object
categorical_columns2 = X.columns[is_object_column].to_list()
numerical_columns2 = X.columns[~is_object_column].to_list()
print(categorical_columns2,"\n", numerical_columns2)

['cut', 'color', 'clarity'] 
 ['carat', 'depth', 'table', 'x', 'y', 'z']


In [7]:
# Define the custom ranking for each ordinal variable. These lists are passed to
# OrdinalEncoder(categories=...), which encodes each value as its position in the
# list, so the order written here decides the encoded integer.
# NOTE(review): `cut` and `clarity` appear to run from lowest to highest grade,
# whereas `color` runs D -> J, which in standard diamond grading is best -> worst.
# Confirm the opposite direction for `color` is intended (it only flips the sign
# of that encoded feature).
cut_category = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
clarity_category = ['I1' , 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
color_category = ['D', 'E', 'F', 'G', 'H', 'I', 'J']

In [8]:
from sklearn.impute import SimpleImputer  ## Handling missing values
from sklearn.preprocessing import StandardScaler ## handling feature scaling
from sklearn.preprocessing import OrdinalEncoder ## handling orinal encoding

## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
## Numerical pipeline: fill missing values with the column median, then standardise
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Categorical pipeline: fill missing values with the most frequent level,
## encode with the custom orderings, then standardise the encoded integers.
## The `categories` lists are matched to columns by position, so they must stay
## in the same order as categorical_columns2 (cut, color, clarity).
ordinal_encoder = OrdinalEncoder(
    categories=[cut_category, color_category, clarity_category]
)
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ordinal_encoder),
    ('scaler', StandardScaler()),
])

In [10]:
## Combine both pipelines: each column group is routed through its own pipeline
preprocess = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns2),
        ('cat_pipeline', cat_pipeline, categorical_columns2),
    ]
)

In [11]:
# Display the (not yet fitted) ColumnTransformer to check its structure.
preprocess

In [12]:
## Train/test split: hold out 30% of the rows, with a fixed seed so the split is repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=30,
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(135501, 9) (58072, 9)
(135501,) (58072,)


In [13]:
## Transforming our data: fit the preprocessor on the training split only,
## then apply the already-fitted transformer to the test split.
## NOTE: this overwrites X_train / X_test with the transformed frames, so the cell
## cannot be re-run without first re-running the train/test split cell. The new
## frames also get a fresh 0..n-1 index rather than the original row labels.
train_matrix = preprocess.fit_transform(X_train)
feature_names = preprocess.get_feature_names_out()
X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(preprocess.transform(X_test), columns=feature_names)

In [14]:
# Preview the transformed training features; column names now carry the
# `<transformer>__<column>` prefix produced by get_feature_names_out().
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,-0.132136,0.296826,0.01972
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.874076,2.14467,1.352731


In [15]:
# Preview the transformed test features (same columns as the training frame).
X_test.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.564688,-0.942132,-0.642862,-0.429765,-0.464061,-0.500036,-0.132136,-0.935071,0.01972
1,-0.175556,1.000906,-0.121531,-0.042137,-0.028595,0.036132,-1.138347,0.912774,-0.646786
2,-1.061913,0.260701,-0.121531,-1.30418,-1.298703,-1.26806,0.874076,0.912774,2.685743
3,0.970223,-0.201927,1.963794,1.048629,0.996563,0.978049,-0.132136,0.296826,0.01972
4,-0.932202,-1.312235,0.3998,-1.006699,-0.990248,-1.065186,-0.132136,-0.935071,0.686225


In [16]:
## Model Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def model_evaluation(actual, predicted):
    """Score a set of predictions against the true target values.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned position-by-position with ``actual``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and the R^2 score.
    """
    abs_error = mean_absolute_error(actual, predicted)
    root_sq_error = np.sqrt(mean_squared_error(actual, predicted))
    fit_quality = r2_score(actual, predicted)
    return abs_error, root_sq_error, fit_quality

In [17]:
## defining all the Models

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

# Seed shared by every estimator that has a stochastic component, so the reported
# metrics are reproducible from one run to the next (same value as the seed used
# for the train/test split).
MODEL_SEED = 30

# Candidate regressors keyed by the display name used in the evaluation report.
# The linear models are left at their library defaults.
models = {
    'Linear_Regression' : LinearRegression(),
    'Lasso' : Lasso(),
    'Ridge' : Ridge(),
    'Elastic_Net' : ElasticNet(),
    'Decision_Tree' : DecisionTreeRegressor(random_state=MODEL_SEED),
    'Random_Forest' : RandomForestRegressor(random_state=MODEL_SEED),
    'Gradient_Boosting': GradientBoostingRegressor(random_state=MODEL_SEED),
    'Ada_Boost' : AdaBoostRegressor(random_state=MODEL_SEED),
    'XGboost' : XGBRegressor(random_state=MODEL_SEED)
}

In [18]:
## Model Pipeline

def Model_Pipeline(X_train, X_test, y_train, y_test, model):
    """Fit every estimator in ``model`` and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to each estimator's ``fit``.
    X_test, y_test
        Held-out features and target; predictions made on ``X_test`` are
        compared with ``y_test`` by ``model_evaluation``.
    model : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.

    Returns
    -------
    dict
        ``{name: [mae, rmse, r2]}``, in the iteration order of ``model``.
        An empty ``model`` yields an empty report.
    """
    report = {}
    # Walk the name/estimator pairs directly instead of rebuilding
    # list(model.keys()) and list(model.values()) on every iteration.
    for name, estimator in model.items():
        # Train the model
        estimator.fit(X_train, y_train)

        # Predict on the held-out features
        predictions = estimator.predict(X_test)

        # Evaluate the predictions against the held-out target
        mae, rmse, r2 = model_evaluation(y_test, predictions)
        report[name] = [mae, rmse, r2]

    return report


In [1]:
## Train and score every candidate model, then print one block of metrics per model.
## This cell needs the cells that define `Model_Pipeline`, `models` and the
## transformed train/test split to have been run first in the same kernel.
model_pred = Model_Pipeline(X_train, X_test, y_train, y_test, models)
# Named `metric_names` rather than `eval` so the builtin eval() is not shadowed.
metric_names = ['MAE', 'RMSE', 'R2_Score']
for model_name, scores in model_pred.items():
    print(model_name)
    # Scores are stored in the same order as metric_names: [mae, rmse, r2].
    for metric_name, score in zip(metric_names, scores):
        print(metric_name, " : ", score)
    print('='*35)
    print('\n')

NameError: name 'Model_Pipeline' is not defined

In [2]:
# Small throwaway dict used to try out a value -> key lookup.
# NOTE(review): appears to be scratch exploration unrelated to model training.
dc = {'a':1,'b':2,'c':3,'d':4}

In [11]:
# Reverse lookup: find the position of the value 2 among the dict's values and
# return the key at that same position (raises ValueError if 2 is not a value).
list(dc.keys())[list(dc.values()).index(2)]

'b'