## Assignment3 Car Price Prediction (Cont.)


In [1]:
#import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
import time

## load data

In [2]:
# Read the raw car listings; the path is relative to the notebook's working directory.
df = pd.read_csv("Cars.csv")

In [3]:
# Preview the first rows of the raw table to see the columns and their raw string formats
# (e.g. "23.4 kmpl", "1248 CC", "74 bhp").
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [4]:
# Check the shape of the data as (number of rows, number of columns).
df.shape

(8128, 13)

This means the table consists of 8128 samples and 13 columns: 12 features plus the target column `selling_price`.

## Exploratory Data Analysis

I rearranged the data according to the requirements of Chaky's company.

In [5]:
# Clean the raw listings and encode the categorical columns.
# The order of the steps matters: CNG/LPG rows are removed before the unit suffixes are
# stripped, and each categorical column is encoded exactly once.

# For the feature owner, map First Owner to 1, ..., Test Drive Car to 5.
owner_codes = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 5,
}
df['owner'] = df['owner'].map(owner_codes)

# For the feature fuel, remove all rows with CNG and LPG.
# .copy() gives an independent frame, so the column assignments below do not write into a
# slice of the original DataFrame (which raises SettingWithCopyWarning).
df = df[~df["fuel"].isin(["CNG", "LPG"])].copy()

# For mileage, engine and max_power, remove the unit suffix and convert the column to float.
unit_suffixes = {'mileage': 'kmpl', 'engine': 'CC', 'max_power': 'bhp'}
for column, unit in unit_suffixes.items():
    df[column] = df[column].str.replace(unit, '', regex=False).astype(float)

# For the feature brand, take only the first word of the name and remove the rest.
df['name'] = df['name'].str.split().str.get(0)

# Drop the feature torque since Chaky's company does not understand this kind of information.
df = df.drop(columns=["torque"])

# Delete all samples related to Test Drive Cars (owner code 5): they are too expensive,
# so Chaky's company is not interested in them.
df = df.loc[df["owner"] != 5].copy()

# Convert the categorical features into numbers with label encoding.
#   fuel:         Diesel, Petrol
#   seller_type:  Individual, Dealer, Trustmark Dealer
#   transmission: Manual, Automatic
#   name:         car brand
# One encoder is kept per column so that the integer codes can be mapped back to the
# original categories later with encoders[column].inverse_transform(...).
encoders = {}
for column in ['fuel', 'seller_type', 'transmission', 'name']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le

In [6]:
# Summary statistics of selling_price; the min, quartiles and max shown here are what the
# bucketing step further down uses as bin edges.
df['selling_price'].describe()

count    8.028000e+03
mean     6.403937e+05
std      8.027015e+05
min      2.999900e+04
25%      2.600000e+05
50%      4.500000e+05
75%      6.800000e+05
max      1.000000e+07
Name: selling_price, dtype: float64

In [7]:
# Turn the continuous selling price into four ordered classes 0, 1, 2, 3.
# The edges follow the min / 25% / 50% / 75% / max of selling_price; the first edge sits just
# below the minimum because pd.cut intervals exclude their left edge by default.
price_bin_edges = [29998, 260000, 450000, 680000, 10000000]
price_bin_labels = ['0', '1', '2', '3']
df['selling_price'] = pd.cut(df['selling_price'], bins=price_bin_edges, labels=price_bin_labels)

In [8]:
# Number of samples in each price class, to check how balanced the classes are.
df['selling_price'].value_counts()

0    2050
1    2044
3    1991
2    1943
Name: selling_price, dtype: int64

In [9]:
# Display the cleaned and encoded DataFrame to verify the transformations above.
df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,20,2014,1,145500,0,1,1,1,23.40,1248.0,74.00,5.0
1,27,2014,1,120000,0,1,1,2,21.14,1498.0,103.52,5.0
2,10,2006,0,140000,1,1,1,3,17.70,1497.0,78.00,5.0
3,11,2010,0,127000,0,1,1,1,23.00,1396.0,90.00,5.0
4,20,2007,0,120000,1,1,1,1,16.10,1298.0,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,11,2013,1,110000,1,1,1,1,18.50,1197.0,82.85,5.0
8124,11,2007,0,119000,0,1,1,4,16.80,1493.0,110.00,5.0
8125,20,2009,1,120000,0,1,1,1,19.30,1248.0,73.90,5.0
8126,28,2013,1,25000,0,1,1,1,23.57,1396.0,70.00,5.0


In [10]:
#(disabled) log-transform of the selling price, which was used because large numbers can make predictions unstable; it no longer applies now that the price has been bucketed into the classes 0-3
#df['selling_price'] = np.log(df["selling_price"])

In [11]:
# The bucket labels were created as the strings '0'..'3'; convert them to int class ids.
df['selling_price'] = df['selling_price'].astype(int)

### Train test split

In [12]:
# X holds the features chosen as predictors.
feature_columns = ['max_power', 'mileage', 'year']
X = df.loc[:, feature_columns]

# y is the selling-price class that the model should predict.
y = df.loc[:, "selling_price"]

In [13]:
# Hold out 30% of the samples as the test set; random_state=42 makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# From here on the labels are handled as plain NumPy arrays.
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()



## Preprocessing

### Null values

In [14]:
# Count the missing values per feature in the training split (before imputation).
X_train[['max_power', 'mileage',  'year']].isna().sum()


max_power    149
mileage      154
year           0
dtype: int64

In [15]:
# Count the missing values per feature in the test split (before imputation).
X_test[['max_power', 'mileage',  'year']].isna().sum()

max_power    59
mileage      60
year          0
dtype: int64

In [16]:
# Impute missing values using statistics computed on the TRAINING split only, so that no
# information from the test split leaks into preprocessing:
#   max_power -> training median, mileage -> training mean.
imputation_values = {
    'max_power': X_train['max_power'].median(),
    'mileage': X_train['mileage'].mean(),
}

# DataFrame.fillna with a per-column dict returns a new frame. The previous form,
# X_train[col].fillna(..., inplace=True), modified a temporary column object; that pattern is
# deprecated and silently does nothing when pandas Copy-on-Write is enabled.
X_train = X_train.fillna(imputation_values)

# Fill the test set with the training distribution.
X_test = X_test.fillna(imputation_values)


In [17]:
# Re-count the missing values in the training split; every count should now be 0.
X_train[['max_power', 'mileage', 'year']].isna().sum()

max_power    0
mileage      0
year         0
dtype: int64

In [18]:
# Re-count the missing values in the test split; every count should now be 0.
X_test[['max_power', 'mileage', 'year']].isna().sum()

max_power    0
mileage      0
year         0
dtype: int64

In [19]:
# Standardise the features. The scaler is fitted on the training split only and the same
# transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaled_columns = ['year','mileage','max_power']
scaler = StandardScaler().fit(X_train[scaled_columns])
X_train[scaled_columns] = scaler.transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

## Modeling



I use MLflow to track the experiments.

In [None]:
# Experiment tracking with MLflow.
import mlflow
import os
# Point the MLflow client at the tracking server that stores this notebook's runs.
# NOTE(review): this is a full HTTPS address, not the docker-compose service name `mlflow`
# (which would resolve to a sibling container) -- confirm which deployment is intended.
mlflow.set_tracking_uri("https://mlflow.cs.ait.ac.th/")
# LOGNAME is set so that runs are attributed to this user name; presumably the default would
# be `root` when the notebook runs inside a docker container -- confirm for your environment.
os.environ["LOGNAME"] = "thamakorn"
#mlflow.create_experiment(name="a3_assignment")  #create if you haven't create
# Select the experiment that all subsequent runs are logged under.
mlflow.set_experiment(experiment_name="st124481-a3")

In [21]:
# Prepend a column of ones to each design matrix so that the first weight acts as the bias.
# NOTE: running this cell twice adds a second intercept column.
intercept = np.ones((len(X_train), 1))
X_train   = np.hstack((intercept, X_train))
intercept = np.ones((len(X_test), 1))
X_test    = np.hstack((intercept, X_test))

In [22]:
# Problem dimensions: k classes, m training samples, n columns (features plus intercept).
k = len(set(y))
m, n = X_train.shape

# One-hot encode the training labels: row i has a 1 in the column of its class id.
Y_train_encoded = np.zeros((m, k))
for each_class in range(k):
    Y_train_encoded[y_train == each_class, each_class] = 1

In [23]:
from sklearn.model_selection import KFold
class LogisticRegression:
    """Multinomial (softmax) logistic regression trained with gradient descent.

    The model keeps a weight matrix ``W`` of shape (n, k) and predicts
    ``softmax(X @ W)``. ``fit`` runs k-fold cross-validation, training a fresh
    ``W`` on every fold and logging parameters and validation losses to MLflow.

    Notes:
        * After ``fit`` the instance holds the weights (and training losses) of the
          LAST fold only; the model is not refitted on the full data.
        * The reported loss is the mean cross-entropy without the penalty term, while
          the gradient is the un-averaged sum over the batch plus the penalty gradient.
    """

    kfold = KFold(n_splits=3)

    # Lower bound for probabilities passed to log(); keeps the loss finite when a softmax
    # output underflows to exactly 0 (0 * log(0) would otherwise produce nan).
    _EPS = 1e-15
    _METHODS = ("batch", "minibatch", "sto")

    def __init__(self,regulaization, k, n, method, alpha = 0.001, max_iter=5000, cv=kfold):
        """Store the hyperparameters.

        Args:
            regulaization: penalty object exposing ``derivation(W)`` (gradient of the penalty).
            k: number of classes.
            n: number of columns of X (features plus intercept).
            method: "batch", "minibatch" or "sto".
            alpha: learning rate.
            max_iter: number of gradient steps per fold.
            cv: splitter exposing ``split(X)`` that yields (train_idx, val_idx) pairs.
        """
        self.regularization = regulaization
        self.k = k
        self.n = n
        self.alpha = alpha
        self.max_iter = max_iter
        self.method = method
        self.cv =cv

    def _cross_entropy(self, h, Y):
        """Mean cross-entropy between predicted probabilities h (m, k) and one-hot Y."""
        return - np.sum(Y * np.log(np.clip(h, self._EPS, 1.0))) / h.shape[0]

    #loss validation
    def loss_val(self, X, y):
        """Return the mean cross-entropy of the current weights on (X, one-hot y)."""
        return self._cross_entropy(self.h_theta(X, self.W), y)

    def _train_fold(self, X_tr, Y_tr, X_val, Y_val):
        """Train fresh weights on one fold and return the final validation loss.

        Batches are drawn from the training part of the fold only, so the validation
        rows never influence the weights.
        """
        m = X_tr.shape[0]
        self.W = np.random.rand(self.n, self.k)
        self.losses = []

        batch_size = max(1, int(0.3 * m))       # minibatch: 30% of the fold's training rows
        order, cursor = np.random.permutation(m), 0  # sto: visit rows without replacement

        start_time = time.time()
        for i in range(self.max_iter):
            if self.method == "batch":
                batch_X, batch_Y = X_tr, Y_tr
            elif self.method == "minibatch":
                rows = np.random.choice(m, size=batch_size, replace=False)
                batch_X, batch_Y = X_tr[rows], Y_tr[rows]
            else:  # "sto": one sample per step, reshuffled once every row has been used
                if cursor == m:
                    order, cursor = np.random.permutation(m), 0
                row = order[cursor]
                cursor += 1
                batch_X, batch_Y = X_tr[row:row + 1], Y_tr[row:row + 1]

            loss, grad = self.gradient(batch_X, batch_Y)
            self.losses.append(loss)
            self.W = self.W - self.alpha * grad

            # The validation loss is only needed when it is reported, so it is not
            # recomputed on every iteration.
            if i % 1000 == 0:
                val_loss = self.loss_val(X_val, Y_val)
                print(f"Train Loss at iteration {i}", loss, end=",")
                print(f"Val Loss at iteration {i}", val_loss)
        print(f"time taken: {time.time() - start_time}")

        return self.loss_val(X_val, Y_val)

    def fit(self, X, Y):
        """Cross-validate the model on (X, one-hot Y) and log the results to MLflow.

        Each fold is logged as a nested MLflow run. The average validation loss and the
        hyperparameters are logged to the caller's active run when there is one;
        otherwise a run is opened for this call and closed again when it finishes, so
        that consecutive fits never write conflicting parameters into a leftover run.

        Raises:
            ValueError: if ``method`` is not "batch", "minibatch" or "sto".
        """
        if self.method not in self._METHODS:
            raise ValueError('Method must be one of the followings: "batch", "minibatch" or "sto".')

        X = np.asarray(X)
        Y = np.asarray(Y)
        params = {"reg": type(self).__name__, "method": self.method, "lr": self.alpha, "max_iter": self.max_iter}

        owns_run = mlflow.active_run() is None
        if owns_run:
            mlflow.start_run(run_name=f"{type(self).__name__}-{self.method}")
        status = "FAILED"
        try:
            self.val_losses = []
            for fold, (train_idx, val_idx) in enumerate(self.cv.split(X)):
                print("="*5,f"Fold{fold}","="*5)
                with mlflow.start_run(run_name=f"Fold-{fold}", nested=True):
                    mlflow.log_params(params=params)
                    val_loss = self._train_fold(X[train_idx], Y[train_idx], X[val_idx], Y[val_idx])
                    mlflow.log_metric(key="val_loss", value=val_loss)
                self.val_losses.append(val_loss)

            # log the average of validation loss of the folds on MLflow
            avg_val_loss = sum(self.val_losses) / len(self.val_losses)
            mlflow.log_metric(key="avg val_loss", value=avg_val_loss)
            print("Mean of val_loss of 3 fold: ", avg_val_loss)
            mlflow.log_params(params=params)
            status = "FINISHED"
        finally:
            if owns_run:
                mlflow.end_run(status=status)

    def gradient(self, X, Y):
        """Return (mean cross-entropy loss, gradient of shape (n, k)) for one batch."""
        h = self.h_theta(X, self.W)
        loss = self._cross_entropy(h, Y)
        error = h - Y
        grad = self.softmax_grad(X, error)+ self.regularization.derivation(self.W)
        return loss, grad

    def softmax(self, theta_t_x):
        """Row-wise softmax of the (m, k) score matrix.

        The row maximum is subtracted first; this leaves the result unchanged
        mathematically but prevents overflow in exp() for large scores.
        """
        shifted = theta_t_x - np.max(theta_t_x, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def softmax_grad(self, X, error):
        """Gradient of the summed cross-entropy with respect to W: X^T (h - Y)."""
        return  X.T @ error

    def h_theta(self, X, W):
        '''
        Input:
            X shape: (m, n)
            w shape: (n, k)
        Returns:
            yhat shape: (m, k)
        '''
        return self.softmax(X @ W)

    def predict(self, X_test):
        """Return the most probable class index for every row of X_test."""
        return np.argmax(self.h_theta(X_test, self.W), axis=1)

    def plot(self):
        """Plot the training losses recorded during the most recently trained fold."""
        plt.plot(np.arange(len(self.losses)) , self.losses, label = "Train Losses")
        plt.title("Losses")
        plt.xlabel("epoch")
        plt.ylabel("losses")
        plt.legend()

    def _class_counts(self, X_test, y_test, class_label):
        """Return (true positives, predicted positives, actual positives) for one class."""
        y_true = np.asarray(y_test)
        y_pred = self.predict(X_test)
        predicted = y_pred == class_label
        actual = y_true == class_label
        return int(np.sum(predicted & actual)), int(np.sum(predicted)), int(np.sum(actual))

    def _class_average(self, metric, X_eval, y_eval, class_weights=None):
        """Average a per-class metric over the classes present in y_eval.

        Without class_weights this is the plain (macro) mean; otherwise each class score
        is weighted by ``class_weights[label]`` and divided by the sum of those weights.
        """
        weighted_sum = 0.0
        total_weight = 0.0
        for label in np.unique(y_eval):
            weight = 1.0 if class_weights is None else class_weights[label]
            weighted_sum += metric(X_eval, y_eval, class_label=label) * weight
            total_weight += weight
        return weighted_sum / total_weight

    @staticmethod
    def _eval_data(X, y):
        """Resolve the evaluation data, falling back to the notebook globals X_test / y_test.

        The fallback keeps the argument-free macro_* calls working as before.
        """
        return (X_test if X is None else X), (y_test if y is None else y)

    #create a function that calculate accuracy
    def accuracy(self, X_test, y_test):
        """Fraction of samples whose predicted class equals the true class."""
        y_true = np.asarray(y_test)
        return int(np.sum(self.predict(X_test) == y_true)) / len(y_true)

    #create a function that calculate precision
    def precision(self, X_test, y_test, class_label):
        """Precision for class_label: TP / predicted positives (0 if nothing was predicted)."""
        true_positives, predicted_positives, _ = self._class_counts(X_test, y_test, class_label)
        if predicted_positives == 0:
            return 0  #  avoid division by zero.
        return true_positives / predicted_positives

    #create a function that calculate recall
    def recall(self, X_test, y_test, class_label):
        """Recall for class_label: TP / actual positives (0 if the class is absent)."""
        true_positives, _, actual_positives = self._class_counts(X_test, y_test, class_label)
        if actual_positives == 0:
            return 0
        return true_positives / actual_positives

    #create a function that calculate f1
    def f1(self, precision, recall, class_label):
        """F1 score for class_label.

        Despite their names, the first two arguments are the feature matrix and the true
        labels: every caller passes ``f1(X_test, y_test, class_label=...)``. The parameter
        names are kept for backward compatibility, but the arguments are now actually
        used instead of the notebook globals.
        """
        X_eval, y_eval = precision, recall
        precision_score = self.precision(X_eval, y_eval, class_label)
        recall_score = self.recall(X_eval, y_eval, class_label)

        if precision_score + recall_score == 0:
            return 0

        return 2 * (precision_score * recall_score) / (precision_score + recall_score)

    #create a function that calculate macro precision
    def macro_precision(self, X=None, y=None):
        """Unweighted mean precision over classes (defaults to globals X_test / y_test)."""
        X_eval, y_eval = self._eval_data(X, y)
        return self._class_average(self.precision, X_eval, y_eval)

    #create a function that calculate macro recall
    def macro_recall(self, X=None, y=None):
        """Unweighted mean recall over classes (defaults to globals X_test / y_test)."""
        X_eval, y_eval = self._eval_data(X, y)
        return self._class_average(self.recall, X_eval, y_eval)

    #create a function that calculate macro f1
    def macro_f1(self, X=None, y=None):
        """Unweighted mean F1 over classes (defaults to globals X_test / y_test)."""
        X_eval, y_eval = self._eval_data(X, y)
        return self._class_average(self.f1, X_eval, y_eval)

    #create a function that calculate weighted precision
    def weighted_precision(self, X_test, y_test, class_weights):
        """Mean precision over classes, weighted by class_weights[label]."""
        return self._class_average(self.precision, X_test, y_test, class_weights)

    #create a function that calculate weighted recall
    def weighted_recall(self, X_test, y_test, class_weights):
        """Mean recall over classes, weighted by class_weights[label]."""
        return self._class_average(self.recall, X_test, y_test, class_weights)

    #create a function that calculate weighted f1
    def weighted_f1(self, X_test, y_test, class_weights):
        """Mean F1 over classes, weighted by class_weights[label]."""
        return self._class_average(self.f1, X_test, y_test, class_weights)

In [24]:
#helper function for looping classnames
import sys

def str_to_class(classname):
    """Return the object called `classname` from the module this function is defined in.

    Raises AttributeError when the module has no attribute with that name.
    """
    this_module = sys.modules[__name__]
    return getattr(this_module, classname)

In [25]:
class RidgePenalty:
    """L2 (ridge) penalty with strength `l`."""

    def __init__(self, l):
        self.l = l

    def __call__(self, W): #__call__ allows us to call class as method
        # Penalty value: l times the sum of squared weights.
        sum_of_squares = np.square(W).sum()
        return self.l * sum_of_squares

    def derivation(self, W):
        # Gradient of the penalty with respect to W.
        scale = self.l * 2
        return scale * W

class NormalPenalty:
    """No-op penalty: stores `l` for interface parity but contributes nothing."""

    def __init__(self, l):
        self.l = l

    def __call__(self, W): #__call__ allows us to call class as method
        # No regularisation, so the penalty value is always zero.
        penalty_value = 0
        return penalty_value

    def derivation(self, W):
        # No regularisation, so the penalty gradient is always zero.
        penalty_gradient = 0
        return penalty_gradient
    
    
        
class Ridge(LogisticRegression):
    """Softmax regression with an L2 (ridge) penalty of strength `l`."""

    def __init__(self, k,n, method, alpha, l):
        penalty = RidgePenalty(l)
        self.regularization = penalty
        super().__init__(regulaization=penalty, k=k, n=n, method=method, alpha=alpha)

class Normal(LogisticRegression):
    """Softmax regression without regularisation (`l` is accepted but has no effect)."""

    def __init__(self, k,n, method,alpha, l):
        penalty = NormalPenalty(l)
        self.regularization = penalty
        super().__init__(regulaization=penalty, k=k, n=n, method=method, alpha=alpha)

In [None]:
import time

# Train every combination of regularisation type and gradient-descent variant.
regs = ["Ridge", "Normal"]
methods = ["sto","minibatch", "batch"]

# Hyperparameters shared by all runs, defined once so that the printed header, the MLflow
# run name and the model arguments cannot drift apart.
LEARNING_RATE = 0.0001
RIDGE_LAMBDA = 0.1

for reg in regs:
    for method in methods:
        params = {"k": k, "n": n, "method": method, "alpha": LEARNING_RATE, "l": RIDGE_LAMBDA}
        print("="*5, reg,method,"lr",LEARNING_RATE,"l",RIDGE_LAMBDA, "="*5)

        type_of_regression = str_to_class(reg)    #Ridge, Normal
        run_name = f"method-{params['method']}-reg-{reg}-lr-{params['alpha']}-l-{params['l']}"

        # One parent run per configuration: the per-fold runs started inside fit() nest under
        # it, and the context manager closes it even if training fails. Without a parent run
        # the summary metric would land in an implicit run that is never ended, and the next
        # configuration would then try to overwrite that run's parameters.
        with mlflow.start_run(run_name=run_name, nested=True):
            model = type_of_regression(**params)
            model.fit(X_train, Y_train_encoded)
            # Logging the fitted model as an artifact is intentionally left out here.

In [None]:
# Baseline: unregularised softmax regression trained with full-batch gradient descent.
# LogisticRegression expects the penalty object as its first argument; leaving it out shifts
# every positional argument by one and raises a TypeError for the missing `method`.
model = LogisticRegression(NormalPenalty(0), k, X_train.shape[1], "batch")
model.fit(X_train, Y_train_encoded)
yhat = model.predict(X_test)
#model.plot()
print("=========Classification report=======")
print("Report: ", classification_report(y_test, yhat))

In [None]:
# Print the from-scratch metrics so they can be compared with sklearn's classification report.
class_label = [0,1,2,3]
class_weights = [0.2, 0.3, 0.2, 0.3]
metric_names = ("precision", "recall", "f1")

print(f"accuracy: {model.accuracy(X_test, y_test)}")

# Per-class precision, recall and F1.
for i in class_label:
    print("-"*10)
    for metric_name in metric_names:
        per_class_metric = getattr(model, metric_name)
        print(f"{metric_name} for class {i}: {per_class_metric(X_test, y_test, class_label=i)}")

# Unweighted (macro) averages over the classes.
print("-"*10)
for metric_name in metric_names:
    macro_metric = getattr(model, f"macro_{metric_name}")
    print(f"macro average {metric_name}: {macro_metric()}" )

# Averages weighted by class_weights.
print("-"*10)
for metric_name in metric_names:
    weighted_metric = getattr(model, f"weighted_{metric_name}")
    print(f"macro weighted {metric_name}: {weighted_metric(X_test, y_test, class_weights)}" )

The metrics computed from scratch are almost the same as the values in the classification report from the scikit-learn library.

## What is support

In [None]:
class RidgePenalty:
    """Ridge (L2) regularisation term scaled by `l`."""

    def __init__(self, l):
        self.l = l

    def __call__(self, theta): #__call__ allows us to call class as method
        # l * sum of squared parameters
        squared = np.square(theta)
        return self.l * np.sum(squared)

    def derivation(self, theta):
        # derivative of the penalty with respect to theta
        doubled_strength = self.l * 2
        return doubled_strength * theta


class Ridge(LogisticRegression):
    # Ridge-regularised model taking (method, initweight, momentum, lr, l).
    #
    # NOTE(review): this redefinition replaces the Ridge class defined earlier in the notebook
    # and does not match LogisticRegression.__init__(regulaization, k, n, method, alpha, ...):
    # the positional call below binds lr -> k, method -> n, initweight -> method and
    # momentum -> alpha. It looks like a leftover from a different base class -- confirm the
    # intended signature before using this cell.
    
    def __init__(self, method,initweight,momentum, lr, l):
        self.regularization = RidgePenalty(l)
        super().__init__(self.regularization,lr, method,initweight,momentum)

        


In [None]:
#helper function for looping classnames
import sys

def str_to_class(classname):
    """Resolve `classname` to the attribute of that name on the defining module."""
    namespace = sys.modules[__name__]
    resolved = getattr(namespace, classname)
    return resolved

In [None]:
# Train the ridge-regularised model with each gradient-descent variant and log the fitted
# model to MLflow so that it can be loaded again for testing.
methods = ["sto", "minibatch", "batch"]   # names accepted by LogisticRegression.fit
reg = "Ridge"
lr = 0.0001
l = 0.1

for method in methods:
    run_name = f"method-{method}-lr-{lr}-l-{l}-reg-{reg}"

    # The context manager ends the run even when training raises.
    with mlflow.start_run(run_name=run_name, nested=True):
        print("="*5, reg, method, "lr", lr, "l", l, "="*5)

        # Built directly from LogisticRegression + RidgePenalty: the Ridge wrapper redefined
        # above takes (method, initweight, momentum, lr, l), which does not match the
        # LogisticRegression constructor.
        model = LogisticRegression(RidgePenalty(l), k, n, method, alpha=lr)
        # fit() expects one-hot labels, so the encoded matrix is passed rather than y_train.
        model.fit(X_train, Y_train_encoded)

        signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
        # NOTE(review): the model is a custom class, not a scikit-learn estimator; confirm that
        # the sklearn flavour accepts it with the installed MLflow version.
        mlflow.sklearn.log_model(model, artifact_path='model', signature=signature)

## Testing



After obtaining the best model, I evaluate its performance on the test set.

In [None]:
# Load the selected model from MLflow and evaluate it on the held-out test set.
# NOTE(review): the run id is hard-coded; make sure it refers to a classifier logged by this
# notebook (trained on the same columns, including the intercept).
model = mlflow.pyfunc.load_model('runs:/c9d107e58e6a4c13ba6749dcda021f02/model/')

yhat = np.asarray(model.predict(X_test)).ravel()

# y_test holds the price-class ids 0-3, so classification metrics are reported instead of
# regression metrics such as MSE and R2.
accuracy = (yhat == y_test).mean()

print("Test accuracy: ", accuracy)
print("=========Classification report=======")
print("Report: ", classification_report(y_test, yhat))

## Analysis:  Feature Importance


In [None]:
def feature_importance(theta, feature_names=('max_power', 'mileage', 'year')):
    """Plot model coefficients as a horizontal bar chart.

    Args:
        theta: coefficients without the intercept. Either a 1-D sequence with one value
            per feature, or a 2-D array of shape (n_features, n_classes) such as the
            weight matrix of a softmax model; in the 2-D case one bar is drawn per class.
        feature_names: labels for the rows of `theta`; defaults to the three features
            used in this notebook.

    Raises:
        ValueError: if `theta` has more than two dimensions or its number of rows does
            not match `feature_names`.
    """
    weights = np.asarray(theta, dtype=float)
    feature_names = list(feature_names)

    if weights.ndim == 1:
        weights = weights.reshape(-1, 1)
        columns = ["Coefficients"]
    elif weights.ndim == 2:
        columns = [f"class {c}" for c in range(weights.shape[1])]
    else:
        raise ValueError(f"theta must be 1-D or 2-D, got {weights.ndim} dimensions")

    if weights.shape[0] != len(feature_names):
        raise ValueError(
            f"theta has {weights.shape[0]} rows but {len(feature_names)} feature names were given"
        )

    coefs = pd.DataFrame(weights, columns=columns, index=feature_names)

    coefs.plot(kind="barh", figsize=(9, 7))
    plt.title("Feature Importance")
    plt.axvline(x=0, color=".5")  # reference line separating negative from positive weights
    plt.subplots_adjust(left=0.3)  # leave room for the feature labels

        
# Fetch the logged run and collect its 'weight-<i>' metrics, one per non-intercept column
# of X_train, then plot them.
run = mlflow.get_run(run_id="c9d107e58e6a4c13ba6749dcda021f02")
thetas = [run.data.metrics[f'weight-{i}'] for i in range(X_train.shape[1]-1)]
feature_importance(thetas)