link of data used:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# load dataset

In [3]:
datapath = '/content/drive/MyDrive/ml_code/regression model/train-data.csv'
datapath2 = '/content/drive/MyDrive/ml_code/regression model/test-data.csv'

df = pd.read_csv(datapath)
df_test = pd.read_csv(datapath2)


In [4]:
index = df["Unnamed: 0"];
df.drop(df.columns[0],axis = 1,inplace = True)
df.insert(0,"index",index)

index = df_test["Unnamed: 0"];
df_test.drop(df_test.columns[0],axis = 1,inplace = True)
df_test.insert(0,"index",index)

target_variable = df["Price"]

# **understanding data**

In [5]:
df.head(10)

Unnamed: 0,index,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
5,5,Hyundai EON LPG Era Plus Option,Hyderabad,2012,75000,LPG,Manual,First,21.1 km/kg,814 CC,55.2 bhp,5.0,,2.35
6,6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5
7,7,Toyota Innova Crysta 2.8 GX AT 8S,Mumbai,2016,36000,Diesel,Automatic,First,11.36 kmpl,2755 CC,171.5 bhp,8.0,21 Lakh,17.5
8,8,Volkswagen Vento Diesel Comfortline,Pune,2013,64430,Diesel,Manual,First,20.54 kmpl,1598 CC,103.6 bhp,5.0,,5.2
9,9,Tata Indica Vista Quadrajet LS,Chennai,2012,65932,Diesel,Manual,Second,22.3 kmpl,1248 CC,74 bhp,5.0,,1.95


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB
None


In [7]:
print(df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              1234 non-null   int64  
 1   Name               1234 non-null   object 
 2   Location           1234 non-null   object 
 3   Year               1234 non-null   int64  
 4   Kilometers_Driven  1234 non-null   int64  
 5   Fuel_Type          1234 non-null   object 
 6   Transmission       1234 non-null   object 
 7   Owner_Type         1234 non-null   object 
 8   Mileage            1234 non-null   object 
 9   Engine             1224 non-null   object 
 10  Power              1224 non-null   object 
 11  Seats              1223 non-null   float64
 12  New_Price          182 non-null    object 
dtypes: float64(1), int64(3), object(9)
memory usage: 125.5+ KB
None


# handeling missing data

###finding amount of missing data in each column

In [8]:
max_non_null_val = df["index"].count()
print("Percentage of missing values in each column:\n")
missing_percent = []
for col in df.columns:
  missing_percent.append(df[col].isnull().sum()/max_non_null_val*100)
  print(f"{col} : {df[col].isnull().sum()/max_non_null_val*100:.2f}%")

Percentage of missing values in each column:

index : 0.00%
Name : 0.00%
Location : 0.00%
Year : 0.00%
Kilometers_Driven : 0.00%
Fuel_Type : 0.00%
Transmission : 0.00%
Owner_Type : 0.00%
Mileage : 0.03%
Engine : 0.60%
Power : 0.60%
Seats : 0.70%
New_Price : 86.31%
Price : 0.00%


###dropping columns with more than 40% missing values


In [9]:
from operator import indexOf
index_list =[]
for i in missing_percent:
  if(i>40):
    index_list.append(indexOf(missing_percent,i))
df_test.drop(df.columns[index_list],axis=1,inplace=True)
df.drop(df.columns[index_list],axis=1,inplace=True)
print(df.info())
print(df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(8)
memory usage: 611.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 12 columns

###classifying columns based on numerical or categorical data

In [10]:
def classify_cols(df , cat_thresh = 15, card_thresh  = 20 ):
  cat_cols = [col for col in df.columns if df[col].dtypes == "O"]
  num_but_cat_cols = [col for col in df.columns if df[col].dtypes != "O" and df[col].nunique()<cat_thresh]
  cat_but_card_cols = [col for col in df.columns if df[col].dtypes == "O" and df[col].nunique()>card_thresh]
  # cat_cols = cat_cols + num_but_cat_cols
  cat_cols = [col for col in cat_cols if col not in cat_but_card_cols]
  num_cols = [col for col in df.columns if df[col].dtypes != "O" and col not in num_but_cat_cols]
  return cat_cols, num_cols, cat_but_card_cols,num_but_cat_cols

cat_cols, num_cols, cat_but_card_cols, num_but_cat_cols = classify_cols(df)
print(f"cat_cols: {(cat_cols)}")
print(f"num_cols: {(num_cols)}")
print(f"cat_but_car: {(cat_but_card_cols)}")
print(f"num_but_cat_cols: {(num_but_cat_cols)}")

cat_cols: ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type']
num_cols: ['index', 'Year', 'Kilometers_Driven', 'Price']
cat_but_car: ['Name', 'Mileage', 'Engine', 'Power']
num_but_cat_cols: ['Seats']


# Dropping Name column

In [11]:
df.drop(columns = ["Name"],inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              6019 non-null   int64  
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(7)
memory usage: 564.4+ KB


In [12]:
df_test.drop(columns = ["Name"],inplace = True)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              1234 non-null   int64  
 1   Location           1234 non-null   object 
 2   Year               1234 non-null   int64  
 3   Kilometers_Driven  1234 non-null   int64  
 4   Fuel_Type          1234 non-null   object 
 5   Transmission       1234 non-null   object 
 6   Owner_Type         1234 non-null   object 
 7   Mileage            1234 non-null   object 
 8   Engine             1224 non-null   object 
 9   Power              1224 non-null   object 
 10  Seats              1223 non-null   float64
dtypes: float64(1), int64(3), object(7)
memory usage: 106.2+ KB


In [13]:
def get_int(str):
  l=[]
  l = str.split(" ")
  if l[0]== 'null':
        return np.nan
  return l[0]

In [14]:
for col in ['Mileage', 'Engine', 'Power']:
  df[col] = df[col].astype(str).apply(get_int)
  df_test[col] = df_test[col].astype(str).apply(get_int)

for col in ['Mileage', 'Engine', 'Power']:
    df[col] = df[col].astype(float)
    df_test[col] = df_test[col].astype(float)

df.head(10)


Unnamed: 0,index,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,2,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74
5,5,Hyderabad,2012,75000,LPG,Manual,First,21.1,814.0,55.2,5.0,2.35
6,6,Jaipur,2013,86999,Diesel,Manual,First,23.08,1461.0,63.1,5.0,3.5
7,7,Mumbai,2016,36000,Diesel,Automatic,First,11.36,2755.0,171.5,8.0,17.5
8,8,Pune,2013,64430,Diesel,Manual,First,20.54,1598.0,103.6,5.0,5.2
9,9,Chennai,2012,65932,Diesel,Manual,Second,22.3,1248.0,74.0,5.0,1.95


In [15]:
def classify_cols(df , cat_thresh = 15, card_thresh  = 20 ):
  cat_cols = [col for col in df.columns if df[col].dtypes == "O"]
  num_but_cat_cols = [col for col in df.columns if df[col].dtypes != "O" and df[col].nunique()<cat_thresh]
  cat_but_card_cols = [col for col in df.columns if df[col].dtypes == "O" and df[col].nunique()>card_thresh]
  # cat_cols = cat_cols + num_but_cat_cols
  cat_cols = [col for col in cat_cols if col not in cat_but_card_cols]
  num_cols = [col for col in df.columns if df[col].dtypes != "O" and col not in num_but_cat_cols]
  return cat_cols, num_cols, cat_but_card_cols,num_but_cat_cols

cat_cols, num_cols, cat_but_card_cols, num_but_cat_cols = classify_cols(df)
print(f"cat_cols: {(cat_cols)}")
print(f"num_cols: {(num_cols)}")
print(f"cat_but_car: {(cat_but_card_cols)}")
print(f"num_but_cat_cols: {(num_but_cat_cols)}")

cat_cols: ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type']
num_cols: ['index', 'Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Price']
cat_but_car: []
num_but_cat_cols: ['Seats']


###Filling missing values in numerical columns with the mean value

In [16]:
for col in num_cols:
  df[col] = df[col].fillna(df[col].mean())
  if(col != "Price"):
    df_test[col] = df_test[col].fillna(df[col].mean())
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              6019 non-null   int64  
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6019 non-null   float64
 8   Engine             6019 non-null   float64
 9   Power              6019 non-null   float64
 10  Seats              5977 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(5), int64(3), object(4)
memory usage: 564.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
--

###Filling missing values in categorical columns with mode value

In [17]:
for col in cat_cols:
  df[col] = df[col].fillna(df[col].mode()[0])
  df_test[col] = df_test[col].fillna(df[col].mode()[0])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              6019 non-null   int64  
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6019 non-null   float64
 8   Engine             6019 non-null   float64
 9   Power              6019 non-null   float64
 10  Seats              5977 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(5), int64(3), object(4)
memory usage: 564.4+ KB


In [18]:
for col in num_but_cat_cols:
  df[col] = df[col].fillna(df[col].mode()[0])
  df_test[col] = df_test[col].fillna(df[col].mode()[0])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              6019 non-null   int64  
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6019 non-null   float64
 8   Engine             6019 non-null   float64
 9   Power              6019 non-null   float64
 10  Seats              6019 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(5), int64(3), object(4)
memory usage: 564.4+ KB


In [19]:
for col in cat_but_card_cols:
  df[col] = df[col].fillna(df[col].mode()[0])
  df_test[col] = df_test[col].fillna(df[col].mode()[0])
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              6019 non-null   int64  
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6019 non-null   float64
 8   Engine             6019 non-null   float64
 9   Power              6019 non-null   float64
 10  Seats              6019 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(5), int64(3), object(4)
memory usage: 564.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
--

# Descriptive stats


In [20]:
stats = df.describe()
print(stats)

             index         Year  Kilometers_Driven      Mileage       Engine  \
count  6019.000000  6019.000000       6.019000e+03  6019.000000  6019.000000   
mean   3009.000000  2013.358199       5.873838e+04    18.134961  1621.276450   
std    1737.679967     3.269742       9.126884e+04     4.581528   599.553865   
min       0.000000  1998.000000       1.710000e+02     0.000000    72.000000   
25%    1504.500000  2011.000000       3.400000e+04    15.170000  1198.000000   
50%    3009.000000  2014.000000       5.300000e+04    18.150000  1493.000000   
75%    4513.500000  2016.000000       7.300000e+04    21.100000  1969.000000   
max    6018.000000  2019.000000       6.500000e+06    33.540000  5998.000000   

             Power        Seats        Price  
count  6019.000000  6019.000000  6019.000000  
mean    113.253050     5.276790     9.479468  
std      53.231019     0.806346    11.187917  
min      34.200000     0.000000     0.440000  
25%      78.000000     5.000000     3.500000

#correlation matrix of numerical columns



###making df with only numerical columns

In [21]:
num_df = df[num_cols]
num_df

Unnamed: 0,index,Year,Kilometers_Driven,Mileage,Engine,Power,Price
0,0,2010,72000,26.60,998.0,58.16,1.75
1,1,2015,41000,19.67,1582.0,126.20,12.50
2,2,2011,46000,18.20,1199.0,88.70,4.50
3,3,2012,87000,20.77,1248.0,88.76,6.00
4,4,2013,40670,15.20,1968.0,140.80,17.74
...,...,...,...,...,...,...,...
6014,6014,2014,27365,28.40,1248.0,74.00,4.75
6015,6015,2015,100000,24.40,1120.0,71.00,4.00
6016,6016,2012,55000,14.00,2498.0,112.00,2.90
6017,6017,2013,46000,18.90,998.0,67.10,2.65


###making correlation matrix of numerical df

In [22]:
num_corr_df = num_df.corr()
num_corr_df

Unnamed: 0,index,Year,Kilometers_Driven,Mileage,Engine,Power,Price
index,1.0,0.002354,-0.008734,0.023673,-0.004163,-0.012765,-0.020275
Year,0.002354,1.0,-0.173048,0.321534,-0.051712,0.013895,0.305327
Kilometers_Driven,-0.008734,-0.173048,1.0,-0.065253,0.091029,0.033419,-0.011493
Mileage,0.023673,0.321534,-0.065253,1.0,-0.588354,-0.506801,-0.306588
Engine,-0.004163,-0.051712,0.091029,-0.588354,1.0,0.85902,0.657118
Power,-0.012765,0.013895,0.033419,-0.506801,0.85902,1.0,0.767331
Price,-0.020275,0.305327,-0.011493,-0.306588,0.657118,0.767331,1.0


#removing numerical columns that have less than 5% correlation with target value

In [23]:
corr_coef_with_target_value = num_corr_df["Price"].values
indexes_with_lessthan_5_corr = []
for i in corr_coef_with_target_value:
  if (abs(i)<0.1):
    indexes_with_lessthan_5_corr.append(indexOf(corr_coef_with_target_value,i))
num_df.columns[indexes_with_lessthan_5_corr]
df.drop(num_df.columns[indexes_with_lessthan_5_corr], axis=1,inplace=True)
df_test.drop(num_df.columns[indexes_with_lessthan_5_corr], axis=1,inplace=True)
num_df.drop(num_df.columns[indexes_with_lessthan_5_corr], axis=1,inplace=True)
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      6019 non-null   object 
 1   Year          6019 non-null   int64  
 2   Fuel_Type     6019 non-null   object 
 3   Transmission  6019 non-null   object 
 4   Owner_Type    6019 non-null   object 
 5   Mileage       6019 non-null   float64
 6   Engine        6019 non-null   float64
 7   Power         6019 non-null   float64
 8   Seats         6019 non-null   float64
 9   Price         6019 non-null   float64
dtypes: float64(5), int64(1), object(4)
memory usage: 470.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      1234 non-null   object 
 1   Year          1234 non-null   int64  
 2   Fuel_Type     1234 non-null   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_df.drop(num_df.columns[indexes_with_lessthan_5_corr], axis=1,inplace=True)


# encoding categorical data

In [24]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

columns_to_encode = cat_cols
columns_to_encode = [col for col in columns_to_encode if col not in cat_but_card_cols]

columns_to_encode

encoder = OneHotEncoder(sparse_output=False, drop="first")
encoder = encoder.fit(df[columns_to_encode])

encoded_categorical_data = encoder.transform(df[columns_to_encode])
encoded_categorical_data_test = encoder.transform(df_test[columns_to_encode])

encoded_categorical_df = pd.DataFrame(encoded_categorical_data,
                                      columns=encoder.get_feature_names_out(columns_to_encode))
encoded_categorical_df_test = pd.DataFrame(encoded_categorical_data_test,
                                      columns=encoder.get_feature_names_out(columns_to_encode))

encoded_categorical_df.index = df.index
encoded_categorical_df_test.index = df_test.index


df.drop(columns=columns_to_encode, inplace=True)
df_test.drop(columns=columns_to_encode , inplace =True)

df = pd.concat([df, encoded_categorical_df], axis=1)
df_test = pd.concat([df_test,encoded_categorical_df_test], axis = 1)

Price = df["Price"]
df.drop(columns=["Price"], inplace=True)
df = pd.concat([df, Price], axis=1)

print(df.info(10))
print(df_test.info(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       6019 non-null   int64  
 1   Mileage                    6019 non-null   float64
 2   Engine                     6019 non-null   float64
 3   Power                      6019 non-null   float64
 4   Seats                      6019 non-null   float64
 5   Location_Bangalore         6019 non-null   float64
 6   Location_Chennai           6019 non-null   float64
 7   Location_Coimbatore        6019 non-null   float64
 8   Location_Delhi             6019 non-null   float64
 9   Location_Hyderabad         6019 non-null   float64
 10  Location_Jaipur            6019 non-null   float64
 11  Location_Kochi             6019 non-null   float64
 12  Location_Kolkata           6019 non-null   float64
 13  Location_Mumbai            6019 non-null   float

#correlation matrix of whole dataframe after encoding


In [25]:
df_corr = df.corr()
df_corr

Unnamed: 0,Year,Mileage,Engine,Power,Seats,Location_Bangalore,Location_Chennai,Location_Coimbatore,Location_Delhi,Location_Hyderabad,...,Location_Pune,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Manual,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third,Price
Year,1.0,0.3215336,-0.051712,0.013895,0.015204,-0.042806,-0.131993,0.214594,-0.002714,-0.07787,...,-0.112649,0.126185,0.0007907273,-0.031919,-0.12749,-0.097059,-0.077886,-0.297913,-0.231184,0.305327
Mileage,0.321534,1.0,-0.588354,-0.506801,-0.299631,-0.077411,-0.007708,-0.024676,-0.016036,0.028454,...,-0.014661,0.113106,3.032703e-17,0.011131,-0.143909,0.333096,-0.049101,-0.122522,-0.084158,-0.306588
Engine,-0.051712,-0.5883536,1.0,0.85902,0.392982,0.080467,0.000351,0.055696,0.046141,0.006981,...,-0.041632,0.426749,-0.02087046,-0.040411,-0.4069,-0.499082,0.019378,0.042432,0.019114,0.657118
Power,0.013895,-0.5068014,0.85902,1.0,0.100188,0.088377,-0.027181,0.078231,0.040338,-0.027747,...,-0.056653,0.288974,-0.01926825,-0.04088,-0.268738,-0.638835,-0.004626,0.030194,0.003455,0.767331
Seats,0.015204,-0.2996312,0.392982,0.100188,1.0,0.015603,0.013713,0.010699,0.036109,0.017313,...,-0.004849,0.308869,-0.006258788,-0.014004,-0.302446,0.074838,0.040043,-0.006132,0.011725,0.052811
Location_Bangalore,-0.042806,-0.07741145,0.080467,0.088377,0.015603,1.0,-0.075196,-0.086439,-0.080067,-0.094298,...,-0.085372,0.039944,-0.004584795,-0.010259,-0.034309,-0.064833,0.008448,0.10789,0.027321,0.086526
Location_Chennai,-0.131993,-0.007707808,0.000351,-0.027181,0.013713,-0.075196,1.0,-0.102781,-0.095204,-0.112126,...,-0.101512,0.031487,0.02776002,-0.012198,-0.028405,0.028359,0.019759,0.068465,0.119189,-0.04613
Location_Coimbatore,0.214594,-0.02467598,0.055696,0.078231,0.010699,-0.086439,-0.102781,1.0,-0.10944,-0.128892,...,-0.11669,0.018785,-0.006266735,-0.014022,-0.011021,-0.078064,0.000686,-0.057793,-0.043563,0.172026
Location_Delhi,-0.002714,-0.01603626,0.046141,0.040338,0.036109,-0.080067,-0.095204,-0.10944,1.0,-0.11939,...,-0.108088,0.012677,-0.005804767,-0.012988,-0.017016,-0.016141,-0.012321,0.007671,-0.031337,0.006496
Location_Hyderabad,-0.07787,0.02845426,0.006981,-0.027747,0.017313,-0.094298,-0.112126,-0.128892,-0.11939,1.0,...,-0.1273,0.112327,-0.006836493,0.059155,-0.113148,0.017937,-0.014511,-0.048603,-0.051868,0.009958


# dropping columns with less than 10% correlation

In [26]:
corr_coef_with_target_value = df_corr["Price"].values
indexes_with_lessthan_30_corr = []
for i in corr_coef_with_target_value:
  if (abs(i)<0.1):
    indexes_with_lessthan_30_corr.append(indexOf(corr_coef_with_target_value,i))
df.drop(df.columns[indexes_with_lessthan_30_corr], axis=1,inplace=True)
df_test.drop(df_test.columns[indexes_with_lessthan_30_corr], axis =1, inplace = True)
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year                 6019 non-null   int64  
 1   Mileage              6019 non-null   float64
 2   Engine               6019 non-null   float64
 3   Power                6019 non-null   float64
 4   Location_Coimbatore  6019 non-null   float64
 5   Location_Kolkata     6019 non-null   float64
 6   Fuel_Type_Diesel     6019 non-null   float64
 7   Fuel_Type_Petrol     6019 non-null   float64
 8   Transmission_Manual  6019 non-null   float64
 9   Price                6019 non-null   float64
dtypes: float64(9), int64(1)
memory usage: 470.4 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year                 1234 non-

# Splitting training data into training and testing datasets

In [27]:
y = df['Price']
df_X = df.drop(columns=['Price'])

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_X, y, test_size = 0.25, random_state = 0)


#Data scaling

In [29]:
scaler = StandardScaler()
scaler = scaler.fit(X_train)

scaled_array_train = scaler.transform(X_train)
scaled_array_test = scaler.transform(X_test)

X_train = pd.DataFrame(scaled_array_train, columns=X_train.columns)
X_test = pd.DataFrame(scaled_array_test, columns=X_test.columns)

scaled_array_test = scaler.transform(df_test)
df_test = pd.DataFrame(scaled_array_test, columns=df_test.columns)


#Trainnig multiple regression model



In [30]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train , y_train)

In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Linear Regression - MAE: {mae}, RMSE: {rmse}, R²: {r2}")

Linear Regression - MAE: 3.598931527008787, RMSE: 5.8505172154092255, R²: 0.7261393259747739


#Training a Descision Tree model

In [32]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

In [33]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"descision tree Regression - MAE: {mae}, RMSE: {rmse}, R²: {r2}")

descision tree Regression - MAE: 2.0560306872898235, RMSE: 5.827973498816888, R²: 0.7282457867929254


#Training a random forest model

In [34]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(max_depth=30, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=1000)
regressor.fit(X_train, y_train)

In [35]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [500, 700, 1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(regressor, param_grid, cv=5,verbose = 2)
grid_search.fit(X_train, y_train)
best_regressor = grid_search.best_estimator_
best_parameters_rf = grid_search.best_params_

print("Best Parameters:", best_parameters_rf)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=700; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=700; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
s

Best Parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 700}


###test for overfitting

In [36]:
regressor = RandomForestRegressor(**best_parameters_rf)
regressor.fit(X_train, y_train)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = regressor.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
r2 = r2_score(y_train, y_pred)

print(f"random forest Regression - MAE: {mae}, RMSE: {rmse}, R²: {r2}")

random forest Regression - MAE: 0.7789303795556172, RMSE: 1.6424692989235201, R²: 0.9784482626121408


###building best model

In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"random forest Regression - MAE: {mae}, RMSE: {rmse}, R²: {r2}")

random forest Regression - MAE: 1.6041726242010403, RMSE: 3.9792449456011094, R²: 0.8733099533381604


#Training a svm model

In [38]:
from sklearn.svm import SVR
regressor = SVR(C=100, epsilon=1, gamma=0.01, kernel='linear')
regressor.fit(X_train, y_train)

In [43]:
!pip install optuna


Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [44]:
import optuna
def svr_objective(trial):
    params = {
        'C': trial.suggest_float('C', 0.1, 100),
        'epsilon': trial.suggest_float('epsilon', 0.01, 1),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
        'degree': trial.suggest_int('degree', 2, 5) if trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']) == 'poly' else 3,
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto'])
    }

    svr_model = SVR(**params)
    svr_model.fit(X_train, y_train)
    y_pred = svr_model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

svr_study = optuna.create_study(direction='minimize')
svr_study.optimize(svr_objective, n_trials=50)
best_parameters_svr = svr_study.best_params
print("Best hyperparameters for SVR: ", svr_study.best_params)


[I 2024-07-30 11:47:38,302] A new study created in memory with name: no-name-ed447367-b1aa-497e-bfc4-0df18132080e
[I 2024-07-30 11:48:39,492] Trial 0 finished with value: 44.18094148545178 and parameters: {'C': 99.36962336541723, 'epsilon': 0.1647724464495137, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 0 with value: 44.18094148545178.
[I 2024-07-30 11:48:43,588] Trial 1 finished with value: 31173101.117640033 and parameters: {'C': 97.15165444572065, 'epsilon': 0.5093828744477639, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 0 with value: 44.18094148545178.
[I 2024-07-30 11:48:45,005] Trial 2 finished with value: 20.434279674663216 and parameters: {'C': 83.04206515232055, 'epsilon': 0.7804322601172766, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 2 with value: 20.434279674663216.
[I 2024-07-30 11:48:47,073] Trial 3 finished with value: 20643927.261338394 and parameters: {'C': 79.0481933274398, 'epsilon': 0.5963164133798825, 'kernel': 'sigmoid', 'gamma': 'scale'}.

Best hyperparameters for SVR:  {'C': 99.98000958846642, 'epsilon': 0.9359223548529659, 'kernel': 'rbf', 'gamma': 'scale'}


In [46]:
regressor = SVR(**best_parameters_svr)
regressor.fit(X_train, y_train)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"svr Regression - MAE: {mae}, RMSE: {rmse}, R²: {r2}")

svr Regression - MAE: 2.0026050622048275, RMSE: 4.487321024428119, R²: 0.8388926218963053


#Stacking models

In [47]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

base_models = [
    ('linear', LinearRegression()),
    ('svr', SVR(**best_parameters_svr)),
    ('rf', RandomForestRegressor(**best_parameters_rf))
]

stack_model = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge()
)

stack_model.fit(X_train, y_train)

y_pred = stack_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Stacking Regressor - MAE: {mae}, RMSE: {rmse}, R²: {r2}")


Stacking Regressor - MAE: 1.6352664151091745, RMSE: 4.039853971930086, R²: 0.8694212570340741


#Training XG Boost model

In [48]:
import xgboost as xgb
model = xgb.XGBRegressor(eval_metric= 'rmse')
model.fit(X_train, y_train)


In [57]:

import optuna

def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10)

    }

    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )
    y_pred = xgb_model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500)
print("Best hyperparameters: ", study.best_params)


[I 2024-07-30 12:15:56,097] A new study created in memory with name: no-name-149d3f90-4cd7-4f8d-8468-e9c43e555691
[I 2024-07-30 12:15:56,276] Trial 0 finished with value: 20.11409846506419 and parameters: {'learning_rate': 0.1936200475298816, 'max_depth': 4, 'n_estimators': 136, 'colsample_bytree': 0.5945256361122702, 'subsample': 0.871001443885298, 'scale_pos_weight': 4.0599604579151976}. Best is trial 0 with value: 20.11409846506419.
[I 2024-07-30 12:15:56,395] Trial 1 finished with value: 19.946235553460205 and parameters: {'learning_rate': 0.04293254448282202, 'max_depth': 7, 'n_estimators': 50, 'colsample_bytree': 0.6658979409919759, 'subsample': 0.834422185125615, 'scale_pos_weight': 6.0732827794720725}. Best is trial 1 with value: 19.946235553460205.
[I 2024-07-30 12:15:56,698] Trial 2 finished with value: 16.74187826687289 and parameters: {'learning_rate': 0.19135198449675178, 'max_depth': 8, 'n_estimators': 146, 'colsample_bytree': 0.7147255694155832, 'subsample': 0.5164627221

Best hyperparameters:  {'learning_rate': 0.04284189640550591, 'max_depth': 10, 'n_estimators': 126, 'colsample_bytree': 0.8698605137266143, 'subsample': 0.5177425463557167, 'scale_pos_weight': 3.28936797786666}


In [58]:
model = xgb.XGBRegressor(eval_metric= 'rmse', **study.best_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"XGBoost - MAE: {mae}, RMSE: {rmse}, R²: {r2}")

XGBoost - MAE: 1.5520885560868982, RMSE: 3.7624190475744235, R²: 0.8867402813981602


#Building an ann model

In [54]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

model.fit(X_train, y_train, epochs=100, batch_size=10, validation_split=0.2, verbose=2)

y_pred = model.predict(X_test).flatten()
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"ANN - MAE: {mae}, RMSE: {rmse}, R²: {r2}")


Epoch 1/100
362/362 - 2s - loss: 40.1076 - val_loss: 23.5728 - 2s/epoch - 5ms/step
Epoch 2/100
362/362 - 1s - loss: 27.9252 - val_loss: 30.0963 - 1s/epoch - 3ms/step
Epoch 3/100
362/362 - 1s - loss: 25.8421 - val_loss: 20.8706 - 1s/epoch - 3ms/step
Epoch 4/100
362/362 - 1s - loss: 25.1364 - val_loss: 19.1370 - 997ms/epoch - 3ms/step
Epoch 5/100
362/362 - 1s - loss: 24.3710 - val_loss: 21.6323 - 709ms/epoch - 2ms/step
Epoch 6/100
362/362 - 1s - loss: 25.5052 - val_loss: 16.8467 - 650ms/epoch - 2ms/step
Epoch 7/100
362/362 - 1s - loss: 22.1399 - val_loss: 19.4866 - 640ms/epoch - 2ms/step
Epoch 8/100
362/362 - 1s - loss: 22.9889 - val_loss: 20.0629 - 682ms/epoch - 2ms/step
Epoch 9/100
362/362 - 1s - loss: 24.6118 - val_loss: 17.1102 - 648ms/epoch - 2ms/step
Epoch 10/100
362/362 - 1s - loss: 22.8370 - val_loss: 21.8942 - 639ms/epoch - 2ms/step
Epoch 11/100
362/362 - 1s - loss: 21.8813 - val_loss: 18.5365 - 657ms/epoch - 2ms/step
Epoch 12/100
362/362 - 1s - loss: 21.9484 - val_loss: 34.2136

In [55]:
!pip install keras-tuner

import keras
from keras import layers
from keras_tuner import RandomSearch

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units1', min_value=32, max_value=128, step=32), activation='relu', input_dim=X_train.shape[1]))
    model.add(Dense(units=hp.Int('units2', min_value=16, max_value=64, step=16), activation='relu'))
    model.add(Dense(1))

    model.compile(
        optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-1, sampling='LOG')),
        loss='mean_squared_error'
    )
    return model

tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='my_dir',
    project_name='intro_to_kt'
)

tuner.search(X_train, y_train, epochs=100, validation_split=0.2)

best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()



Trial 10 Complete [00h 00m 43s]
val_loss: 16.421184539794922

Best val_loss So Far: 14.549753189086914
Total elapsed time: 00h 06m 06s
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 96)                960       
                                                                 
 dense_1 (Dense)             (None, 48)                4656      
                                                                 
 dense_2 (Dense)             (None, 1)                 49        
                                                                 
Total params: 5665 (22.13 KB)
Trainable params: 5665 (22.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [56]:
best_model.fit(X_train, y_train, epochs=100, batch_size=10, validation_split=0.2, verbose=2)

y_pred = best_model.predict(X_test).flatten()
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"ANN - MAE: {mae}, RMSE: {rmse}, R²: {r2}")


Epoch 1/100
362/362 - 3s - loss: 18.7117 - val_loss: 26.7424 - 3s/epoch - 7ms/step
Epoch 2/100
362/362 - 2s - loss: 19.0846 - val_loss: 43.8406 - 2s/epoch - 6ms/step
Epoch 3/100
362/362 - 2s - loss: 20.2149 - val_loss: 19.2248 - 2s/epoch - 7ms/step
Epoch 4/100
362/362 - 1s - loss: 17.8889 - val_loss: 16.6487 - 1s/epoch - 3ms/step
Epoch 5/100
362/362 - 1s - loss: 16.8199 - val_loss: 15.3411 - 1s/epoch - 4ms/step
Epoch 6/100
362/362 - 3s - loss: 16.8236 - val_loss: 16.7463 - 3s/epoch - 8ms/step
Epoch 7/100
362/362 - 1s - loss: 16.4457 - val_loss: 17.0408 - 1s/epoch - 4ms/step
Epoch 8/100
362/362 - 1s - loss: 16.9658 - val_loss: 15.1745 - 1s/epoch - 4ms/step
Epoch 9/100
362/362 - 1s - loss: 16.6627 - val_loss: 18.4689 - 1s/epoch - 4ms/step
Epoch 10/100
362/362 - 4s - loss: 17.0136 - val_loss: 16.3303 - 4s/epoch - 11ms/step
Epoch 11/100
362/362 - 3s - loss: 16.7120 - val_loss: 16.5723 - 3s/epoch - 7ms/step
Epoch 12/100
362/362 - 2s - loss: 16.3653 - val_loss: 14.8038 - 2s/epoch - 6ms/step
