In [16]:
import pandas as pd 
import pickle as pkl 
import json 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge

In [17]:
# Long listing of the working directory, sorted by modification time with the
# oldest entry first (-t reversed by -r); sizes are reported in units of K.
!ls -lrt --block-size=K

total 508K
drwxr-xr-x 7 tirthankar-mittra tirthankar-mittra   4K Feb 20 21:07 streamlit
-rw-r--r-- 1 tirthankar-mittra tirthankar-mittra 445K Feb 21 00:45 Cars24.csv
drwxr-xr-x 2 tirthankar-mittra tirthankar-mittra   4K Feb 21 10:49 checkpoint
-rw-r--r-- 1 tirthankar-mittra tirthankar-mittra   1K Feb 21 10:52 app_stock_price.py
-rw-r--r-- 1 tirthankar-mittra tirthankar-mittra   1K Feb 21 12:37 app_hello.py
-rw-r--r-- 1 tirthankar-mittra tirthankar-mittra   1K Feb 21 12:46 requirements.txt
-rw-r--r-- 1 tirthankar-mittra tirthankar-mittra   1K Feb 21 13:08 README.md
-rw-r--r-- 1 tirthankar-mittra tirthankar-mittra   2K Feb 21 20:26 app_cars24_price.py
-rw-r--r-- 1 tirthankar-mittra tirthankar-mittra  29K Feb 21 20:31 cars24_train.ipynb


In [18]:
# Load Cars24.csv (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv("Cars24.csv")

In [19]:
# Summary statistics (count, mean, std, quartiles, min/max) for three numeric columns.
df[['Model Year', 'Driven (Kms)', 'Ownership']].describe()

Unnamed: 0,Model Year,Driven (Kms),Ownership
count,5918.0,5918.0,5918.0
mean,2014.547651,60842.778979,1.285738
std,2.905185,42362.990292,0.53282
min,2007.0,179.0,1.0
25%,2012.0,30856.0,1.0
50%,2015.0,53514.0,1.0
75%,2017.0,81979.25,1.0
max,2021.0,912380.0,4.0


In [20]:
# Preview the first five rows, transposed so that each column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
Car Brand,Hyundai,Maruti,Maruti,Maruti,Tata
Model,EonERA PLUS,Wagon R 1.0LXI,Alto K10LXI,RitzVXI BS IV,NanoTWIST XTA
Price,330399,350199,229199,306399,208699
Model Year,2016,2011,2011,2011,2015
Location,Hyderabad,Hyderabad,Hyderabad,Hyderabad,Hyderabad
Fuel,Petrol,Petrol,Petrol,Petrol,Petrol
Driven (Kms),10674,20979,47330,19662,11256
Gear,Manual,Manual,Manual,Manual,Automatic
Ownership,2,1,2,1,1


In [21]:
# Number of missing values per column, together with the frame's (rows, columns) shape.
df.isna().sum(), df.shape

(Unnamed: 0         0
 Car Brand          0
 Model            265
 Price              0
 Model Year         0
 Location           0
 Fuel               0
 Driven (Kms)       0
 Gear             265
 Ownership          0
 EMI (monthly)      0
 dtype: int64,
 (5918, 11))

In [22]:
# Drop every row that contains at least one missing value. This mutates `df`
# in place, so the frame displayed by earlier cells no longer matches `df`.
df.dropna(inplace  = True)
# Re-check the null counts and shape after the drop.
df.isna().sum(), df.shape

(Unnamed: 0       0
 Car Brand        0
 Model            0
 Price            0
 Model Year       0
 Location         0
 Fuel             0
 Driven (Kms)     0
 Gear             0
 Ownership        0
 EMI (monthly)    0
 dtype: int64,
 (5653, 11))

In [23]:
# Print column dtypes, non-null counts and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5653 entries, 0 to 5917
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     5653 non-null   int64 
 1   Car Brand      5653 non-null   object
 2   Model          5653 non-null   object
 3   Price          5653 non-null   int64 
 4   Model Year     5653 non-null   int64 
 5   Location       5653 non-null   object
 6   Fuel           5653 non-null   object
 7   Driven (Kms)   5653 non-null   int64 
 8   Gear           5653 non-null   object
 9   Ownership      5653 non-null   int64 
 10  EMI (monthly)  5653 non-null   int64 
dtypes: int64(6), object(5)
memory usage: 530.0+ KB


In [24]:
# Remove the "Unnamed: 0" column (in the preview it simply mirrors the row index).
# An explicit membership check replaces the former blanket try/except: only the
# "column is already gone" case is tolerated, and any other failure now surfaces
# instead of being reported as 'Already deleted!!'.
if "Unnamed: 0" in df.columns:
    df.drop(columns = ["Unnamed: 0"], inplace = True)
else:
    print('Already deleted!!')

# Partition the remaining columns by dtype. Testing for "numeric" rather than
# comparing the dtype against "object" keeps text columns in `categorical` even
# when pandas stores them with a dedicated string dtype.
categorical, numerical = [], []
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        numerical.append(column)
    else:
        categorical.append(column)

print(categorical, numerical)

['Car Brand', 'Model', 'Location', 'Fuel', 'Gear'] ['Price', 'Model Year', 'Driven (Kms)', 'Ownership', 'EMI (monthly)']


In [25]:
# Report how many distinct values each categorical column holds.
for col, distinct_count in df[categorical].nunique().items():
    print(f'ColName[{col}]: Nuniq[{distinct_count}]')

ColName[Car Brand]: Nuniq[26]
ColName[Model]: Nuniq[902]
ColName[Location]: Nuniq[5]
ColName[Fuel]: Nuniq[5]
ColName[Gear]: Nuniq[2]


In [26]:
# Columns used for modelling. The last entry ("Price") is the target; later
# cells rely on that position when they slice `features[:-1]`.
features = ["Car Brand", "Fuel", "Gear", \
            "Model Year", "Driven (Kms)", \
            "Ownership", "Price"]

# For every non-numeric feature, build {category: rank}, where rank 0 is the
# most frequent category, rank 1 the next, and so on.
#
# The ranks are taken from the position in the value_counts() index rather than
# from a renamed "count" column: the name of that column differs between pandas
# versions, and relying on it can silently store raw counts instead of ranks.
# enumerate() also yields plain Python ints, so the result is JSON-serialisable.
mappings = {}
for feature in features:
    if pd.api.types.is_numeric_dtype(df[feature]):
        continue  # numeric columns are used as-is
    categories_by_frequency = df[feature].value_counts().index
    mappings[feature] = {
        category: rank
        for rank, category in enumerate(categories_by_frequency)
    }

In [27]:
# Build the modelling frame: keep only the selected columns and replace each
# categorical value with its integer code from `mappings`. `assign` returns a
# new frame, so `df` itself is left untouched.
df_new = df[features].assign(
    **{name: df[name].map(codes) for name, codes in mappings.items()}
)

In [28]:
# Separate the target (Price) from the predictors and hold out 10% of the rows
# for testing. `random_state` pins the shuffle so the split, and therefore every
# metric computed from it, is reproducible across kernel restarts.
X, Y = df_new.drop(columns = ["Price"]), df_new["Price"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle = True,\
                                                    test_size = 0.1,\
                                                    random_state = 42)

In [29]:
# Predictor columns that remain once the target has been dropped.
X_train.columns

Index(['Car Brand', 'Fuel', 'Gear', 'Model Year', 'Driven (Kms)', 'Ownership'], dtype='object')

In [30]:
# Standardise the predictors. The scaler learns its statistics from the training
# split only and the same transformation is then applied to the test split.
# Note: this rebinds X_train / X_test to the scaled arrays, so re-running the
# cell without re-running the split would scale already-scaled data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
# Wrap the scaled values back into DataFrames, restoring the predictor column
# names (`features` minus its final entry, the target). The new frames get a
# fresh 0..n-1 index.
X_train, X_test = (
    pd.DataFrame(scaled_values, columns = features[:-1])
    for scaled_values in (X_train, X_test)
)

# Peek at the first five scaled rows, one feature per line.
X_train.head().T

Unnamed: 0,0,1,2,3,4
Car Brand,0.045943,-0.259962,-0.565867,-0.565867,-0.565867
Fuel,1.07962,1.07962,-0.702927,-0.702927,-0.702927
Gear,-0.33707,-0.33707,-0.33707,-0.33707,-0.33707
Model Year,1.230392,0.18844,1.230392,-0.853511,-0.158877
Driven (Kms),-0.532841,0.386415,-0.908232,-1.180704,-0.248833
Ownership,-0.543193,1.335273,-0.543193,-0.543193,-0.543193


In [32]:
# Baseline: ordinary least squares on the scaled training data, evaluated on
# the same data (training error).
model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(Y_train, Y_pred)

148418.5123543139

In [33]:
# Fitted coefficients (one per predictor column) and the intercept of the current model.
model.coef_, model.intercept_

(array([ 78961.50201624,  54549.44246194,  80164.8483803 , 139118.64808597,
          9948.42107974,   2392.43639549]),
 np.float64(507321.4505602566))

## TRAIN ERROR (Ridge regression)

In [34]:
# Ridge regression (L2 penalty, alpha = 1e3) on the scaled training data,
# evaluated on the same data. This rebinds `model`, replacing the
# LinearRegression fitted earlier.
model = Ridge(alpha = 1e3)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_train)
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(Y_train, Y_pred)

146751.99895145994

In [35]:
# Fitted coefficients (one per predictor column) and the intercept of the current model.
model.coef_, model.intercept_

(array([ 69714.40431607,  48856.58584722,  70790.29255134, 113973.53652509,
          2073.14242633,  -2057.3976504 ]),
 np.float64(507321.45056025573))

## TEST ERROR (Ridge regression)

In [36]:
# Evaluate the current `model` on the held-out test split.
Y_pred_test = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
mean_absolute_error(Y_test, Y_pred_test)

134815.9002671979

In [37]:
# Change the kernel's working directory to ./checkpoint so that later relative
# paths resolve there. Not idempotent: a second run looks for checkpoint/checkpoint.
%cd checkpoint

/home/tirthankar-mittra/StreamlitApp/checkpoint


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [38]:
# Persist the inference artifacts into the current working directory.

# Fitted estimator and scaler are pickled.
for pickle_name, fitted_object in (("model.pkl", model), ("scaler.pkl", scaler)):
    with open(pickle_name, "wb") as pickle_file:
        pkl.dump(fitted_object, pickle_file)

# Category -> integer code lookup for each encoded feature.
with open("feature_map.json", "w") as map_file:
    json.dump(mappings, map_file)

# Per-column medians of df_new (encoded categoricals, numeric columns and Price).
with open("default_values.json", "w") as defaults_file:
    json.dump(df_new.median().to_dict(), defaults_file)