In [46]:
# File for machine learning model - Lasso Regression


import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Identifier of this model run; it is interpolated into the evaluation CSV name.
dataname = "lasso_regression"

# Input CSV splits. The directory name is spelled "Vehice" in the path itself,
# so it is kept verbatim here.
TRAIN_PATH = "../../Datasets/Vehice dataset/Downsampled/Train/train.csv"
VALID_PATH = "../../Datasets/Vehice dataset/Downsampled/Valid/valid.csv"
TEST_PATH = "../../Datasets/Vehice dataset/Downsampled/Test/test_sampled.csv"

# Destination of the evaluation frame written by the last cell.
OUTPUT_PATH = f"../../Datasets/Evaluations/Regression/{dataname}.csv"




In [47]:
# Read the test split and discard the 'prediction' and 'prompt' columns it
# carries, so the frame only holds the raw vehicle attributes.
df_test = pd.read_csv(TEST_PATH).drop(columns=['prediction', 'prompt'])

# The training split is used exactly as loaded; the equivalent column drop
# for it is intentionally not applied in this notebook.
df_train = pd.read_csv(TRAIN_PATH)


In [48]:
def pre_process(df, reference_year=2020):
    """Return a cleaned copy of a raw vehicle frame ready for modelling.

    The following transformations are applied:

    * ``year`` is replaced by ``reference_year - year``.
    * ``mileage`` (suffixes ``' kmpl'`` / ``' km/kg'``), ``engine``
      (suffix ``' CC'``) and ``max_power`` (suffix ``' bhp'``) have their unit
      suffix stripped and are converted to numbers.
    * Every row that contains a missing value in any column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least the columns ``year``, ``mileage``, ``engine``
        and ``max_power``; the last three must hold strings.
    reference_year : int, default 2020
        Year the ``year`` column is subtracted from.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left untouched.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    cleaned = df.copy()
    cleaned['year'] = reference_year - cleaned['year']

    unit_suffixes = {
        'mileage': (' kmpl', ' km/kg'),
        'engine': (' CC',),
        'max_power': (' bhp',),
    }
    for column, suffixes in unit_suffixes.items():
        values = cleaned[column]
        for suffix in suffixes:
            values = values.str.replace(suffix, '', regex=False)
        # Values that still are not numeric after stripping the unit become
        # NaN and are removed by the dropna below, instead of aborting the
        # whole conversion with a ValueError.
        cleaned[column] = pd.to_numeric(values, errors='coerce')

    return cleaned.dropna(how='any')

# Rebind both frames to their cleaned versions. pre_process uses the `.str`
# accessor on 'mileage', 'engine' and 'max_power', i.e. it expects the raw
# string columns, so re-running this cell requires re-running the load cell first.
df_train = pre_process(df_train)
df_test = pre_process(df_test)



In [49]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso

# Separate the regression target ('selling_price') from the features.
X_train = df_train.drop('selling_price', axis=1)
y_train = df_train['selling_price']
X_test = df_test.drop('selling_price', axis=1)
y_test = df_test['selling_price']

categorical_features = ['name', 'fuel', 'seller_type', 'transmission', 'owner', 'torque']

# One-hot encode the categorical columns; every other column is passed through
# unchanged and appended after the encoded ones. Categories that only occur in
# the test split are encoded as all zeros (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough'
)

# Fit the encoder on the training split only, then reuse it for the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# NOTE(review): the recorded output of this cell contains a warning raised from
# the coordinate-descent solver (presumably a ConvergenceWarning). Raising
# `max_iter` or scaling the numeric columns would likely address it, but would
# also change the reported metrics, so the model settings are left as they were.
model = Lasso(alpha=1.0, random_state=42)
model.fit(X_train_transformed, y_train)

y_train_pred = model.predict(X_train_transformed)
y_test_pred = model.predict(X_test_transformed)

# RMSE is computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Training RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")
print(f"Training R^2: {train_r2}")
print(f"Test R^2: {test_r2}")



Training RMSE: 75815.86846722524
Test RMSE: 229249.8101528772
Training R^2: 0.9913595941959932
Test R^2: 0.8996625978064465


  model = cd_fast.sparse_enet_coordinate_descent(


In [50]:
# Store the test-set predictions next to the rows they were computed from.
# y_test_pred comes from X_test, which was derived from df_test, so the values
# line up with df_test row by row.
df_test['prediction'] = y_test_pred

In [51]:
import pandas as pd
import numpy as np

# Columns that are not one-hot encoded. The ColumnTransformer passes them
# through and appends them after the encoded columns, in the order in which
# they appear in X_train.
numerical_features = ['year', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

# Guard the name <-> coefficient pairing below: a wrong or reordered list would
# otherwise silently attach coefficients to the wrong feature names.
passthrough_columns = [column for column in X_train.columns if column not in categorical_features]
if passthrough_columns != numerical_features:
    raise ValueError(
        f"numerical_features {numerical_features} does not match the passthrough "
        f"columns of X_train {passthrough_columns}"
    )

categorical_feature_names = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_features)
all_feature_names = np.concatenate((categorical_feature_names, numerical_features))

coefficients = model.coef_

# zip() stops at the shorter input, so a length mismatch must be caught here.
if len(all_feature_names) != len(coefficients):
    raise ValueError(
        f"Got {len(all_feature_names)} feature names for {len(coefficients)} model coefficients"
    )

# Maps every transformed feature name (e.g. 'fuel_Diesel', 'year') to its
# Lasso coefficient.
feature_weights = dict(zip(all_feature_names, coefficients))

def get_categorical_weights(row, feature_names, feature_weights):
    """Describe the coefficient of each categorical value found in ``row``.

    For every column listed in the module-level ``categorical_features`` the
    coefficient is looked up in ``feature_weights`` under the key
    ``"<column>_<value>"``; a missing key yields a weight of 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record indexable by the categorical column names.
    feature_names
        Accepted for interface compatibility; not used by this function.
    feature_weights : dict
        Maps encoded feature names to coefficients.

    Returns
    -------
    str
        A header line followed by one newline-terminated line per column.
    """
    report_lines = ["Categorical feature coefficients from Lasso Regression:\n"]
    for column in categorical_features:
        category = row[column]
        coefficient = feature_weights.get(f"{column}_{category}", 0)
        report_lines.append(f"{column}: '{category}', weight: {coefficient}\n")
    return "".join(report_lines)

def add_feature_weights(row, numerical_features, categorical_feature_names, feature_weights):
    """Build the full coefficient report for a single row.

    The report starts with the coefficient of every name in
    ``numerical_features`` (the same for all rows), followed by the
    row-specific categorical section produced by ``get_categorical_weights``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record forwarded to ``get_categorical_weights``.
    numerical_features : iterable of str
        Names looked up directly in ``feature_weights``; a missing name
        raises ``KeyError``.
    categorical_feature_names
        Forwarded unchanged to ``get_categorical_weights``.
    feature_weights : dict
        Maps feature names to coefficients.

    Returns
    -------
    str
        The numerical section, a newline, then the categorical section.
    """
    numeric_lines = (f"{name}: {feature_weights[name]}" for name in numerical_features)
    numeric_section = "Numerical feature coefficients from Lasso Regression:\n" + "\n".join(numeric_lines)

    categorical_section = get_categorical_weights(row, categorical_feature_names, feature_weights)

    return f"{numeric_section}\n{categorical_section}"

# Attach a human-readable coefficient report to every test row.
df_test['feature_weights'] = df_test.apply(
    add_feature_weights,
    axis=1,
    numerical_features=numerical_features,
    categorical_feature_names=categorical_feature_names,
    feature_weights=feature_weights,
)

# Show the report of the second row as a sample. Positional access is used
# because pre_process drops rows without resetting the index, so the label 1
# is not guaranteed to exist and `[1]` could raise a KeyError.
print(df_test['feature_weights'].iloc[1])

Numerical feature coefficients from Lasso Regression:
year: -31103.119270615327
km_driven: -0.16451002827316483
mileage: 5260.349728608884
engine: 181.81024255383468
max_power: 3621.43811732179
seats: -27557.451975485492
Categorical feature coefficients from Lasso Regression:
name: 'Mahindra Scorpio VLX 2WD AIRBAG BSIII', weight: 119211.59386669246
fuel: 'Diesel', weight: 94793.5538955685
seller_type: 'Individual', weight: 11514.651168594737
transmission: 'Manual', weight: -1.1126889289197336e-06
owner: 'First Owner', weight: 27776.354307328493
torque: '290Nm@ 1800-2800rpm', weight: -132809.01702756013



In [52]:
# Preview the first rows. A bare last expression uses the notebook's rich table
# display, whereas print() wraps the wide frame across several text blocks.
df_test.head()

                                    name  year  selling_price  km_driven  \
0     Ford Figo Aspire 1.5 TDCi Titanium     3         670000      70000   
1  Mahindra Scorpio VLX 2WD AIRBAG BSIII     8         525000     120000   
2                 Maruti Swift Dzire VDI     6         438999      81000   
3              Ford Figo Diesel Titanium    10         144000      50000   
4                 Hyundai i10 Magna 1.1L    12         185000     110000   

     fuel seller_type transmission                 owner  mileage  engine  \
0  Diesel  Individual       Manual           First Owner    25.83  1498.0   
1  Diesel  Individual       Manual           First Owner    12.05  2179.0   
2  Diesel      Dealer       Manual           First Owner    23.40  1248.0   
3  Diesel  Individual       Manual          Second Owner    20.00  1399.0   
4  Petrol  Individual       Manual  Fourth & Above Owner    19.81  1086.0   

   max_power               torque  seats     prediction  \
0      99.00  215Nm@ 

In [None]:
from pathlib import Path

# Create the destination directory first: to_csv does not create missing
# parent directories and would fail if the evaluation folder does not exist.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

# Persist the test rows together with their predictions and coefficient reports.
df_test.to_csv(OUTPUT_PATH, index=False)
