In [21]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import r2_score

In [3]:
data = pd.read_csv('Data/CarPrice.csv')

In [5]:
data.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [7]:
selected_columns = ['horsepower', 'citympg', 'compressionratio', 'fueltype', 'price']
df = data[selected_columns]

In [8]:
df.head(3)

Unnamed: 0,horsepower,citympg,compressionratio,fueltype,price
0,111,21,9.0,gas,13495.0
1,111,21,9.0,gas,16500.0
2,154,19,9.0,gas,16500.0


In [9]:
df.isna().sum()

horsepower          0
citympg             0
compressionratio    0
fueltype            0
price               0
dtype: int64

In [10]:
df.fueltype.value_counts()

gas       185
diesel     20
Name: fueltype, dtype: int64

In [12]:
df['fueltype'] = df.fueltype.replace({'gas':1, 'diesel':2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fueltype'] = df.fueltype.replace({'gas':1, 'diesel':2})


In [13]:
df.head(3)

Unnamed: 0,horsepower,citympg,compressionratio,fueltype,price
0,111,21,9.0,1,13495.0
1,111,21,9.0,1,16500.0
2,154,19,9.0,1,16500.0


In [17]:
x, y = df.drop(columns = ['price']), df[['price']]

In [18]:
x.shape

(205, 4)

In [23]:
x_tr, x_ts, y_tr, y_ts = train_test_split(x,y, test_size = 0.35, random_state=64)

# LinearRegression

In [24]:
lin_reg = LinearRegression()

lin_reg.fit(x_tr, y_tr)

y_pred = lin_reg.predict(x_ts)

r2_score(y_ts, y_pred)

0.732721781672701

In [25]:
lin_reg.coef_

array([[  134.50924641,  -302.01458093,   754.78222398, -3360.50739432]])

# Ridge Regression

In [26]:
ridge_reg = Ridge(alpha = 1.3)

ridge_reg.fit(x_tr, y_tr)

y_pred_r = ridge_reg.predict(x_ts)

r2_score(y_ts, y_pred_r)

0.7335992647360231

In [27]:
ridge_reg.coef_

array([[ 135.2486452 , -289.18857348,  547.68190439, -606.73743057]])

# Lasso Regression

In [30]:
lasso_reg = Lasso(alpha = 10)

lasso_reg.fit(x_tr, y_tr)

y_pred_l = lasso_reg.predict(x_ts)

r2_score(y_ts, y_pred_l)

0.7337086552598483

In [31]:
lasso_reg.coef_

array([ 135.48769809, -285.73139968,  501.61906409,   -0.        ])

# Changing the data as per the feature coeff by lasso

In [32]:
x, y = df.drop(columns = ['fueltype', 'price']), df[['price']]
x_tr, x_ts, y_tr, y_ts = train_test_split(x,y, test_size = 0.35, random_state=64)

# LinearRegression

In [33]:
lin_reg = LinearRegression()

lin_reg.fit(x_tr, y_tr)

y_pred = lin_reg.predict(x_ts)

r2_score(y_ts, y_pred)

0.7337635403889695

In [34]:
lin_reg.coef_

array([[ 135.36632923, -286.80561467,  502.62983345]])

# Ridge Regression

In [35]:
ridge_reg = Ridge(alpha = 1.3)

ridge_reg.fit(x_tr, y_tr)

y_pred_r = ridge_reg.predict(x_ts)

r2_score(y_ts, y_pred_r)

0.7337369032054581

In [36]:
ridge_reg.coef_

array([[ 135.40236996, -286.45658565,  502.19000646]])

# Lasso Regression

In [37]:
lasso_reg = Lasso(alpha = 10)

lasso_reg.fit(x_tr, y_tr)

y_pred_l = lasso_reg.predict(x_ts)

r2_score(y_ts, y_pred_l)

0.7337086552598484

In [38]:
lasso_reg.coef_

array([ 135.48769809, -285.73139968,  501.61906409])