### Scikit-learn:
- A very large library covering almost everything related to ML
- Typically used for machine learning models, preprocessing, embeddings, and mathematical or feature transformations

In [48]:
import pandas as pd
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score

In [28]:
# Read the car-price dataset from the project-relative Data/ folder
df = pd.read_csv('Data/CarPrice.csv')
# Preview the first three rows, transposed so each column reads as one line
df.head(3).transpose()

Unnamed: 0,0,1,2
car_ID,1,2,3
symboling,3,3,1
CarName,alfa-romero giulia,alfa-romero stelvio,alfa-romero Quadrifoglio
fueltype,gas,gas,gas
aspiration,std,std,std
doornumber,two,two,two
carbody,convertible,convertible,hatchback
drivewheel,rwd,rwd,rwd
enginelocation,front,front,front
wheelbase,88.6,88.6,94.5


In [17]:
# List every column name of the frame, one per line
for column_name in df.columns:
    print(column_name)

car_ID
symboling
CarName
fueltype
aspiration
doornumber
carbody
drivewheel
enginelocation
wheelbase
carlength
carwidth
carheight
curbweight
enginetype
cylindernumber
enginesize
fuelsystem
boreratio
stroke
compressionratio
horsepower
peakrpm
citympg
highwaympg
price


In [19]:
# Distinct values present in the fuel-type column
df['fueltype'].unique()

array(['gas', 'diesel'], dtype=object)

In [20]:
# Inspect the horsepower column
df['horsepower']

0      111
1      111
2      154
3      102
4      115
      ... 
200    114
201    160
202    134
203    106
204    114
Name: horsepower, Length: 205, dtype: int64

In [23]:
# Inspect the peak-RPM column
df['peakrpm']

0      5000
1      5000
2      5000
3      5500
4      5500
       ... 
200    5400
201    5300
202    5500
203    4800
204    5400
Name: peakrpm, Length: 205, dtype: int64

In [29]:
# Columns kept for modelling: the numeric measurements plus the price target
col_names = [
    'wheelbase',
    'carlength',
    'carwidth',
    'carheight',
    'curbweight',
    'enginesize',
    'boreratio',
    'stroke',
    'compressionratio',
    'horsepower',
    'peakrpm',
    'citympg',
    'highwaympg',
    'price',
]

# NOTE: this rebinds `df`, so columns outside col_names (e.g. fueltype) are no
# longer reachable through `df` after this cell has run.
df = df.loc[:, col_names]

In [31]:
# Number of missing values in each column
df.isnull().sum()

wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginesize          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [32]:
# Summary of the frame: per-column non-null counts and dtypes, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   wheelbase         205 non-null    float64
 1   carlength         205 non-null    float64
 2   carwidth          205 non-null    float64
 3   carheight         205 non-null    float64
 4   curbweight        205 non-null    int64  
 5   enginesize        205 non-null    int64  
 6   boreratio         205 non-null    float64
 7   stroke            205 non-null    float64
 8   compressionratio  205 non-null    float64
 9   horsepower        205 non-null    int64  
 10  peakrpm           205 non-null    int64  
 11  citympg           205 non-null    int64  
 12  highwaympg        205 non-null    int64  
 13  price             205 non-null    float64
dtypes: float64(8), int64(6)
memory usage: 22.5 KB


In [35]:
# Target kept as a one-element list, so df[target_column] yields a DataFrame
# rather than a Series.
target_column = ['price']
# Every other column is an input feature. The resulting Index is displayed in
# alphabetical order, not in the frame's original column order.
input_cols = df.columns.difference(target_column)

In [37]:
# Display the list holding the target column name
target_column

['price']

In [36]:
# Display the Index of input feature names
input_cols

Index(['boreratio', 'carheight', 'carlength', 'carwidth', 'citympg',
       'compressionratio', 'curbweight', 'enginesize', 'highwaympg',
       'horsepower', 'peakrpm', 'stroke', 'wheelbase'],
      dtype='object')

In [39]:
# Split the frame into the input feature matrix (X) and the target (y)
X, y = df[input_cols], df[target_column]

In [41]:
# (rows, columns) of the feature matrix and of the target, in that order
tuple(part.shape for part in (X, y))

((205, 13), (205, 1))

## Model building

In [42]:
# Create the LinearRegression estimator with its default parameters (not yet fitted)
lin_reg_model = LinearRegression()

In [43]:
# Train the model on all rows of X and y (no rows are held out here)
lin_reg_model.fit(X, y)

In [52]:
# Preview the first rows of the slice used as the test set.
# X is sliced directly because X_test is only assigned in a later cell; reading
# X_test here raises NameError when the notebook is run top to bottom on a
# fresh kernel.
X.iloc[10:35].head(3)

Unnamed: 0,boreratio,carheight,carlength,carwidth,citympg,compressionratio,curbweight,enginesize,highwaympg,horsepower,peakrpm,stroke,wheelbase
10,3.5,54.3,176.8,64.8,23,8.8,2395,108,29,101,5800,2.8,101.2
11,3.5,54.3,176.8,64.8,23,8.8,2395,108,29,101,5800,2.8,101.2
12,3.31,54.3,176.8,64.8,21,9.0,2710,164,28,121,4250,3.19,101.2


In [53]:
# Build the evaluation subset: rows 10-34 of the features and of the target.
# NOTE(review): these rows are a slice of the same X and y the model was fitted
# on, so scores computed from them measure in-sample fit rather than
# performance on unseen data.
X_test = X.iloc[10:35]
y_test = y.iloc[10:35]

# Predict the target for the evaluation rows with the trained model
y_pred = lin_reg_model.predict(X_test)

##### Original answers: `y_test`
##### Predicted answers: `y_pred`

- Evaluation: compare the original answers with the predicted answers to see how good the model is at making predictions

In [56]:
lin_reg_model.score(X_test, y_test)  # R^2 score of the model on the evaluation rows

0.8225790245542736

In [55]:
# Evaluate the model: R^2 score computed from the stored predictions
r2_score(y_true=y_test, y_pred=y_pred)

0.8225790245542736