### Importing the Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

### Reading the Dataset

In [2]:
# Load the car listings from a CSV file located in the working directory.
car = pd.read_csv('car.csv')
# Bare last expression so the notebook renders the loaded frame.
car

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


### Using Sklearn for creating One Hot Vectors
Using sklearn's **LabelEncoder()** class, the three categories of **Car Model** are encoded as 3 integer labels:
- **Audi A5**: 0
- **BMW X5**: 1
- **Mercedez Benz C class**: 2

In [3]:
# Encoder used below to turn the 'Car Model' strings into integer labels.
label_encoder = LabelEncoder()

In [4]:
# Work on a copy: a plain `car_label_encoder = car` only aliases the raw frame,
# so writing the encoded column would also overwrite car['Car Model'] and make
# this cell non-idempotent (a re-run would refit the encoder on integers and
# lose the original string classes).
car_label_encoder = car.copy()
# Replace the model names with integer labels fitted on the raw strings.
car_label_encoder['Car Model'] = label_encoder.fit_transform(car['Car Model'])
car_label_encoder

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,1,69000,18000,6
1,1,35000,34000,3
2,1,57000,26100,5
3,1,22500,40000,2
4,1,46000,31500,4
5,0,59000,29400,5
6,0,52000,32000,5
7,0,72000,19300,6
8,0,91000,12000,8
9,2,67000,22000,6


Calculating the index of the column for which one hot vectors need to be generated.

In [5]:
# Integer position of the 'Car Model' column; used below to tell the
# ColumnTransformer which column to one-hot encode.
Car_Model_idx = car_label_encoder.columns.get_loc('Car Model')

Using sklearn's **ColumnTransformer()** class to one-hot encode the column at index **Car_Model_idx**, with sklearn's **OneHotEncoder()** as the transformer.

In [6]:
# One (name, transformer, columns) step: one-hot encode only the
# label-encoded 'Car Model' column. The name is an arbitrary label.
one_hot_step = ("OneHot", OneHotEncoder(), [Car_Model_idx])

# Columns not named in any step are passed through untouched.
transformer = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
)

Passing the label-encoded dataframe through this transformer to generate a numpy array with the required encodings.

In [7]:
# Fit the transformer on the label-encoded frame and apply it in one call;
# the one-hot columns come first, followed by the passthrough columns.
car_one_hot_encoded = transformer.fit_transform(car_label_encoder)

In [8]:
# Display the encoded array produced by the transformer.
car_one_hot_encoded

array([[0.00e+00, 1.00e+00, 0.00e+00, 6.90e+04, 1.80e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 3.50e+04, 3.40e+04, 3.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 5.70e+04, 2.61e+04, 5.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 2.25e+04, 4.00e+04, 2.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 4.60e+04, 3.15e+04, 4.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.90e+04, 2.94e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.20e+04, 3.20e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 7.20e+04, 1.93e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 9.10e+04, 1.20e+04, 8.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 6.70e+04, 2.20e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 8.30e+04, 2.00e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 7.90e+04, 2.10e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 5.90e+04, 3.30e+04, 5.00e+00]])

Passing the encoded numpy array into a pandas dataframe for better visual representation. To avoid the **Dummy Variable Trap** (the one-hot columns always sum to 1, so keeping all of them makes the features perfectly collinear), the Audi A5 column is dropped.

In [9]:
# Labels for the encoded array: the three one-hot columns come first (in label
# order 0, 1, 2), followed by the passthrough columns in their original order.
encoded_columns = ['Audi A5', 'BMW X5', 'Mercedez Benz C class', 'Mileage', 'Sell Price($)', 'Age(yrs)']

# Build the frame from the array in a single call instead of filling an empty
# frame column by column; the constructor also raises if the array width does
# not match the number of labels. 'Audi A5' is dropped as the reference
# category to avoid the dummy variable trap.
car_final = pd.DataFrame(car_one_hot_encoded, columns=encoded_columns).drop(columns=['Audi A5'])
car_final

Unnamed: 0,BMW X5,Mercedez Benz C class,Mileage,Sell Price($),Age(yrs)
0,1.0,0.0,69000.0,18000.0,6.0
1,1.0,0.0,35000.0,34000.0,3.0
2,1.0,0.0,57000.0,26100.0,5.0
3,1.0,0.0,22500.0,40000.0,2.0
4,1.0,0.0,46000.0,31500.0,4.0
5,0.0,0.0,59000.0,29400.0,5.0
6,0.0,0.0,52000.0,32000.0,5.0
7,0.0,0.0,72000.0,19300.0,6.0
8,0.0,0.0,91000.0,12000.0,8.0
9,0.0,1.0,67000.0,22000.0,6.0


### Splitting Dataset
The dataset is split into 2 parts - **Independent Variables** and **Dependent Variables** (To be predicted).

In [10]:
# Independent Variables
X = car_final.drop(['Sell Price($)'], axis='columns')

# Dependent Variables
y = car_final['Sell Price($)']

### Initializing Linear Regression Model

In [11]:
# Linear regression estimator with scikit-learn's default settings.
linReg = LinearRegression()

### Fitting the model 

In [12]:
# Fit on all rows of X and y; no hold-out split is made in the cells shown here.
linReg.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Exercise 1 -
#### Predict price of a mercedez benz that is 4 yr old with mileage 45000

In [13]:
# Build the query as a DataFrame labelled with X's columns so the feature order
# (BMW X5, Mercedez Benz C class, Mileage, Age(yrs)) is explicit and matches
# the names the model was fitted with; a bare nested list carries no names
# and triggers a feature-name warning on newer scikit-learn versions.
mercedes_query = pd.DataFrame([[0, 1, 45000, 4]], columns=X.columns)
linReg.predict(mercedes_query)

array([36991.31721062])

### Exercise 2 -
#### Predict price of a BMW X5 that is 7 yr old with mileage 86000

In [14]:
# Build the query as a DataFrame labelled with X's columns so the feature order
# (BMW X5, Mercedez Benz C class, Mileage, Age(yrs)) is explicit and matches
# the names the model was fitted with; a bare nested list carries no names
# and triggers a feature-name warning on newer scikit-learn versions.
bmw_query = pd.DataFrame([[1, 0, 86000, 7]], columns=X.columns)
linReg.predict(bmw_query)

array([11080.74313219])

### Exercise 3 -
#### Tell me the score (accuracy) of your model. (Hint: use LinearRegression().score())

In [15]:
# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy. It is evaluated here on the same X, y used for
# fitting, so it measures fit to the training data rather than generalisation.
linReg.score(X, y)

0.9417050937281082