# Importing all necessary libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder  

## 1. Problem statement

## 2. Data Gatharing

In [2]:
dataset = pd.read_csv("autos.csv")
dataset.head()

Unnamed: 0,symbol,loss,make,fuel,aspir,doors,style,drive,eng_loc,wb,...,eng_cc,fuel.sys,bore,stroke,comp.ratio,hp,rpm,city_mpg,hw_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [3]:
dataset.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,195,196,197,198,199,200,201,202,203,204
symbol,3,3,1,2,2,2,1,1,1,0,...,-1,-2,-1,-2,-1,-1,-1,-1,-1,-1
loss,,,,164.0,164.0,,158.0,,158.0,,...,74.0,103.0,74.0,103.0,74.0,95.0,95.0,95.0,95.0,95.0
make,alfa-romero,alfa-romero,alfa-romero,audi,audi,audi,audi,audi,audi,audi,...,volvo,volvo,volvo,volvo,volvo,volvo,volvo,volvo,volvo,volvo
fuel,gas,gas,gas,gas,gas,gas,gas,gas,gas,gas,...,gas,gas,gas,gas,gas,gas,gas,gas,diesel,gas
aspir,std,std,std,std,std,std,std,std,turbo,turbo,...,std,std,std,turbo,turbo,std,turbo,std,turbo,turbo
doors,two,two,two,four,four,two,four,four,four,two,...,four,four,four,four,four,four,four,four,four,four
style,convertible,convertible,hatchback,sedan,sedan,sedan,sedan,wagon,sedan,hatchback,...,wagon,sedan,wagon,sedan,wagon,sedan,sedan,sedan,sedan,sedan
drive,rwd,rwd,rwd,fwd,4wd,fwd,fwd,fwd,fwd,4wd,...,rwd,rwd,rwd,rwd,rwd,rwd,rwd,rwd,rwd,rwd
eng_loc,front,front,front,front,front,front,front,front,front,front,...,front,front,front,front,front,front,front,front,front,front
wb,88.6,88.6,94.5,99.8,99.4,99.8,105.8,105.8,105.8,99.5,...,104.3,104.3,104.3,104.3,104.3,109.1,109.1,109.1,109.1,109.1


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   symbol      205 non-null    int64  
 1   loss        164 non-null    float64
 2   make        205 non-null    object 
 3   fuel        205 non-null    object 
 4   aspir       205 non-null    object 
 5   doors       203 non-null    object 
 6   style       205 non-null    object 
 7   drive       205 non-null    object 
 8   eng_loc     205 non-null    object 
 9   wb          205 non-null    float64
 10  length      205 non-null    float64
 11  width       205 non-null    float64
 12  height      205 non-null    float64
 13  weight      205 non-null    int64  
 14  eng_type    205 non-null    object 
 15  cylinders   205 non-null    object 
 16  eng_cc      205 non-null    int64  
 17  fuel.sys    205 non-null    object 
 18  bore        201 non-null    float64
 19  stroke      201 non-null    f

## LableEncoding

In [5]:
dataset["cylinders"].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: cylinders, dtype: int64

In [6]:
dataset["cylinders"].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [7]:
sorted(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'])

['eight', 'five', 'four', 'six', 'three', 'twelve', 'two']

In [8]:
data = pd.DataFrame({"Test" : ['four', 'six', 'five', 'three', 'twelve', 'two', 'eight']})

In [9]:
data["Test"].replace({'four':4, 'six':6, 'five':5, 'three':3, 'twelve':12, 'two':2, 'eight':8})

0     4
1     6
2     5
3     3
4    12
5     2
6     8
Name: Test, dtype: int64

In [10]:
lableEnc = LabelEncoder()
dataset.cylinders = lableEnc.fit_transform(dataset.cylinders)

In [11]:
dataset.cylinders.head()

0    2
1    2
2    3
3    2
4    1
Name: cylinders, dtype: int32

In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   symbol      205 non-null    int64  
 1   loss        164 non-null    float64
 2   make        205 non-null    object 
 3   fuel        205 non-null    object 
 4   aspir       205 non-null    object 
 5   doors       203 non-null    object 
 6   style       205 non-null    object 
 7   drive       205 non-null    object 
 8   eng_loc     205 non-null    object 
 9   wb          205 non-null    float64
 10  length      205 non-null    float64
 11  width       205 non-null    float64
 12  height      205 non-null    float64
 13  weight      205 non-null    int64  
 14  eng_type    205 non-null    object 
 15  cylinders   205 non-null    int32  
 16  eng_cc      205 non-null    int64  
 17  fuel.sys    205 non-null    object 
 18  bore        201 non-null    float64
 19  stroke      201 non-null    f

## OneHotEncoding

In [13]:
Test = ['Low','Medium','Medium','High','High','Low']
df = pd.DataFrame({"Test":Test})
df

Unnamed: 0,Test
0,Low
1,Medium
2,Medium
3,High
4,High
5,Low


In [14]:
pd.get_dummies(df,drop_first=True)

Unnamed: 0,Test_Low,Test_Medium
0,1,0
1,0,1
2,0,1
3,0,0
4,0,0
5,1,0


In [15]:
dataset.fuel.value_counts()

gas       185
diesel     20
Name: fuel, dtype: int64

In [16]:
pd.get_dummies(dataset.fuel,drop_first=True)

Unnamed: 0,gas
0,1
1,1
2,1
3,1
4,1
...,...
200,1
201,1
202,1
203,0


In [17]:
dataset.fuel = pd.get_dummies(dataset.fuel,drop_first=True)

In [18]:
dataset.head()

Unnamed: 0,symbol,loss,make,fuel,aspir,doors,style,drive,eng_loc,wb,...,eng_cc,fuel.sys,bore,stroke,comp.ratio,hp,rpm,city_mpg,hw_mpg,price
0,3,,alfa-romero,1,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,1,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,1,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,1,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,1,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [19]:
dataset.aspir.value_counts()

std      168
turbo     37
Name: aspir, dtype: int64

In [24]:
transformer = make_column_transformer(
    (OneHotEncoder(), ['aspir']),
    remainder='passthrough')

NameError: name 'make_column_transformer' is not defined

In [None]:
dataset.head()

In [None]:
dataset.aspir

## 3. EDA (Exploratory Data Analysis)