In [1]:
# import important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import file from external storage
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
# drop unusable column from dataframe
df.drop('car name', axis = 1, inplace = True)

In [4]:
# dataframe after drop unusable columns
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [5]:
# check null values
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
dtype: int64

In [6]:
# check info of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 25.0+ KB


**In Our Data Frame Horsepower Column Has String Data Type But It Should Be Integer Data Type So We Have To Convert It**

In [7]:
# check unique values for horsepower column
df.horsepower.unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [8]:
# remove uncommon data from horsepower column
df = df[df['horsepower'] != '?']

In [9]:
# convert data type of horsepower column
df['horsepower'] = df['horsepower'].astype(int)

In [10]:
# check info after converting
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int32  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
dtypes: float64(3), int32(1), int64(4)
memory usage: 26.0 KB


In [11]:
# data frame shape after cleaning
df.shape

(392, 8)

In [12]:
# check description of data and check there is any outliers or not
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,1.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


**In Our Data Frame There Is No Outliers So Our Data Frame Is Now Very Clean**

# Start Model Building

In [13]:
# create X and y
X = df.drop('mpg',axis = 1)
y = df['mpg']

In [14]:
# import train test split for split data into train and test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [22]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [15]:
# import KNN regressor for train model
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()

In [25]:
knr.fit(X_train,y_train)

KNeighborsRegressor()

In [26]:
y_pred = knr.predict(X_test)

In [19]:
# import r2 score and mean squared error for check accuracy of our model
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

In [27]:
print('R 2 Square:', r2_score(y_test,y_pred))
print('Mean Squared Error:', mean_squared_error(y_test,y_pred))

R 2 Square: 0.837828423214193
Mean Squared Error: 10.75239493670886


# Check Cross Val Score With Diff.-Diff. K Value

In [31]:
from sklearn.model_selection import cross_val_score
k_value = [1,2,3,4,5,6,7,8,9,10,20]
for i in k_value:
    knr = KNeighborsRegressor(i)
    knr.fit(X_train,y_train)
    train_acc = r2_score(y_train,knr.predict(X_train))
    val_acc = cross_val_score(knr,X_train,y_train,cv = 10,scoring=make_scorer(r2_score))
    print('K Value:',i,'Train Accuracy:',train_acc,'VAl Accuracy:', np.mean(val_acc))
    

K Value: 1 Train Accuracy: 1.0 VAl Accuracy: 0.7726659805513654
K Value: 2 Train Accuracy: 0.9507627893108449 VAl Accuracy: 0.7910045179416063
K Value: 3 Train Accuracy: 0.9182373091571079 VAl Accuracy: 0.813292825518003
K Value: 4 Train Accuracy: 0.9050822994773028 VAl Accuracy: 0.8326148619653377
K Value: 5 Train Accuracy: 0.9020239290229325 VAl Accuracy: 0.8347407929282034
K Value: 6 Train Accuracy: 0.89751216563817 VAl Accuracy: 0.8337694148931265
K Value: 7 Train Accuracy: 0.8933784794944069 VAl Accuracy: 0.8380352178664012
K Value: 8 Train Accuracy: 0.8872159134969619 VAl Accuracy: 0.8367965925188324
K Value: 9 Train Accuracy: 0.8826824690780392 VAl Accuracy: 0.836941141063152
K Value: 10 Train Accuracy: 0.8784242511613056 VAl Accuracy: 0.8391237004706369
K Value: 20 Train Accuracy: 0.8539179759572824 VAl Accuracy: 0.8251795567438215
