# DSO106 MachineLearn L2 Hands On

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans

# Load in data

In [20]:
# Load the 'mpg' example dataset through seaborn.
Mpg = sns.load_dataset(name='mpg')

In [21]:
# Preview the first five rows of the raw data.
Mpg.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


# Data wrangling

In [22]:
# Leave out the two text columns ('origin' and 'name') and keep the rest.
MpgTrimmed = Mpg.drop(columns=['origin', 'name'])
MpgTrimmed.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


### Drop the NAs

In [23]:
# Drop rows containing missing values. Reassign the result instead of using
# inplace=True so the frame is not mutated behind an existing reference.
MpgTrimmed = MpgTrimmed.dropna()

### Convert everything to integers

In [24]:
# Check column dtypes and non-null counts before casting to integers.
MpgTrimmed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
dtypes: float64(4), int64(3)
memory usage: 24.5 KB


In [25]:
# Cast the four float columns to integers with a single astype mapping
# rather than four attribute-style column assignments.
# NOTE: astype(int) truncates the fractional part instead of rounding
# (e.g. an acceleration of 11.5 becomes 11).
MpgTrimmed = MpgTrimmed.astype({
    'mpg': int,
    'displacement': int,
    'horsepower': int,
    'acceleration': int,
})

In [26]:
# Re-inspect the frame to confirm every column now has an integer dtype.
MpgTrimmed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   mpg           392 non-null    int32
 1   cylinders     392 non-null    int64
 2   displacement  392 non-null    int32
 3   horsepower    392 non-null    int32
 4   weight        392 non-null    int64
 5   acceleration  392 non-null    int32
 6   model_year    392 non-null    int64
dtypes: int32(4), int64(3)
memory usage: 18.4 KB


# k-means clustering

### 2 clusters

In [27]:
# Fit k-means with two clusters.
# random_state pins the centroid initialisation so the cluster labels (0/1)
# are reproducible across runs; n_init is set explicitly because its default
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

# Exclude any previously attached 'Group' column so that re-running this cell
# after the labels were assigned does not feed the old labels back in as a
# feature. errors='ignore' makes this a no-op on the first run.
# NOTE(review): the features are clustered unscaled; weight has by far the
# largest magnitude in the rows shown, so it likely dominates the distance
# computation — consider standardising first (StandardScaler is imported).
kmeans.fit(MpgTrimmed.drop(columns='Group', errors='ignore'))

KMeans(n_clusters=2)

In [29]:
# Attach each row's fitted cluster label as a 'Group' column and preview it.
MpgTrimmed = MpgTrimmed.assign(Group=kmeans.labels_)
MpgTrimmed.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,Group
0,18,8,307,130,3504,12,70,1
1,15,8,350,165,3693,11,70,1
2,18,8,318,150,3436,11,70,1
3,16,8,304,150,3433,12,70,1
4,17,8,302,140,3449,10,70,1


In [30]:
# Average every feature within each cluster to compare the two groups.
MpgTrimmed.groupby(by='Group').agg('mean')

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,27.889831,4.305085,123.521186,82.59322,2381.381356,15.813559,76.783898
1,16.314103,7.237179,301.653846,137.564103,3879.532051,14.237179,74.762821


## Using 2 clusters, the output above shows that Group 0 has the higher average mpg, acceleration, and model_year, while Group 1 has the higher average cylinders, displacement, horsepower, and weight.