<a href="https://colab.research.google.com/github/xoro-o/colab--11/blob/main/predicting_mpg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

In [3]:
column_names = ['MPG','Cylinders','Displacement','Horsepower','Weight','Accelaration','Model Year','Origin']
df = pd.read_csv(url,names=column_names,na_values = "?",comment='\t',sep =" ",skipinitialspace=True)
df.shape

(398, 8)

In [6]:
# droppingthe na values rows

df = df.dropna()
df = df.reset_index(drop = True)#reset the index after dropping rows
df.shape # was to check any dropped rows

(392, 8)

In [8]:
## test train splits
import sklearn
from sklearn.model_selection import train_test_split  
df_train,df_test = train_test_split(df,train_size = 0.8,random_state = 1)

In [16]:
train_stats = df_train.describe().transpose()
df_train.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Accelaration,Model Year,Origin
count,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0
mean,23.404153,5.402556,189.51278,102.929712,2961.198083,15.704473,75.929712,1.591054
std,7.666909,1.701506,102.675646,37.919046,848.602146,2.725399,3.675305,0.807923
min,9.0,3.0,68.0,46.0,1613.0,8.5,70.0,1.0
25%,17.5,4.0,104.0,75.0,2219.0,14.0,73.0,1.0
50%,23.0,4.0,140.0,92.0,2755.0,15.5,76.0,1.0
75%,29.0,8.0,260.0,120.0,3574.0,17.3,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [12]:
numeric_cnames = ['Cylinders','Displacement','Horsepower','Weight','Accelaration']
df_train_norm,df_test_norm = df_train.copy(),df_test.copy()

In [13]:
for cname in numeric_cnames:
  mean = train_stats.loc[cname,'mean']
  std = train_stats.loc[cname,'std']
  df_train_norm.loc[:,cname] = (df_train_norm.loc[:,cname] - mean)/std
  df_test_norm.loc[:,cname] = (df_test_norm.loc[:,cname]-mean)/std
  

In [14]:
#df_train_norm.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Accelaration,Model Year,Origin
count,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0
mean,23.404153,7.306899000000001e-17,1.276934e-16,3.050453e-17,-1.184711e-16,2.272233e-15,75.929712,1.591054
std,7.666909,1.0,1.0,1.0,1.0,1.0,3.675305,0.807923
min,9.0,-1.412018,-1.183463,-1.501349,-1.588728,-2.643456,70.0,1.0
25%,17.5,-0.8243028,-0.8328438,-0.7365616,-0.8746125,-0.625403,73.0,1.0
50%,23.0,-0.8243028,-0.4822252,-0.2882381,-0.2429856,-0.07502492,76.0,1.0
75%,29.0,1.526556,0.6865038,0.4501771,0.722131,0.5854288,79.0,2.0
max,46.6,1.526556,2.585688,3.351094,2.567519,3.337319,82.0,3.0


In [17]:
#to check normalization
df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Accelaration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


In [19]:
import torch
boundaries = torch.tensor([73,76,79])
v = torch.tensor(df_train_norm['Model Year'].values)
df_train_norm['Model Year Bucketed'] = torch.bucketize(v,boundaries,right=True)
v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(v,boundaries,right=True)

In [20]:
numeric_cnames.append('Model Year Bucketed')

In [28]:
# preprocessing X input,test
from torch.nn.functional import one_hot
total_origin = len(set(df_train_norm['Origin'].values))

origin_encoded = one_hot(torch.from_numpy(df_train_norm['Origin'].values)% total_origin)
x_train_numeric = torch.tensor(df_train_norm[numeric_cnames].values)
x_train = torch.cat([x_train_numeric,origin_encoded],1).float()

origin_encoded = one_hot(torch.from_numpy(df_test_norm['Origin'].values)% total_origin)
x_test_numeric = torch.tensor(df_test_norm[numeric_cnames].values)
x_test = torch.cat([x_test_numeric,origin_encoded],1).float()


In [65]:
# preprocessing Y train,test
y_train = torch.tensor(df_train_norm['MPG'].values).float() ########### NEED TO CONVERT TO FLOAT!
y_test = torch.tensor(df_test_norm['MPG'].values)         ################# NEED TO CONVERT TO FLOAT!

In [66]:
# creating dataloader
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
train_ds = TensorDataset(x_train,y_train)
batch_size = 8
torch.manual_seed(1)
train_dl = DataLoader(train_ds,batch_size,shuffle = True)


In [67]:
# building the model using sequential
import torch.nn as nn
hidden_units = [8,4]
input_size = x_train.shape[1]
all_layers = []
for hidden_unit in hidden_units:
  layer = nn.Linear(input_size,hidden_unit)
  all_layers.append(layer)
  all_layers.append(nn.ReLU())
  input_size = hidden_unit
layer = nn.Linear(hidden_units[-1],1)
all_layers.append(layer)
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [71]:
# defining the loss and optimizer
loss_fn  = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(),lr = 0.001)
torch.manual_seed(1)
num_epochs = 200
log_epochs = 20


In [72]:
# training the model and printing loss values after every 20 epochs
for epoch in range(num_epochs):
  for x_batch,y_batch in train_dl:
    pred = model(x_batch)[:,0].float()
    loss = loss_fn(pred,y_batch).float()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  if(epoch % log_epochs ==0):
    print(f'Epoch {epoch} Loss : {loss.item():.4f} ')


Epoch 0 Loss : 0.2462 
Epoch 20 Loss : 0.2773 
Epoch 40 Loss : 5.1781 
Epoch 60 Loss : 0.2258 
Epoch 80 Loss : 0.0145 
Epoch 100 Loss : 6.6759 
Epoch 120 Loss : 5.1065 
Epoch 140 Loss : 14.8403 
Epoch 160 Loss : 2.8786 
Epoch 180 Loss : 0.7531 


In [73]:
# predictions using trained model
with torch.no_grad():
  pred = model(x_test.float())[:,0]
  loss = loss_fn(pred,y_test)
  print(f' MSE loss : {loss.item():.4f}' )
  

 MSE loss : 9.6009


In [74]:
# saving and loading the model parameters:
#   path = 'mpg.pt'
#   torch.save(model.state_dict(),path)
#   to load model first create architecture of model
#   then : model_new.load_state_dict(torch.load(path))


# saving the whole model and parameters:
path = 'mpg.pt'
torch.save(model,path)

In [75]:
model_new = torch.load(path)
model_new.eval()

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)