In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import math

from practice.practice7 import minibatch_size, x_batch_list

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [2]:
print(df_train.shape)
print(df_test.shape)

(1460, 81)
(1459, 80)


In [3]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
df_train.corr(numeric_only=True)['SalePrice']

Id              -0.021917
MSSubClass      -0.084284
LotFrontage      0.351799
LotArea          0.263843
OverallQual      0.790982
OverallCond     -0.077856
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.477493
BsmtFinSF1       0.386420
BsmtFinSF2      -0.011378
BsmtUnfSF        0.214479
TotalBsmtSF      0.613581
1stFlrSF         0.605852
2ndFlrSF         0.319334
LowQualFinSF    -0.025606
GrLivArea        0.708624
BsmtFullBath     0.227122
BsmtHalfBath    -0.016844
FullBath         0.560664
HalfBath         0.284108
BedroomAbvGr     0.168213
KitchenAbvGr    -0.135907
TotRmsAbvGrd     0.533723
Fireplaces       0.466929
GarageYrBlt      0.486362
GarageCars       0.640409
GarageArea       0.623431
WoodDeckSF       0.324413
OpenPorchSF      0.315856
EnclosedPorch   -0.128578
3SsnPorch        0.044584
ScreenPorch      0.111447
PoolArea         0.092404
MiscVal         -0.021190
MoSold           0.046432
YrSold          -0.028923
SalePrice        1.000000
Name: SalePr

In [6]:
df_train.corr(numeric_only=True)['SalePrice'][(abs(df_train.corr(numeric_only=True)['SalePrice'])>0.5)]

OverallQual     0.790982
YearBuilt       0.522897
YearRemodAdd    0.507101
TotalBsmtSF     0.613581
1stFlrSF        0.605852
GrLivArea       0.708624
FullBath        0.560664
TotRmsAbvGrd    0.533723
GarageCars      0.640409
GarageArea      0.623431
SalePrice       1.000000
Name: SalePrice, dtype: float64

In [7]:
high_features = df_train.corr(numeric_only=True)['SalePrice'][abs(df_train.corr(numeric_only=True)['SalePrice'])>0.5].index

In [8]:
high_features

Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
       'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
       'SalePrice'],
      dtype='object')

In [9]:
df_train[high_features].isnull().sum()

OverallQual     0
YearBuilt       0
YearRemodAdd    0
TotalBsmtSF     0
1stFlrSF        0
GrLivArea       0
FullBath        0
TotRmsAbvGrd    0
GarageCars      0
GarageArea      0
SalePrice       0
dtype: int64

In [10]:
features = list(high_features[:-1])

In [11]:
features

['OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'FullBath',
 'TotRmsAbvGrd',
 'GarageCars',
 'GarageArea']

In [12]:
for f in features:
    df_test[f].fillna((df_test[f].mean()), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[f].fillna((df_test[f].mean()), inplace=True)


In [13]:
x_train = df_train[features]
y_train = df_train[['SalePrice']].values
x_test = df_test[features]

In [14]:
std_scaler = StandardScaler()
std_scaler.fit(x_train)
x_train_tensor = torch.from_numpy(std_scaler.transform(x_train)).float()
x_test_tensor = torch.from_numpy(std_scaler.transform(x_test)).float()
y_train_tensor = torch.from_numpy(y_train).float()
y_train_tensor = y_train_tensor.unsqueeze(1)

In [15]:
print(x_train_tensor.shape, x_test_tensor.shape, y_train_tensor.shape)

torch.Size([1460, 10]) torch.Size([1459, 10]) torch.Size([1460, 1, 1])


In [16]:
nb_epochs = 10000
minibatch_size=256


In [17]:
class FunModel(nn.Module):
    def __init__(self,i,o):
        super().__init__()
        self.linear_layers = nn.Sequential(
             nn.Linear(i, 20),
            nn.LeakyReLU(),
            nn.Linear(20, 10),
            nn.LeakyReLU(),
            nn.Linear(10, 8),
            nn.LeakyReLU(),
            nn.Linear(8, 6),
            nn.LeakyReLU(),
            nn.Linear(6, o)

        )
    def forward(self,x):
        y = self.linear_layers(x)
        return y

In [18]:
input_dim = x_train_tensor.size(-1)
output_dim = y_train_tensor.size(-1)
model = FunModel(input_dim, output_dim)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

In [19]:
indices = torch.randperm(x_train_tensor.size(0))
print(indices)
x_batch_list = torch.index_select(x_train_tensor,0,index=indices)
x_batch_list = x_batch_list.split(minibatch_size,0)
y_batch_list = torch.index_select(y_train_tensor,0,index=indices)
y_batch_list = y_batch_list.split(minibatch_size,0)

tensor([ 503, 1097, 1034,  ..., 1457,   54, 1045])


In [20]:
for index in range(nb_epochs):
    indices = torch.randperm(x_train_tensor.size(0))

    x_batch_list = torch.index_select(x_train_tensor, 0, index=indices)
    y_batch_list = torch.index_select(y_train_tensor, 0, index=indices)
    x_batch_list = x_batch_list.split(minibatch_size, 0)
    y_batch_list = y_batch_list.split(minibatch_size, 0)

    epoch_loss =[]
    for x,y in zip(x_batch_list, y_batch_list):
        y_pred = model(x)
        if y.dim()==3:
            y = y.squeeze(2)

        loss = torch.sqrt(loss_function(y_pred,y))
        epoch_loss.append(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if(index%100)==0:
        print(index,sum(epoch_loss)/len(epoch_loss))

0 tensor(197606.4844, grad_fn=<DivBackward0>)
100 tensor(188850.3281, grad_fn=<DivBackward0>)
200 tensor(81645.9922, grad_fn=<DivBackward0>)
300 tensor(70331.6953, grad_fn=<DivBackward0>)
400 tensor(58727.9531, grad_fn=<DivBackward0>)
500 tensor(43789.4102, grad_fn=<DivBackward0>)
600 tensor(35908.3711, grad_fn=<DivBackward0>)
700 tensor(35165.3320, grad_fn=<DivBackward0>)
800 tensor(34227.4219, grad_fn=<DivBackward0>)
900 tensor(33170.4648, grad_fn=<DivBackward0>)
1000 tensor(32716.2168, grad_fn=<DivBackward0>)
1100 tensor(33482.4531, grad_fn=<DivBackward0>)
1200 tensor(32819.3398, grad_fn=<DivBackward0>)
1300 tensor(33775.8867, grad_fn=<DivBackward0>)
1400 tensor(33564.3555, grad_fn=<DivBackward0>)
1500 tensor(33831.4727, grad_fn=<DivBackward0>)
1600 tensor(32538.8027, grad_fn=<DivBackward0>)
1700 tensor(32579.8184, grad_fn=<DivBackward0>)
1800 tensor(32424.0605, grad_fn=<DivBackward0>)
1900 tensor(33293.4414, grad_fn=<DivBackward0>)
2000 tensor(32687.0391, grad_fn=<DivBackward0>)
21