In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

In [2]:
train_df = pd.read_csv('D:\\DATA/houseprice/train.csv')
test_df = pd.read_csv('D:\\DATA/houseprice/test.csv')

In [3]:
test_df.shape

(1459, 80)

In [4]:
train_df.shape

(1460, 81)

In [5]:
train_df.columns,len(train_df.columns)

(Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
        'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
        'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
        'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
        'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
        'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
        'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
        'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
        'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
        'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
        'GarageCond

In [6]:
all_feature = pd.concat([train_df.iloc[:, 1:-1], test_df.iloc[:, 1:]], keys=["train", "test"])
all_feature

Unnamed: 0,Unnamed: 1,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
train,0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
train,1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
train,2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
train,3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
train,4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
test,1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
test,1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
test,1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
test,1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [7]:
numeric_features = all_feature.dtypes[all_feature.dtypes != 'object'].index
numeric_features, all_feature.dtypes

(Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
        'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
        'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
        'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
        'MoSold', 'YrSold'],
       dtype='object'),
 MSSubClass         int64
 MSZoning          object
 LotFrontage      float64
 LotArea            int64
 Street            object
                   ...   
 MiscVal            int64
 MoSold             int64
 YrSold             int64
 SaleType          object
 SaleCondition     object
 Length: 79, dtype: object)

In [8]:
all_feature[numeric_features] = all_feature[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
all_feature[numeric_features]

Unnamed: 0,Unnamed: 1,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
train,0,0.067320,-0.184443,-0.217841,0.646073,-0.507197,1.046078,0.896679,0.523038,0.580708,-0.29303,...,0.348780,-0.740634,0.199972,-0.359539,-0.103313,-0.285886,-0.063139,-0.089577,-1.551918,0.157619
train,1,-0.873466,0.458096,-0.072032,-0.063174,2.187904,0.154737,-0.395536,-0.569893,1.177709,-0.29303,...,-0.059772,1.614603,-0.702722,-0.359539,-0.103313,-0.285886,-0.063139,-0.089577,-0.446848,-0.602858
train,2,0.067320,-0.055935,0.137173,0.646073,-0.507197,0.980053,0.848819,0.333448,0.097840,-0.29303,...,0.627338,-0.740634,-0.081195,-0.359539,-0.103313,-0.285886,-0.063139,-0.089577,1.026577,0.157619
train,3,0.302516,-0.398622,-0.078371,0.646073,-0.507197,-1.859033,-0.682695,-0.569893,-0.494771,-0.29303,...,0.785188,-0.740634,-0.184783,3.874303,-0.103313,-0.285886,-0.063139,-0.089577,-1.551918,-1.363335
train,4,0.067320,0.629439,0.518814,1.355319,-0.507197,0.947040,0.753100,1.381770,0.468770,-0.29303,...,1.685860,0.776834,0.540332,-0.359539,-0.103313,-0.285886,-0.063139,-0.089577,2.131647,0.157619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
test,1454,2.419286,-2.069222,-1.043758,-1.481667,1.289537,-0.043338,-0.682695,-0.569893,-0.968860,-0.29303,...,-2.195385,-0.740634,-0.702722,-0.359539,-0.103313,-0.285886,-0.063139,-0.089577,-0.078492,-1.363335
test,1455,2.419286,-2.069222,-1.049083,-1.481667,-0.507197,-0.043338,-0.682695,-0.569893,-0.415757,-0.29303,...,-0.867591,-0.740634,-0.347564,-0.359539,-0.103313,-0.285886,-0.063139,-0.089577,-0.815205,-1.363335
test,1456,-0.873466,3.884968,1.246594,-0.772420,1.289537,-0.373465,0.561660,-0.569893,1.717643,-0.29303,...,0.478774,3.005615,-0.702722,-0.359539,-0.103313,-0.285886,-0.063139,-0.089577,1.026577,-1.363335
test,1457,0.655311,-0.312950,0.034599,-0.772420,-0.507197,0.682939,0.370221,-0.569893,-0.229194,-0.29303,...,-2.195385,-0.108355,-0.229178,-0.359539,-0.103313,-0.285886,-0.063139,1.144116,0.289865,-1.363335


In [9]:
all_feature[numeric_features] = all_feature[numeric_features].fillna(0)
all_feature

Unnamed: 0,Unnamed: 1,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
train,0,0.067320,RL,-0.184443,-0.217841,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,0.157619,WD,Normal
train,1,-0.873466,RL,0.458096,-0.072032,Pave,,Reg,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,-0.446848,-0.602858,WD,Normal
train,2,0.067320,RL,-0.055935,0.137173,Pave,,IR1,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,1.026577,0.157619,WD,Normal
train,3,0.302516,RL,-0.398622,-0.078371,Pave,,IR1,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,-1.363335,WD,Abnorml
train,4,0.067320,RL,0.629439,0.518814,Pave,,IR1,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,2.131647,0.157619,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
test,1454,2.419286,RM,-2.069222,-1.043758,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-0.078492,-1.363335,WD,Normal
test,1455,2.419286,RM,-2.069222,-1.049083,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-0.815205,-1.363335,WD,Abnorml
test,1456,-0.873466,RL,3.884968,1.246594,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,1.026577,-1.363335,WD,Abnorml
test,1457,0.655311,RL,-0.312950,0.034599,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,MnPrv,Shed,1.144116,0.289865,-1.363335,WD,Normal


In [10]:
all_feature = pd.get_dummies(all_feature, dummy_na=True)
all_feature

Unnamed: 0,Unnamed: 1,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
train,0,0.067320,-0.184443,-0.217841,0.646073,-0.507197,1.046078,0.896679,0.523038,0.580708,-0.29303,...,0,1,0,0,0,0,0,1,0,0
train,1,-0.873466,0.458096,-0.072032,-0.063174,2.187904,0.154737,-0.395536,-0.569893,1.177709,-0.29303,...,0,1,0,0,0,0,0,1,0,0
train,2,0.067320,-0.055935,0.137173,0.646073,-0.507197,0.980053,0.848819,0.333448,0.097840,-0.29303,...,0,1,0,0,0,0,0,1,0,0
train,3,0.302516,-0.398622,-0.078371,0.646073,-0.507197,-1.859033,-0.682695,-0.569893,-0.494771,-0.29303,...,0,1,0,1,0,0,0,0,0,0
train,4,0.067320,0.629439,0.518814,1.355319,-0.507197,0.947040,0.753100,1.381770,0.468770,-0.29303,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
test,1454,2.419286,-2.069222,-1.043758,-1.481667,1.289537,-0.043338,-0.682695,-0.569893,-0.968860,-0.29303,...,0,1,0,0,0,0,0,1,0,0
test,1455,2.419286,-2.069222,-1.049083,-1.481667,-0.507197,-0.043338,-0.682695,-0.569893,-0.415757,-0.29303,...,0,1,0,1,0,0,0,0,0,0
test,1456,-0.873466,3.884968,1.246594,-0.772420,1.289537,-0.373465,0.561660,-0.569893,1.717643,-0.29303,...,0,1,0,1,0,0,0,0,0,0
test,1457,0.655311,-0.312950,0.034599,-0.772420,-0.507197,0.682939,0.370221,-0.569893,-0.229194,-0.29303,...,0,1,0,0,0,0,0,1,0,0


In [11]:
all_feature.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       ...
       'SaleType_Oth', 'SaleType_WD', 'SaleType_nan', 'SaleCondition_Abnorml',
       'SaleCondition_AdjLand', 'SaleCondition_Alloca', 'SaleCondition_Family',
       'SaleCondition_Normal', 'SaleCondition_Partial', 'SaleCondition_nan'],
      dtype='object', length=331)

In [12]:
all_feature.loc['train', slice(None), :].head()

Unnamed: 0,Unnamed: 1,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
train,0,0.06732,-0.184443,-0.217841,0.646073,-0.507197,1.046078,0.896679,0.523038,0.580708,-0.29303,...,0,1,0,0,0,0,0,1,0,0
train,1,-0.873466,0.458096,-0.072032,-0.063174,2.187904,0.154737,-0.395536,-0.569893,1.177709,-0.29303,...,0,1,0,0,0,0,0,1,0,0
train,2,0.06732,-0.055935,0.137173,0.646073,-0.507197,0.980053,0.848819,0.333448,0.09784,-0.29303,...,0,1,0,0,0,0,0,1,0,0
train,3,0.302516,-0.398622,-0.078371,0.646073,-0.507197,-1.859033,-0.682695,-0.569893,-0.494771,-0.29303,...,0,1,0,1,0,0,0,0,0,0
train,4,0.06732,0.629439,0.518814,1.355319,-0.507197,0.94704,0.7531,1.38177,0.46877,-0.29303,...,0,1,0,0,0,0,0,1,0,0


In [13]:
train_feature = np.array(all_feature.loc['train',].values)
train_feature.shape

(1460, 331)

In [14]:
train_feature[:13,:13].shape

(13, 13)

In [15]:
test_feature = np.array(all_feature.loc['test',].values)
test_feature.shape

(1459, 331)

In [16]:
train_labels = np.array(train_df.SalePrice.values).reshape((-1,1))
train_labels.shape

(1460, 1)

In [63]:
train_X = train_feature[:50,:]
train_y = train_labels[:50,:]
test_X = test_feature[:50,:]
test_y = train_labels[1450:,:]
print('train_X shape',train_X.shape)
print('train_y shape',train_y.shape)
print('test_X shape',test_X.shape)
print('test_y shape',test_y.shape)

train_X shape (50, 331)
train_y shape (50, 1)
test_X shape (50, 331)
test_y shape (10, 1)


In [18]:
linreg = LinearRegression()

In [19]:
linreg.fit(train_X,train_y)

LinearRegression()

In [20]:
y_predict = linreg.predict(test_X)
y_predict.shape

(1459, 1)

In [21]:
y_predict

array([[112727.],
       [159797.],
       [186629.],
       ...,
       [179188.],
       [115077.],
       [223316.]])

In [22]:
np.savetxt("output.csv", y_predict, delimiter=',')

In [23]:
import torch

In [24]:
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.utils.data as data
import torch.nn.functional as F

In [26]:
class LinearRegression(nn.Module):
    def __init__(self):
        super(LinearRegression, self).__init__()
        self.fc1 = nn.Linear(331, 496)
        self.fc2 = nn.Linear(496, 248)
        self.fc3 = nn.Linear(248, 124)
        self.fc4 = nn.Linear(124, 32)
        self.fc5 = nn.Linear(32, 1)
    def forward(self, x):
        x = F.relu(self.fc1(x)).clamp(min=0)
        x = F.dropout(x, p=0.1)
        x = F.relu(self.fc2(x)).clamp(min=0)
        x = F.dropout(x, p=0.1)
        x = F.relu(self.fc3(x)).clamp(min=0)
#         x = F.dropout(x, p=0.1)
        x = F.relu(self.fc4(x)).clamp(min=0)
#         x = F.dropout(x, p=0.1)
        x = F.relu(self.fc5(x)).clamp(min=0)
        return x

In [27]:
model = LinearRegression()
model

LinearRegression(
  (fc1): Linear(in_features=331, out_features=496, bias=True)
  (fc2): Linear(in_features=496, out_features=248, bias=True)
  (fc3): Linear(in_features=248, out_features=124, bias=True)
  (fc4): Linear(in_features=124, out_features=32, bias=True)
  (fc5): Linear(in_features=32, out_features=1, bias=True)
)

In [28]:
import math 
# 定义数据集的读取
class MyData(data.Dataset):
    def __init__(self, feature, label):
        self.feature = feature
        self.label = label

    def __len__(self):
        return len(self.feature)

    def __getitem__(self, idx):
        return self.feature[idx], self.label[idx]

In [57]:
class Linear_Model(): 
    def __init__(self): 
        """ Initialize the Linear Model """ 
        self.learning_rate = 0.01 
        self.epoches = 10000 
        self.loss_function = torch.nn.MSELoss()
        self.create_model()
    def create_model(self):
        self.model = LinearRegression()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.learning_rate)
    def train(self, data_X, data_y, model_save_path="model.pth"):
        for epoch in range(self.epoches):
            prediction = self.model(data_X)
            loss = self.loss_function(prediction, data_y)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            if epoch % 500 == 0:
                print(data_X.grad)
                print("epoch: {}, loss is: {}".format(epoch, loss.item()))
            
        torch.save(self.model.state_dict(), "linear.pth")
    def test(self, data_X_test, model_path="linear.pth"):
        self.model.load_state_dict(torch.load(model_path))
        prediction = self.model(data_X_test)
        return prediction

In [58]:
linear = Linear_Model()

In [64]:
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)
print('X shape and type:',X.shape,X.dtype)
print('y shape and type:',y.shape,y.dtype)
X

X shape and type: torch.Size([50, 331]) torch.float32
y shape and type: torch.Size([50, 1]) torch.float32


  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)


tensor([[ 0.0673, -0.1844, -0.2178,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8735,  0.4581, -0.0720,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.0673, -0.0559,  0.1372,  ...,  1.0000,  0.0000,  0.0000],
        ...,
        [-0.8735,  0.6294,  0.1176,  ...,  1.0000,  0.0000,  0.0000],
        [ 3.1249, -1.5552, -0.7242,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8735, -0.1416, -0.3076,  ...,  1.0000,  0.0000,  0.0000]])

In [60]:
y

tensor([[208500.],
        [181500.],
        [223500.],
        [140000.],
        [250000.],
        [143000.],
        [307000.],
        [200000.],
        [129900.],
        [118000.],
        [129500.],
        [345000.],
        [144000.],
        [279500.],
        [157000.],
        [132000.],
        [149000.],
        [ 90000.],
        [159000.],
        [139000.],
        [325300.],
        [139400.],
        [230000.],
        [129900.],
        [154000.],
        [256300.],
        [134800.],
        [306000.],
        [207500.],
        [ 68500.],
        [ 40000.],
        [149350.],
        [179900.],
        [165500.],
        [277500.],
        [309000.],
        [145000.],
        [153000.],
        [109000.],
        [ 82000.],
        [160000.],
        [170000.],
        [144000.],
        [130250.],
        [141000.],
        [319900.],
        [239686.],
        [249700.],
        [113000.],
        [127000.]])

In [61]:
linear.train(X,y)

None
epoch: 0, loss is: 37437591552.0
None
epoch: 500, loss is: 37437591552.0
None
epoch: 1000, loss is: 37437591552.0
None
epoch: 1500, loss is: 37437591552.0
None
epoch: 2000, loss is: 37437591552.0


KeyboardInterrupt: 

In [None]:
y = 

In [37]:
x = torch.linspace(0, 1, 5)
x = torch.unsqueeze(x, dim=1)
k = 2
y = k * x + torch.rand(x.size())
x.dtype

torch.float32