In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv('./datasets/train.csv')
test = pd.read_csv('./datasets/test.csv')

In [3]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   int64  
 1   PID              2051 non-null   int64  
 2   MS SubClass      2051 non-null   int64  
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   int64  
 6   Street           2051 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [5]:
train = train.fillna(0)
test = test.fillna(0)

In [6]:
print(len(train.columns))
print(len(test.columns))

81
80


In [7]:
common_cols = list(set(train.columns).intersection(test.columns))

In [8]:
a = test['Id']

In [9]:
train = train.drop(columns = ['Id', 'PID', 'Lot Shape', 'Alley', 'Mo Sold', 'Fireplace Qu', 'Fireplaces'])
test = test.drop(columns = ['Id', 'PID', 'Lot Shape', 'Alley', 'Mo Sold', 'Fireplace Qu', 'Fireplaces'])

In [10]:
my_list = []
for x in train:
    if train[x].dtype == 'object':
        my_list.append(x)

In [11]:
train = pd.get_dummies(train, columns = my_list, drop_first = True)
test = pd.get_dummies(test, columns = my_list, drop_first = True)

In [12]:
common_cols = list(set(train.columns).intersection(test.columns))


In [13]:
test = test[common_cols]

In [14]:
common_cols.append('SalePrice')

In [15]:
train = train[common_cols]

In [16]:
X = train.drop(columns= ['SalePrice'])
y = train['SalePrice']

In [17]:
lr = LinearRegression()

In [18]:
lr.fit(X, y)

LinearRegression()

In [19]:
lr.predict(test)

array([122686.82622267, 151723.4182018 , 214201.40221831, 109965.43439728,
       177034.17986057,  86040.41446539, 105881.73902165, 163512.33775977,
       168829.29902944, 167062.75253157, 137879.08528848, 130586.71987958,
       133526.79775518, 280426.42746529, 117869.92927066, 125976.81410129,
       164735.52808258, 121127.37298538, 196041.80394223, 163123.15341536,
       162711.67639895, 131538.14307371, 169932.58822725, 171790.02110475,
       173342.44236587, 124790.06827212, 116906.3677053 , 133584.63337123,
       169033.35184679,  41235.6534964 , 100121.09135736, 102797.28402047,
       182637.80040688, 149086.2733554 , 207357.24628863, 176533.25240054,
       109302.11447163,  98278.24827436, 146297.46158365, 199525.38078121,
       158069.74890828, 202710.41689943, 156483.79517614, 129955.61885923,
       193673.29478013,  90302.26339259, 208627.7105771 , 150384.97052277,
       138104.79781526, 120952.39069469,  99999.48268871, 209823.58287236,
       226105.25117187, 1

In [20]:
sc = StandardScaler()
Z = sc.fit_transform(X)
test2 = sc.fit_transform(test)

In [21]:
ridge_model = Ridge(alpha = 10)

ridge_model.fit(Z, y)

Ridge(alpha=10)

In [22]:
test ['SalePrice'] = ridge_model.predict(test2)

In [23]:
test['Id'] = a

In [24]:
submission = test[['Id', 'SalePrice']].copy()

In [25]:
submission.head()

Unnamed: 0,Id,SalePrice
0,2658,148803.316425
1,2718,154941.722698
2,2414,215432.315521
3,1989,114871.586079
4,625,181395.809735


In [26]:
submission.to_csv('./datasets/submission_2.csv', index = False)