In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import numpy as np 
import pandas as pd 
import cudf as cpd
import cupy as cnp
import os
# Walk /kaggle/input and print the full path of each file it contains.
input_walk = os.walk('/kaggle/input')
for root, _, files in input_walk:
    for basename in files:
        file_path = os.path.join(root, basename)
        print(file_path)

In [4]:
# Read the competition test and training CSVs with cuDF and preview both.
dftest = cpd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
# A notebook cell only renders its *last* expression, so a bare `dftest.head()`
# here was silently discarded. `display` (provided by the IPython kernel)
# renders the preview explicitly.
display(dftest.head())
dftrain = cpd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
dftrain.head()

DATA PRE-PROCESSING

In [5]:
# Inspect the dtype of every column in the training frame.
dftrain.dtypes

In [7]:
# Count missing values per column, sorted with the largest counts first.
dftrain.isnull().sum().sort_values(ascending=False)

In [8]:
def cleaning_data_none(dftrain: "cpd.DataFrame", fields: list):
    """Fill missing values in the given columns with the literal string 'None'.

    Args:
        dftrain: Frame to clean. It is modified in place: each listed column
            is replaced by its filled version.
        fields: Names of the columns to fill.

    Returns:
        None.
    """
    for field in fields:
        # Assign the filled column back instead of calling
        # `dftrain[field].fillna(..., inplace=True)`: the selected column may
        # be a copy, in which case an in-place fill never reaches `dftrain`.
        dftrain[field] = dftrain[field].fillna('None')

def cleaning_data_int(dftrain: "cpd.DataFrame", fields: list):
    """Fill missing values in the given columns with the integer 0.

    Args:
        dftrain: Frame to clean. It is modified in place: each listed column
            is replaced by its filled version.
        fields: Names of the columns to fill.

    Returns:
        None.
    """
    for field in fields:
        # Assign the filled column back instead of calling
        # `dftrain[field].fillna(..., inplace=True)`: the selected column may
        # be a copy, in which case an in-place fill never reaches `dftrain`.
        dftrain[field] = dftrain[field].fillna(0)

def cleaning_data_median(dftrain: "cpd.DataFrame", fields: list):
    """Fill missing values in the given columns with each column's own median.

    Args:
        dftrain: Frame to clean. It is modified in place: each listed column
            is replaced by its filled version.
        fields: Names of the columns to fill.

    Returns:
        None.
    """
    for field in fields:
        column = dftrain[field]
        median_value = column.median()
        # Assign the filled column back instead of calling
        # `dftrain[field].fillna(..., inplace=True)`: the selected column may
        # be a copy, in which case an in-place fill never reaches `dftrain`.
        dftrain[field] = column.fillna(median_value)

In [9]:
# Columns whose missing entries are filled with the literal string 'None'.
# 'MSZoning' is listed here rather than in the zero-fill list because it is a
# categorical (string) column; filling it with the integer 0 would mix types.
fields_clean_none = [
    'PoolQC',
    'Alley',
    'FireplaceQu',
    'MasVnrType',
    'Electrical',
    'BsmtFinType2',
    'BsmtFinType1',
    'BsmtExposure',
    'BsmtQual',
    'BsmtCond',
    'Fence',
    'MiscFeature',
    'GarageCond',
    'GarageQual',
    'GarageFinish',
    'GarageType',
    'SaleType',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'KitchenQual',
    'Functional',
    'MSZoning',
]

# Numeric columns whose missing entries are filled with 0.
fields_clean_int = ['GarageYrBlt', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath']

# Numeric columns whose missing entries are filled with the column median.
fields_clean_median = [
    'LotFrontage',
    'MasVnrArea',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'GarageCars',
    'GarageArea',
]

# Apply the three fill strategies to the training frame (modifies dftrain).
cleaning_data_none(dftrain, fields_clean_none)
cleaning_data_int(dftrain, fields_clean_int)
cleaning_data_median(dftrain, fields_clean_median)

In [10]:
# Feature names: every column except the first and the last one.
features = list(dftrain.columns[1:-1])

In [11]:
# Number of entries in the feature list.
len(features)

In [12]:
# Show the feature names.
features

In [13]:
# Preview the first rows of the training frame.
dftrain.head()


In [14]:
# Replace every object-typed column of dftrain with its integer category codes.
df_types = pd.DataFrame(dftrain.dtypes, columns=["types"])
is_object_column = df_types["types"] == "object"
df_types_object = df_types[is_object_column]
for field_obj in df_types_object.index:
    as_category = dftrain[field_obj].astype('category')
    dftrain[field_obj] = as_category.cat.codes
dftrain.head()

In [15]:
# Cast the target column to float64.
dftrain['SalePrice'] = dftrain['SalePrice'].astype('float64')


In [16]:
# Target values as a CuPy array built from the SalePrice column.
Y = dftrain['SalePrice']
Y = cnp.array(Y)
# Y is intentionally left 1-D. The bare `Y.reshape(-1,1)` that used to sit here
# returned a new array that was discarded, so it never changed Y; it has been
# removed to avoid implying that Y is a column vector.
Y

In [17]:
# Feature frame: the columns of dftrain named in `features`.
X = dftrain[features]
X

In [18]:
from sklearn.preprocessing import StandardScaler
# Standardize the features with scikit-learn's StandardScaler.
# `DataFrame.as_matrix()` has been removed from pandas and is no longer
# available in current cuDF releases, so the frame is converted to a NumPy
# array via `to_pandas().to_numpy()` instead.
scaler=StandardScaler()
x_scaled = scaler.fit_transform(X.to_pandas().to_numpy())

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; random_state is fixed at 42 so the
# same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(x_scaled, Y, test_size = 0.3, random_state = 42)

In [20]:
# Print the shape of each split, one per line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)), sep="\n")

In [21]:
import cuml
from cuml import LinearRegression

In [22]:
# Display the held-out feature rows.
X_test


In [23]:
# Fit a cuML LinearRegression with the 'eig' solver, then print its parameters,
# its predictions on the held-out rows, and three error metrics.
lr = LinearRegression(fit_intercept=True, normalize=False, algorithm='eig')
reg = lr.fit(X_train, Y_train)

print("Coefficients:", reg.coef_, sep="\n")
print("Intercept:", reg.intercept_, sep="\n")

print("Predictions: ")
pred = lr.predict(X_test)
print(pred)

print("MSE: ", cuml.metrics.regression.mean_squared_error(Y_test, pred), sep="\n")
print("R2 Score:", cuml.metrics.regression.r2_score(Y_test, pred), sep="\n")
print("MAE:", cuml.metrics.regression.mean_absolute_error(Y_test, pred), sep="\n")

In [24]:
# Refit the model once per solver name and print MSE, R2 and MAE on the
# held-out rows so the solvers can be compared side by side.
algorithm = ['svd', 'eig', 'qr', 'svd-qr', 'svd-jacobi']
print('\033[1m',"Generating comparison of MSE, MAE, R2-score for all the five Linear Regression algorithms ",'\033[0m')
for solver in algorithm:
    print("Algorithm:", solver, sep="\n")
    lr = LinearRegression(fit_intercept=True, normalize=False, algorithm=solver)
    reg = lr.fit(X_train, Y_train)
    preds = lr.predict(X_test)
    print("MSE:", cuml.metrics.regression.mean_squared_error(Y_test, preds), sep="\n")
    print("R2 Score:", cuml.metrics.regression.r2_score(Y_test, preds), sep="\n")
    print("MAE:", cuml.metrics.regression.mean_absolute_error(Y_test, preds), sep="\n")