# House Prices Dataset

In [72]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

# Reading Train and Test Data and handling missing values

In [73]:
from sklearn import feature_selection, model_selection, ensemble,linear_model, preprocessing, metrics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train.shape
test.shape


#replacing 'NA' values based on given data
def replaceNAValues(cols):
    if pd.isnull(cols):
        return "NO"
    else:
        return cols


#mode Imputation
train.LotFrontage.fillna(train.LotFrontage.mode()[0], inplace=True)
train.MasVnrType.fillna(train.MasVnrType.mode()[0], inplace=True)
train.Electrical.fillna(train.Electrical.mode()[0], inplace=True)
train.GarageYrBlt.fillna(train.GarageYrBlt.mode()[0], inplace=True)
train.MasVnrArea.fillna(train.MasVnrArea.mode()[0], inplace=True)



missFeatures = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2','FireplaceQu',
               'GarageType', 'GarageFinish', 'GarageQual','GarageCond', 'PoolQC', 'Fence','MiscFeature']
for feature in missFeatures:
    train[feature] = train[feature].apply(replaceNAValues)


#converting dataTypes
encoder = preprocessing.LabelEncoder()
def convertType(data):
    for col in data:
        if data[col].dtype =='object':
            data[col] = encoder.fit_transform(data[col])
    return data


train = convertType(train)





(1460, 81)

(1459, 80)

# Applying Feature Selection Wrapper Technique based on Classifer Model

In [74]:
model = ensemble.GradientBoostingRegressor()

Y_train = train.SalePrice
Y = Y_train
X = train.drop(['SalePrice'],axis=1)
mean = {}

for i in [20,25,30,45,55]:
    rfe = feature_selection.RFE(model, n_features_to_select=i)
    rfe_fit = rfe.fit(X,Y)
    kfold = model_selection.KFold(n_splits=5)
    f = rfe_fit.get_support(1) #the most important features
    X_train= train[train.columns[f]]
    mean_squared_error = model_selection.cross_val_score(model, X_train,Y, cv=kfold, scoring ='neg_mean_absolute_error').mean()
    mean[i] = [mean_squared_error, f]

In [75]:
lst=[]
low=99999999999999
for val in mean.values():
    lst.append(val[0])
    for i in range(0, len(lst)):
        #print(low, abs(lst[i]))
        if abs(lst[i]) < low:
            low = abs(lst[i])
            features = val[1]

            
X_train = train[train.columns[features]] 


# Filtering Selected columns based on Wrapper Method from Test and Train Data 

In [76]:
X_test = test[test.columns[features]]

# Handling Test Data Missing Values

In [77]:
def handleFeaturesMissingData(X_test, features):
    for val in features:
        X_test[val].fillna(X_test[val].mode()[0], inplace=True)
        
null_values_features = X_test[X_test.columns[X_test.isnull().any()]]

features_Null =[fea for fea in null_values_features if fea not in missFeatures]
handleFeaturesMissingData(X_test, features_Null)


Na_values_featuers = X_test[X_test.columns[X_test.isnull().any()]]

for feature in Na_values_featuers:
    X_test[feature] = X_test[feature].apply(replaceNAValues)

#X_test.isnull().sum()


X_test = convertType(X_test)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Train the Model

In [78]:
model.fit(X_train,Y_train)

GradientBoostingRegressor()

# Predict the  Data

In [79]:
y_predict = model.predict(X_test)


# comparing prediction with Original Values

In [80]:
final = pd.DataFrame({'Id': test.Id, 'SalePrice': y_predict})
kaggle_sub = pd.read_csv("sample_submission.csv")

ypred = final.SalePrice
apred = kaggle_sub.SalePrice
metrics.mean_absolute_error(ypred,apred)

54670.20135931107

# Converting prediction result to CSV file

In [81]:
final.to_csv('F:\Kaggle_competition\house-prices-advanced-regression-techniques/Housing_Predictions.csv',index = False,header = True)