# Predicting House Prices

In [1]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
def correlated_variables(data,obj_variable,min_corr=0.6):
    """
    Lists the pairs of variables whose correlation is at least "min_corr", for variable selection.

    data: DataFrame. Square correlation matrix (same labels, in the same order, on index and columns).
    obj_variable: string. Name of the objective variable for the model; must be a column of "data".
    min_corr: float. Minimum correlation for a pair to be reported as highly correlated.

    Returns a DataFrame with one row per reported pair: the two variable names, their
    mutual correlation and each variable's correlation with the objective variable,
    sorted by the mutual correlation in descending order. Entries exactly equal to 1
    (the diagonal of a correlation matrix) are skipped.
    Raises KeyError if "obj_variable" is not a column of "data".
    """
    columns = ['Variable1','Variable2','Corr_V1&V2','Corr_V1&Obj','Corr_V2&Obj']
    # Look the objective up in the matrix that was passed in, not in a notebook global.
    obj_var_pos = data.columns.get_loc(obj_variable)
    size = len(data)
    rows = []
    for i in range(size):
        # Only walk the upper triangle so each unordered pair is visited once.
        for j in range(i, size):
            corr = data.iloc[i, j]
            if corr >= min_corr and corr != 1:
                rows.append([
                    data.index[i],
                    data.columns[j],
                    corr,
                    data.iloc[i, obj_var_pos],
                    data.iloc[j, obj_var_pos],
                ])
    # Build the frame once: DataFrame.append no longer exists in pandas 2.x and
    # appending inside the loop copied the whole frame on every hit.
    high_corr = pd.DataFrame(rows, columns=columns)
    return high_corr.sort_values('Corr_V1&V2',ascending=False)

In [3]:
def null_info(df):
    """
    Summarizes missing values per column.
    df: DataFrame. Data to inspect.
    Returns a DataFrame with one row per column of "df": the column name, the number
    of missing values ('#_NA'), the percentage of rows they represent ('%') and the
    column's data type ('data_type'), sorted by '%' in descending order.
    """
    summary = df.isna().sum().to_frame(name='#_NA')
    summary['%'] = summary['#_NA'] * 100 / len(df)
    dtype_frame = df.dtypes.to_frame(name='data_type')
    # Both frames are indexed by column name; joining on the index values keeps
    # the name as a regular column in the result.
    joined = summary.merge(
        dtype_frame,
        how='left',
        left_on=summary.index,
        right_on=dtype_frame.index,
    )
    return joined.sort_values('%', ascending=False)


In [4]:
#Generate 'Empty' category for categorical variables and fill with 0 numerical ones
def fill_missing_values(data):
    """
    Fill missing values with category 'Empty' for categorical variables and 0 for numerical
    data: DataFrame. Contains the data that will be analyzed
    Returns a new DataFrame; "data" itself is left untouched.
    """
    def _is_categorical(dtype):
        # Text columns are object dtype in classic pandas and a string dtype in newer setups.
        return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

    # One fill value per column, applied in a single fillna call. The previous
    # `df1[col].fillna(..., inplace=True)` acted on a temporary Series, which
    # warns on pandas 2.2 and is a silent no-op under Copy-on-Write.
    fill_values = {
        column: ('Empty' if _is_categorical(dtype) else 0)
        for column, dtype in data.dtypes.items()
    }
    return data.fillna(value=fill_values)

In [5]:
def integrate_dummies(data,key):
    """Builds a data set ready for a machine learning algorithm: separates the object
    columns, one-hot encodes them (dropping the first level of each) and returns a
    data frame with the non-object columns followed by the dummy columns.
    data: pandas data frame, only contains the features that will be used for the machine learning algorithm
    key: pandas series, column that contains primary key of the dataset
    """
    object_mask = data.dtypes == object
    object_features = [name for name, flag in object_mask.items() if flag]
    numeric_features = [name for name, flag in object_mask.items() if not flag]

    # Both halves carry the key so they can be re-joined row by row.
    encoded = pd.get_dummies(data[object_features], drop_first=True).assign(key=key)
    numeric_part = data.loc[:, numeric_features].assign(key=key)

    combined = numeric_part.merge(encoded, how='inner', on='key')
    return combined.drop(columns='key')

In [6]:
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred): 
    """
    Returns the mean absolute percentage error, in percent.
    y_true: 1d array-like (list, ndarray, Series). Real values of "y"
    y_pred: 1d array-like (list, ndarray, Series). Predicted values of "y"

    Values are compared by position. Raises ValueError when the two inputs do not
    have the same shape. A zero in "y_true" makes the result infinite (or NaN),
    because the error is expressed relative to the real value.
    """
    # Work on plain float arrays: lists become usable, and two Series with
    # different indexes are no longer silently aligned by label.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            'y_true and y_pred must have the same shape, got %s and %s'
            % (actual.shape, predicted.shape)
        )
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [7]:
# Location of the training CSV. Set the HOUSE_PRICES_TRAIN_CSV environment variable
# to point at the file on another machine; the original local path stays as the
# fallback so the existing setup keeps working unchanged.
path = os.environ.get('HOUSE_PRICES_TRAIN_CSV', 'C:/Users/tgrasty/Desktop/DS/Otros/1. House Prices/train.csv')

In [8]:
# Load the training data from the CSV file at `path`.
df = pd.read_csv(path)

In [9]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [10]:
# (number of rows, number of columns) of the raw data.
df.shape

(1460, 81)

In [11]:
# Show up to 100 rows when displaying frames, so the per-column summaries below are not truncated.
pd.set_option('display.max_rows', 100)

In [12]:
# Check data types
# One row per column of df: its dtype next to the values of the first five records.
sample = df.head().transpose()
types = df.dtypes.to_frame()
# Joining on the index values (the column names) keeps the name as a regular column of the result.
types.merge(sample,how='left',left_on=types.index,right_on=sample.index)

Unnamed: 0,key_0,0_x,0_y,1,2,3,4
0,Id,int64,1,2,3,4,5
1,MSSubClass,int64,60,20,60,70,60
2,MSZoning,object,RL,RL,RL,RL,RL
3,LotFrontage,float64,65,80,68,60,84
4,LotArea,int64,8450,9600,11250,9550,14260
5,Street,object,Pave,Pave,Pave,Pave,Pave
6,Alley,object,,,,,
7,LotShape,object,Reg,Reg,IR1,IR1,IR1
8,LandContour,object,Lvl,Lvl,Lvl,Lvl,Lvl
9,Utilities,object,AllPub,AllPub,AllPub,AllPub,AllPub


### Data Cleansing

In [13]:
# Missing-value count, percentage and dtype for every column of the raw data.
df_nulls = null_info(df)
df_nulls
#df_nulls[df_nulls['%']>0].sort_values('%',ascending=False)

Unnamed: 0,key_0,#_NA,%,data_type
72,PoolQC,1453,99.520548,object
74,MiscFeature,1406,96.30137,object
6,Alley,1369,93.767123,object
73,Fence,1179,80.753425,object
57,FireplaceQu,690,47.260274,object
3,LotFrontage,259,17.739726,float64
59,GarageYrBlt,81,5.547945,float64
64,GarageCond,81,5.547945,object
58,GarageType,81,5.547945,object
60,GarageFinish,81,5.547945,object


In [14]:
# Working copy of the data with missing values filled ('Empty' for object columns, 0 otherwise).
df1 = fill_missing_values(df)

In [15]:
# Sanity check: after filling, no column should report any missing values.
df1_nulls = null_info(df1)
df1_nulls[df1_nulls['%']>0].sort_values('%',ascending=False)

Unnamed: 0,key_0,#_NA,%,data_type


In [17]:
#Transform numerical variables that represent categories into string
to_string = ['MSSubClass','YearBuilt','YearRemodAdd','GarageYrBlt','MoSold','YrSold']
# String versions are kept aside in categorical_to_add; the original columns are removed from df1.
# NOTE(review): nothing in the cells shown here adds categorical_to_add back to df1 - confirm whether that is intended.
# NOTE(review): this cell cannot be re-run on its own, because the columns are gone from df1 after the first run.
categorical_to_add = df1[to_string].astype(str)
df1.drop(to_string,axis=1,inplace=True)

### Variable Generation

In [18]:
# New feature: sum of the pool, lot, above-ground living and garage area columns.
df1['TotalArea'] = df1.PoolArea+df1.LotArea+df1.GrLivArea+df1.GarageArea

In [19]:
#features_subset = df.loc[:,['Neighborhood','TotalArea','OveralQual','HeatingQC','CentralAir','SaleCondition']]
# New feature: first-floor plus second-floor area.
df1['TotalLivingArea'] = df1['1stFlrSF']+df1['2ndFlrSF']

### Standardization

In [20]:
# Quick check of the dtype test used below to pick numeric columns: True means SalePrice is not an object column.
df1.SalePrice.dtypes!=object

True

In [22]:
# Names of every column of df1 whose dtype is not object; show the first ten.
numerical_variables = [name for name, dtype in df1.dtypes.items() if dtype != object]
numerical_variables[:10]
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler() 
#data_scaled = scaler.fit_transform(df1)

['Id',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF']

In [None]:
# Standardize the numeric columns only. `data_scaled` was never defined for this
# cell (the scaler code was commented out), and df1 still contains string columns
# at this point, which StandardScaler cannot process.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df1[numerical_variables])
# Label the scaled array with the columns that were actually scaled and keep df1's row index.
df2 = pd.DataFrame(data_scaled,columns=numerical_variables,index=df1.index)
df2.head()

In [None]:
#CentralAir -> transform to binary
# 1 where the value is 'Y', 0 for anything else.
df1['CentralAir'] = (df1['CentralAir'] == 'Y').astype('int64')

### Variable Selection

In [None]:
# Shape of df1 restricted to its non-object columns, i.e. the input of the correlation matrix below.
df1.loc[:,df1.dtypes!='object'].shape

In [None]:
# Pairwise correlation matrix of the non-object columns of df1.
df1_corr = df1.loc[:,df1.dtypes!='object'].corr()

In [None]:
# Pairs of variables correlated at or above the default threshold, with each one's correlation to SalePrice.
correlated_variables(df1_corr,'SalePrice')

## Transformation

In [None]:
# Replace the object columns of df1 with dummy variables, using Id as the row key.
# NOTE(review): df1 is overwritten here, so re-running this cell operates on the already-encoded frame.
df1 = integrate_dummies(df1,df1['Id'])
df1.head()

## Modeling

In [None]:
# Id was only needed as the key in integrate_dummies; drop it so it is not used as a feature.
# NOTE(review): in-place drop - running this cell a second time raises because Id is already gone.
df1.drop(columns=['Id'],inplace=True)
df1.columns

In [None]:
#help(train_test_split)

In [None]:
# Hold out 33% of the rows for testing. The seed is fixed so the split, and every
# metric computed from it, is the same on each run of the notebook.
x_train,x_test,y_train,y_test = train_test_split(df1.drop('SalePrice',axis=1),df1['SalePrice'], test_size=0.33, random_state=42)

In [None]:
# Fit a linear regression on the training split and report its metrics on the test split.
model = LinearRegression()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
# Negative predictions are floored at 0 (the log-based metric below requires non-negative values).
y_pred = np.maximum(model.predict(x_test), 0)
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)
rmsle = np.sqrt(metrics.mean_squared_log_error(y_test, y_pred))
print('R^2: ',score)
print('Mean Absolute Error (MAE): ',mae)
print('Root Mean Square Error (RMSE):', rmse)
print('Mean Absolute Percentage Error (MAPE): ',mape,'%')
print('Root Mean Square Logarithmic Error (RMSLE): ',rmsle)

In [None]:
#model.intercept_
#model.coef_

In [None]:
# Predicted and actual values side by side for the first ten test rows.
results = pd.DataFrame({'y_pred':y_pred[:],'y_test':y_test[:]})
results.head(10)

In [None]:
# Train Using Stats Model
# Fit an OLS model with statsmodels on the same training split used for the sklearn model.
X = x_train
y = y_train

# add_constant adds the constant (intercept) column to the design matrix before fitting.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
# Export the second table of the fit summary as HTML; the next cell parses it into a DataFrame.
results_as_html = est2.summary().tables[1].as_html()

In [None]:
# Display results
# read_html expects a path, URL or file-like object; passing the HTML text itself
# is deprecated since pandas 2.1, so wrap it in StringIO.
lm_results = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]
# Most significant coefficients (smallest p-value) first.
lm_results.sort_values('P>|t|',ascending=True)

In [None]:
#Validate OLS coef of determination (R**2)
# R^2 of the sklearn model on the training split, to compare with the statsmodels fit on the same data.
model.score(x_train,y_train)

### Decision Tree

In [None]:
# Disabled draft of a decision-tree model. The closing parenthesis is commented out
# together with the call it belongs to; left bare it made this cell a SyntaxError.
# NOTE(review): SalePrice is treated as a continuous target by the regression above, so a
# classifier scored with accuracy_score looks like the wrong tool - confirm before re-enabling.
#tree = DecisionTreeClassifier(#class_weight=None, criterion='gini', max_depth=4,max_features=None, max_leaf_nodes=None,
                       #min_impurity_decrease=0.0, min_impurity_split=None,
                       #min_samples_leaf=200, min_samples_split=2,
                       #min_weight_fraction_leaf=0.0, presort=False,
                       #random_state=123, splitter='best'
                    #)
#classifier = tree.fit(x_train,y_train)
#predictions = classifier.predict(x_test)
#accuracy_score(y_true = y_test, y_pred = predictions)