# Kaggle competition: Predicting House Prices 202003

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.preprocessing import StandardScaler
from pydoc import help
from scipy.stats.stats import pearsonr
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
from scipy.stats.stats import kendalltau

In [2]:
def correlated_variables(data,obj_variable,min_corr=0.6):
    """
    Generates a DataFrame with the correlations and variables that are higher than "min_corr" for variable selection.
    data: DataFrame. Correlation matrix only for numeric variables and including objective variable.
    min_corr: int. Minimum correlation that will be selected as high correlated
    obj_variable: string. Name of the objective variable for the model
    """
    high_corr = pd.DataFrame(columns=['Variable1','Variable2','Corr_V1&V2','Corr_V1&Obj','Corr_V2&Obj'])
    k=0
    obj_var_pos = data.columns.get_loc(obj_variable)
    for i in range(0,len(data)):
        for j in range(k,len(data)):
            if(data.iloc[i,j]>=min_corr and data.iloc[i,j]!=1 and j!=obj_var_pos):
                a = data.index[i]
                b = data.columns[j]
                c = data.iloc[i,j]
                d = data.iloc[i,obj_var_pos]
                e = data.iloc[j,obj_var_pos]
                high_corr = high_corr.append(pd.DataFrame([[a,b,c,d,e]],columns=high_corr.columns))
        k+=1
    return high_corr.sort_values('Corr_V1&V2',ascending=False)

In [3]:
def null_info(df):
    null_count = df.isna().sum().to_frame()
    null_count.columns = ['#_NA'] 
    null_count['%'] = null_count*100/len(df)
    #null_count[null_count['%']>30].sort_values('%',ascending=False)
    df_dtypes = df.dtypes.to_frame()
    df_dtypes.columns = ['data_type']
    return null_count.merge(df_dtypes,how='left',left_on=null_count.index,right_on=df_dtypes.index).sort_values('%',ascending=False)


In [4]:
#Generate 'Empty' category for categorical variables and fill with 0 numerical ones
def fill_missing_values(data):
    """
    Fill missing values with category 'Empty' for categorical variables and 0 for numerical
    Data: DataFrame. Contains the data that will be analyzed
    """
    df1 = data.copy()
    for i in df1:
        if df1[i].dtypes == 'object': 
            #print('Transforming column: ',i,'\n')
            df1[i].fillna('Empty',inplace=True)
        else:
            df1[i].fillna(0,inplace=True)
    return df1

In [5]:
def integrate_dummies(data,key):
    """Generates data set ready for applying a machine learning algorithm: sepparates the object variables, gets the dummie 
    variables and returns a data frame with all the features.
    data: pandas data frame, only contains the features that will be used for the machine learning algorithm
    key: pandas series, column that contains primary key of the dataset
    """
    is_object = data.dtypes==object
    numeric_features = [i for i in is_object.index if is_object[i]==False]
    object_features = [i for i in is_object.index if is_object[i]==True]
    dummies = pd.get_dummies(data[object_features],drop_first=True)
    dummies['key'] = key
    #dummies['key']=df_1.rut_deudor.astype(str)+df_1.rut_cliente.astype(str)
    numeric_df = data.loc[:,numeric_features]
    numeric_df['key'] = key
    return pd.merge(left=numeric_df,right=dummies, how='inner', on='key').drop('key',axis=1)

In [6]:
from sklearn.utils import check_array
def mean_absolute_percentage_error(y_true, y_pred): 
    """
    Returns the mean absolute percentage error
    y_true: 1d array. Real values of "y"
    y_pred: 1d array. Predicted values of "y"
    """
    #y_true, y_pred = check_array(y_true, y_pred)

    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true): 
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [7]:
def pearson_filtering(data,obj_variable,max_pvalue=0.05,min_pearson=0.5):
    """
    Returns DataFrame with the pearson correlation and p-values for numerical variables of "data" with a higher correlation than "threshold" with the "obj_variable"
    data: Data Frame containing numerical and categorical variables as well as the objective variable
    max_pvalue: float/int. Maximum pvalue for numeric variables correlation
    min_pearson: float/int. Minimum pearson correlation between the regressor variables and objective variable ("obj_variable")
    obj_variable: string. Name of the objective variable within "data"
    """
    data_corr = data.loc[:,data.dtypes!='object'].corr()
    selected_numerical = data_corr[data_corr[obj_variable].abs()>min_pearson][obj_variable]
    try:
        selected_numerical.drop([obj_variable],inplace=True)
    except (KeyError):
        pass

    #Calculate P-values and filtering significant variables (<5%)
    pearson_stats = [pearsonr(data[i].to_numpy(),data[obj_variable]) for i in data[selected_numerical.index]]
    result = pd.DataFrame(pearson_stats,columns=['Corr_Coef','P-value'],index=selected_numerical.index)
    return result[result['P-value']<max_pvalue].sort_values('Corr_Coef',ascending=False)

In [8]:
path = 'C:/Users/tgrasty/Desktop/DS/Otros/1. House Prices/train.csv'

In [9]:
df = pd.read_csv(path)

In [10]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
df.shape

(1460, 81)

In [12]:
pd.set_option('display.max_rows', 100)

In [13]:
# Check data types
sample = df.head().transpose()
types = df.dtypes.to_frame()
types.merge(sample,how='left',left_on=types.index,right_on=sample.index)

Unnamed: 0,key_0,0_x,0_y,1,2,3,4
0,Id,int64,1,2,3,4,5
1,MSSubClass,int64,60,20,60,70,60
2,MSZoning,object,RL,RL,RL,RL,RL
3,LotFrontage,float64,65,80,68,60,84
4,LotArea,int64,8450,9600,11250,9550,14260
5,Street,object,Pave,Pave,Pave,Pave,Pave
6,Alley,object,,,,,
7,LotShape,object,Reg,Reg,IR1,IR1,IR1
8,LandContour,object,Lvl,Lvl,Lvl,Lvl,Lvl
9,Utilities,object,AllPub,AllPub,AllPub,AllPub,AllPub


### Data Cleansing

In [14]:
df_nulls = null_info(df)
df_nulls
#df_nulls[df_nulls['%']>0].sort_values('%',ascending=False)

Unnamed: 0,key_0,#_NA,%,data_type
72,PoolQC,1453,99.520548,object
74,MiscFeature,1406,96.30137,object
6,Alley,1369,93.767123,object
73,Fence,1179,80.753425,object
57,FireplaceQu,690,47.260274,object
3,LotFrontage,259,17.739726,float64
59,GarageYrBlt,81,5.547945,float64
64,GarageCond,81,5.547945,object
58,GarageType,81,5.547945,object
60,GarageFinish,81,5.547945,object


In [15]:
df1 = fill_missing_values(df)

In [16]:
df1_nulls = null_info(df1)
df1_nulls[df1_nulls['%']>0].sort_values('%',ascending=False)

Unnamed: 0,key_0,#_NA,%,data_type


#### Correction of data types

In [17]:
df1['CentralAir'].unique()

array(['Y', 'N'], dtype=object)

In [18]:
#CentralAir -> transform to binary
df1['CentralAir'] = df1['CentralAir'].map(lambda x: 1 if x=='Y' else 0)

In [19]:
#Correct current numeric variables that represent categories: transform to string
to_string = ['MSSubClass','YearBuilt','YearRemodAdd','GarageYrBlt','MoSold','YrSold']
new_categorical = df1[to_string].astype(str)
df1.drop(to_string,axis=1,inplace=True)
df1 = pd.concat([df1,new_categorical],axis=1)

In [20]:
df1.MSSubClass.dtypes

dtype('O')

###  Variable Generation

In [21]:
df1['TotalArea'] = df1.PoolArea+df1.LotArea+df1.GrLivArea+df1.GarageArea

In [22]:
#features_subset = df.loc[:,['Neighborhood','TotalArea','OveralQual','HeatingQC','CentralAir','SaleCondition']]
df1['TotalLivingArea'] = df1['1stFlrSF']+df1['2ndFlrSF']

### Standardization

In [23]:
df1.shape

(1460, 83)

In [24]:
numerical_variables = [i for i in df1 if df1[i].dtypes!=object and i!="Id"]
standard_df1 = (df1[numerical_variables]-df1[numerical_variables].mean())/df1[numerical_variables].std()
df2 = df1.drop(numerical_variables,axis=1)
df2 = pd.concat([df2,standard_df1],axis=1)
df2.head()
#scaler = StandardScaler() 
#data_scaled = scaler.fit_transform(df1)

Unnamed: 0,Id,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SalePrice,TotalArea,TotalLivingArea
0,1,RL,Pave,Empty,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,-0.751918,0.216429,-0.359202,-0.116299,-0.270116,-0.068668,-0.087658,0.347154,-0.176801,0.384487
1,2,RL,Pave,Empty,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,1.625638,-0.704242,-0.359202,-0.116299,-0.270116,-0.068668,-0.087658,0.007286,-0.116493,-0.475128
2,3,RL,Pave,Empty,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,-0.751918,-0.070337,-0.359202,-0.116299,-0.270116,-0.068668,-0.087658,0.53597,0.111576,0.530315
3,4,RL,Pave,Empty,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,-0.751918,-0.175988,4.091122,-0.116299,-0.270116,-0.068668,-0.087658,-0.515105,-0.058837,0.397919
4,5,RL,Pave,Empty,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0.77993,0.563567,-0.359202,-0.116299,-0.270116,-0.068668,-0.087658,0.869545,0.470083,1.320854


### Variable Selection

In [25]:
df2.loc[:,df2.dtypes!='object'].shape

(1460, 35)

#### Filtering numerical variables: Pearson correlation

In [94]:
selected_numerical = pearson_filtering(df2,'SalePrice',max_pvalue=0.05,min_pearson=0.4)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(obj)


In [95]:
selected_numerical

Unnamed: 0,Corr_Coef,P-value
OverallQual,0.790982,2.185675e-313
TotalLivingArea,0.716883,1.183941e-230
GrLivArea,0.708624,4.518034e-223
GarageCars,0.640409,2.498644e-169
GarageArea,0.623431,5.265038e-158
TotalBsmtSF,0.613581,9.484229e-152
1stFlrSF,0.605852,5.394711e-147
FullBath,0.560664,1.23647e-121
TotRmsAbvGrd,0.533723,2.772281e-108
MasVnrArea,0.472614,4.100461e-82


In [61]:
#Check correlation
a = pd.DataFrame(data=selected_numerical.index)
b = pd.DataFrame([['SalePrice']],columns=a.columns)
correlation = df2[a.append(b).iloc[:,0]].corr()
correlation

Unnamed: 0,OverallQual,TotalLivingArea,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,SalePrice
OverallQual,1.0,0.600758,0.593007,0.600671,0.562022,0.537808,0.476224,0.5506,0.427452,0.790982
TotalLivingArea,0.600758,1.0,0.995718,0.479932,0.479189,0.461738,0.572041,0.635296,0.820088,0.716883
GrLivArea,0.593007,0.995718,1.0,0.467247,0.468997,0.454868,0.566024,0.630012,0.825489,0.708624
GarageCars,0.600671,0.479932,0.467247,1.0,0.882475,0.434585,0.439317,0.469672,0.362289,0.640409
GarageArea,0.562022,0.479189,0.468997,0.882475,1.0,0.486665,0.489782,0.405656,0.337822,0.623431
TotalBsmtSF,0.537808,0.461738,0.454868,0.434585,0.486665,1.0,0.81953,0.323722,0.285573,0.613581
1stFlrSF,0.476224,0.572041,0.566024,0.439317,0.489782,0.81953,1.0,0.380637,0.409516,0.605852
FullBath,0.5506,0.635296,0.630012,0.469672,0.405656,0.323722,0.380637,1.0,0.554784,0.560664
TotRmsAbvGrd,0.427452,0.820088,0.825489,0.362289,0.337822,0.285573,0.409516,0.554784,1.0,0.533723
SalePrice,0.790982,0.716883,0.708624,0.640409,0.623431,0.613581,0.605852,0.560664,0.533723,1.0


#### Filtering categorical variables: one hot encoding + Pearson correlation

In [51]:
selected_categorical = df2.loc[:,df2.dtypes == 'object']
#selected_categorical = pd.concat([selected_categorical,df2['SalePrice']],axis=1)

In [52]:
dummies = pd.get_dummies(selected_categorical.iloc[:,:],drop_first=True)

In [53]:
dummies = pd.concat([dummies,df2['SalePrice']],axis=1)

In [91]:
selected_categorical = pearson_filtering(dummies,'SalePrice',max_pvalue=0.05,min_pearson=0.4)
selected_categorical

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(obj)


Unnamed: 0,Corr_Coef,P-value
BsmtQual_Ex,0.553105,9.157709000000001e-118
Foundation_PConc,0.497734,3.124704e-92
ExterQual_Gd,0.452466,1.380858e-74
BsmtFinType1_GLQ,0.434597,2.544986e-68
GarageFinish_Fin,0.419678,2.2742039999999997e-63
Neighborhood_NridgHt,0.402149,7.301451e-58
GarageFinish_Unf,-0.410608,1.7657100000000002e-60
BsmtQual_TA,-0.452394,1.46679e-74
KitchenQual_TA,-0.519298,1.285609e-101
ExterQual_TA,-0.589044,4.309567e-137


#### Filtering categorical variables: one hot encoding + linear regression coeficient´s significance

In [None]:
# Discarded because the coefficients depend on the amount of variables selected for the regression. It was not clear 
# wheather it was correct to iteratively consider one variable at a time

In [45]:
#selected_categorical = df2.loc[:,df2.dtypes == 'object']
#dummies = pd.get_dummies(selected_categorical.iloc[:,:],drop_first=True)
#X2 = sm.add_constant(dummies)
#est = sm.OLS(df2['SalePrice'], X2)
#est2 = est.fit()
#results_as_html = est2.summary().tables[1].as_html()

#lm_results = pd.read_html(results_as_html, header=0, index_col=0)[0]
#lm_results
#lm_results.sort_values('P>|t|',ascending=True)

#### Filtering categorical variables: Kendall’s tau correlation

In [None]:
# Kendall´s correlation is discarded: because Kendall´s tau is designed for "ordinal association", 
# which is not the case here for categorical variables and also because it doesn´t measure the "magnitude" or "strength" of
# the association between the variables, but just whether they are in the same order or not.

In [44]:
#selected_categorical = df2.loc[:,df2.dtypes == 'object']
#selected_categorical = pd.concat([selected_categorical,df2['SalePrice']],axis=1)

# Null hypothesis for Kendall Tau test is absence of association
#kendall_corr = pd.DataFrame(index=selected_categorical.columns,columns=['kendall_corr','p-value'])
#j = 0
#for i in selected_categorical:
#    result = kendalltau(selected_categorical.loc[:,i],df2['SalePrice'])
#    kendall_corr.iloc[j,0] = result.correlation
#    kendall_corr.iloc[j,1] = result.pvalue
#    j+=1
#kendall_corr[kendall_corr['p-value']<0.05].sort_values('kendall_corr',ascending=False)

#### Filtering categorical variables: ANOVA

In [None]:
# ANOVA is discarded: the assumption of the distribution of the error is not met.

In [32]:
#selected_categorical = df1.loc[:,df2.dtypes=='object']
#selected_categorical = pd.concat([selected_categorical,df2['SalePrice']],axis=1)

#mod = ols('SalePrice ~ LotShape',data=selected_categorical).fit()

##To test normality of errors execute the following:
##import scipy.stats as stats
##stats.shapiro(mod.resid)

#aov_table = sm.stats.anova_lm(mod, typ=2)
#print(aov_table)

# Post hoc test
#mc = MultiComparison(df2['SalePrice'], df['LotShape'])
#mc_results = mc.tukeyhsd()
#print(mc_results)

### Correlation and Integration

In [101]:
# Join selected variables

#Generate list of unique original variable names
a = list(set([i.split('_')[0] for i in selected_categorical.index]))

b = [i for i in selected_numerical.index]
c = ['SalePrice']
total_selected = a+b+c
total_selected

['ExterQual',
 'Neighborhood',
 'BsmtQual',
 'GarageFinish',
 'KitchenQual',
 'BsmtFinType1',
 'Foundation',
 'OverallQual',
 'TotalLivingArea',
 'GrLivArea',
 'GarageCars',
 'GarageArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'FullBath',
 'TotRmsAbvGrd',
 'MasVnrArea',
 'Fireplaces',
 'SalePrice']

In [102]:
a = pd.concat([df2[total_selected],df2['Id']],axis=1)
df3 = integrate_dummies(a,a['Id'])
df3

Unnamed: 0,OverallQual,TotalLivingArea,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,MasVnrArea,...,BsmtFinType1_Empty,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood
0,0.651256,0.384487,0.370207,0.311618,0.350880,-0.459145,-0.793162,0.789470,0.911897,0.513928,...,0,1,0,0,0,0,1,0,0,0
1,-0.071812,-0.475128,-0.482347,0.311618,-0.060710,0.466305,0.257052,0.789470,-0.318574,-0.570555,...,0,0,0,0,0,1,0,0,0,0
2,0.651256,0.530315,0.514836,0.311618,0.631510,-0.313261,-0.627611,0.789470,-0.318574,0.325803,...,0,1,0,0,0,0,1,0,0,0
3,0.651256,0.397919,0.383528,1.649742,0.790533,-0.687089,-0.521555,-1.025689,0.296662,-0.570555,...,0,0,0,0,0,0,0,0,0,0
4,1.374324,1.320854,1.298881,1.649742,1.697903,0.199611,-0.045596,0.789470,1.527133,1.366021,...,0,1,0,0,0,0,1,0,0,0
5,-0.794879,-0.283249,-0.292045,0.311618,0.032833,-0.595911,-0.948366,-1.025689,-0.933810,-0.570555,...,0,1,0,0,0,0,0,0,0,1
6,1.374324,0.353787,0.339758,0.311618,0.762470,1.432785,1.374522,0.789470,0.296662,0.458597,...,0,1,0,0,0,0,1,0,0,0
7,0.651256,1.113625,1.093354,0.311618,0.051542,0.112993,-0.143892,0.789470,0.296662,0.757383,...,0,0,0,0,0,1,0,0,0,0
8,0.651256,0.507290,0.492000,0.311618,-0.023293,-0.240320,-0.363764,0.789470,0.911897,-0.570555,...,0,0,0,0,1,0,0,0,0,0
9,-0.794879,-0.830103,-0.834405,-1.026506,-1.253387,-0.151422,-0.221494,-1.025689,-0.933810,-0.570555,...,0,1,0,0,0,0,0,0,0,0


In [103]:
# Filter highly correlated variables
correlation = df3.corr()
pre_selection = correlated_variables(correlation,'SalePrice')
pre_selection

Unnamed: 0,Variable1,Variable2,Corr_V1&V2,Corr_V1&Obj,Corr_V2&Obj
0,TotalLivingArea,GrLivArea,0.995718,0.716883,0.708624
0,GarageCars,GarageArea,0.882475,0.640409,0.623431
0,GrLivArea,TotRmsAbvGrd,0.825489,0.708624,0.533723
0,TotalLivingArea,TotRmsAbvGrd,0.820088,0.716883,0.533723
0,TotalBsmtSF,1stFlrSF,0.81953,0.613581,0.605852
0,BsmtFinType1_Empty,Foundation_Slab,0.801733,-0.152829,-0.11974
0,ExterQual_TA,KitchenQual_TA,0.6716,-0.589044,-0.519298
0,TotalLivingArea,FullBath,0.635296,0.716883,0.560664
0,GrLivArea,FullBath,0.630012,0.708624,0.560664
0,ExterQual_Gd,KitchenQual_Gd,0.628363,0.452466,0.321641


In [104]:
# Filter strongly correlated variables

pre_selection.reset_index(drop=True,inplace=True)
result = []
max_correlation = 0.9
for i in pre_selection.index:
    if pre_selection.loc[i,'Corr_V1&V2'] < max_correlation:
        a = pre_selection.loc[i,'Corr_V1&Obj']
        b = pre_selection.loc[i,'Corr_V2&Obj']
        if a>=b:
            result.append(pre_selection.loc[i,'Variable1'])
        else:
            result.append(pre_selection.loc[i,'Variable2'])
result

['GarageCars',
 'GrLivArea',
 'TotalLivingArea',
 'TotalBsmtSF',
 'Foundation_Slab',
 'KitchenQual_TA',
 'TotalLivingArea',
 'GrLivArea',
 'ExterQual_Gd',
 'OverallQual',
 'OverallQual']

## Transformation

## Modeling

In [79]:
x_train,x_test,y_train,y_test = train_test_split(df3.drop(['SalePrice','Id'],axis=1),df3['SalePrice'], test_size=0.33)

In [80]:
model = LinearRegression()

#Standardize x_train
std_x_train = (x_train[numerical_variables]-df1[numerical_variables].mean())/df1[numerical_variables].std()

model.fit(x_train, y_train)
score = model.score(x_test, y_test)
y_pred = np.maximum(model.predict(x_test),np.full(len(x_test),0))
print('R^2: ',score)
print('Mean Absolute Error (MAE): ',metrics.mean_absolute_error(y_pred,y_test))
print('Root Mean Square Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_pred,y_test)))
print('Mean Absolute Percentage Error (MAPE): ',mean_absolute_percentage_error(y_test,y_pred),'%')
print('Root Mean Square Logarithmic Error (RMSLE): ',np.sqrt(metrics.mean_squared_log_error(y_test,y_pred)))

R^2:  -4.602530826003815e+16
Mean Absolute Error (MAE):  15086801.3631799
Root Mean Square Error (RMSE): 110116673.9836079
Mean Absolute Percentage Error (MAPE):  3470308093.4219217 %


ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [None]:
#model.intercept_
#model.coef_

In [None]:
results = pd.DataFrame({'y_pred':y_pred[:],'y_test':y_test[:]})
results.head(10)

In [None]:
# Train Using Stats Model
X = x_train
y = y_train

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
results_as_html = est2.summary().tables[1].as_html()

In [None]:
# Display results
lm_results = pd.read_html(results_as_html, header=0, index_col=0)[0]
lm_results.sort_values('P>|t|',ascending=True)

In [None]:
#Validate OLS coef of determination (R**2)
model.score(x_train,y_train)

### Decision Tree

In [None]:
#tree = DecisionTreeClassifier(#class_weight=None, criterion='gini', max_depth=4,max_features=None, max_leaf_nodes=None,
                       #min_impurity_decrease=0.0, min_impurity_split=None,
                       #min_samples_leaf=200, min_samples_split=2,
                       #min_weight_fraction_leaf=0.0, presort=False,
                       #random_state=123, splitter='best'
                    )
#classifier = tree.fit(x_train,y_train)
#predictions = classifier.predict(x_test)
#accuracy_score(y_true = y_test, y_pred = predictions)