In [300]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use('ggplot')
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [301]:
all_data = pd.read_csv('./Ames_data.csv', header=[0])
msk = np.random.rand(len(all_data)) < 0.7

all_data[msk].to_csv('train.csv',index=False)
all_data[~msk].to_csv('test.csv',index=False)

train_data_orig = pd.read_csv('./train.csv', header=[0])
test_data_orig = pd.read_csv('./test.csv', header=[0])
test_sale=test_data_orig.copy().Sale_Price
test_data_orig.drop('Sale_Price', axis=1, inplace=True)
test_data_orig.to_csv('test.csv',index=False)
# train_data_orig = all_data[msk]
# test_data_orig = all_data[~msk]


In [302]:
train_data=train_data_orig.copy()
test_data=test_data_orig.copy()

# Project 1
1. Language used Python
2. For feature evaluation and engineering I have used https://www.kaggle.com/leeclemmer/exploratory-data-analysis-of-housing-in-ames-iowa
3. All application depemdencies has been added to pipfile



#### Find categorical and numerical features
1. We divide the features into categorical and numerical features based on there types.
2. We also drop Longitude and Latitude from our analysis

In [303]:
def get_features_columns(dataset):
    num=dataset.select_dtypes(include=['int64','float64']).columns.drop(['PID','Sale_Price','Latitude','Longitude'])
    cat = dataset.select_dtypes(include=['object'])
    return list(num), list(cat) 

numerical_features,categorical_features=get_features_columns(train_data)



#### Sales Price data analysis
From the distribution plot for Sales price we see the it is slightly positively skewed. We will do a log transformation for further analysis 

In [304]:
# plt.figure(figsize=(10,6))
# sns.distplot(all_data.Sale_Price)
# plt.show()

#### Numerical feature data analysis
1. From the plots below we see that several of the numerical features are positively skewed and we will be log transforming these features
2. Couple of features looks like categorical, for example MS_SubClass. We will stringify them and treat as categorical.


In [305]:
# f = pd.melt(train_data, value_vars=sorted(numerical_features))
# g = sns.FacetGrid(f, col='variable', col_wrap=4, sharex=False, sharey=False)
# g = g.map(sns.distplot, 'value')

In [306]:
def stringify_numerical(dataset):
    dataset['MS_SubClass'] = dataset.MS_SubClass.apply(lambda x: str(x))
    dataset['Mo_Sold'] = dataset.Mo_Sold.apply(lambda x: str(x))
    dataset['Year_Sold'] = dataset.Year_Sold.apply(lambda x: str(x))
    return dataset

train_data=stringify_numerical(train_data)
test_data=stringify_numerical(test_data)

numerical_features,categorical_features=get_features_columns(train_data)

#### Categorical data analysis
1. From plots below we see that several categorical features can be easily converted to numerical features by assigning some kind of rank to various categories. For example Bsmnt_Cond can be categorised as -
    a Poor-0
    b Fair -1
    c. Typlical-2
    d. Good-3
    e. Excellent-4
2. For Year built we will create ranges for the year and assign it ranking. The older the year built the lower the ranking

In [307]:
# f = pd.melt(all_data, value_vars=sorted(categorical_features))
# g = sns.FacetGrid(f, col='variable', col_wrap=4, sharex=False, sharey=False)
# plt.xticks(rotation='vertical')
# g = g.map(sns.countplot, 'value')
# [plt.setp(ax.get_xticklabels(), rotation=60) for ax in g.axes.flat]
# g.fig.tight_layout()
# plt.show()

In [308]:
def create_numerical_from_categorical(dataset):
    dataset.Alley.replace({'No_Alley_Access':0,'Gravel':1, 'Paved':2}, inplace=True)
    dataset.Alley= dataset.Alley.apply(lambda i:0 if i not in [0,1,2] else i)
    dataset.BsmtFin_Type_1.replace({'No_Basement':0,'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}, inplace=True)
    dataset.BsmtFin_Type_1= dataset.BsmtFin_Type_1.apply(lambda i:0 if i not in [0,1,2,3,4,5,6] else i)
    dataset.BsmtFin_Type_2.replace({'No_Basement':0,'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}, inplace=True)
    dataset.BsmtFin_Type_2= dataset.BsmtFin_Type_2.apply(lambda i:0 if i not in [0,1,2,3,4,5,6] else i)
    
#     dataset.Bsmt_Cond.replace({'No_Basement':0,'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}, inplace=True)
#     dataset.Bsmt_Cond= dataset.Bsmt_Cond.apply(lambda i:0 if i not in [0,1,2,3,4,5] else i)
    
    dataset.Bsmt_Cond.replace({'No_Basement':0,'Poor':1, 'Fair':2, 'Typical':3, 'Good':4, 'Excellent':5}, inplace=True)
    dataset.Bsmt_Cond= dataset.Bsmt_Qual.apply(lambda i:0 if i not in [0,1,2,3,4,5] else i)
    
    dataset.Bsmt_Exposure.replace({'No_Basement':0,'No':1, 'Mn':2, 'Av':3, 'Gd':4}, inplace=True)
    dataset.Bsmt_Exposure= dataset.Bsmt_Exposure.apply(lambda i:0 if i not in [0,1,2,3,4] else i)
    
    dataset.Bsmt_Qual.replace({'No_Basement':0,'Poor':1, 'Fair':2, 'Typical':3, 'Good':4, 'Excellent':5}, inplace=True)
    dataset.Bsmt_Qual= dataset.Bsmt_Qual.apply(lambda i:0 if i not in [0,1,2,3,4,5] else i)
    
    dataset.Central_Air.replace({'Y':1,'N':0}, inplace=True)
    
    dataset.Exter_Cond.replace({'Poor':0, 'Fair':1, 'Typical':2, 'Good':3, 'Excellent':4}, inplace=True)
    dataset.Exter_Cond= dataset.Exter_Cond.apply(lambda i:0 if i not in [0,1,2,3,4] else i)
    
    dataset.Exter_Qual.replace({'Fair':0, 'Typical':1, 'Good':2, 'Excellent':3}, inplace=True)
    dataset.Exter_Qual= dataset.Exter_Qual.apply(lambda i:0 if i not in [0,1,2,3] else i)
    
    dataset.Fireplace_Qu.replace({'No_Fireplace':0,'Poor':1, 'Fair':2, 'Typical':3, 'Good':4, 'Excellent':5}, inplace=True)
    dataset.Fireplace_Qu= dataset.Fireplace_Qu.apply(lambda i:0 if i not in [0,1,2,3,4,5] else i)
    
    dataset.Functional.replace({'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8}, inplace=True)
    dataset.Functional= dataset.Functional.apply(lambda i:0 if i not in [0,1,2,3,4,5,6,7,8] else i)
    
    dataset.Garage_Qual.replace({'No_Garage':0,'Poor':1, 'Fair':2, 'Typical':3, 'Good':4, 'Excellent':5}, inplace=True)
    dataset.Garage_Qual= dataset.Garage_Qual.apply(lambda i:0 if i not in [0,1,2,3,4,5] else i)
    
    dataset.Garage_Finish.replace({'No_Garage':0,'Unf':1, 'RFn':2, 'Fin':3}, inplace=True)
    dataset.Garage_Finish= dataset.Garage_Finish.apply(lambda i:0 if i not in [0,1,2,3] else i)
    
    dataset.Garage_Cond.replace({'No_Garage':0,'Poor':1, 'Fair':2, 'Typical':3, 'Good':4, 'Excellent':5}, inplace=True)
    dataset.Garage_Cond= dataset.Garage_Cond.apply(lambda i:0 if i not in [0,1,2,3,4,5] else i)
    
    dataset.Heating_QC.replace({'Poor':0, 'Fair':1, 'Typical':2, 'Good':3, 'Excellent':4}, inplace=True)
    dataset.Heating_QC= dataset.Heating_QC.apply(lambda i:0 if i not in [0,1,2,3,4] else i)
    
    dataset.Kitchen_Qual.replace({'Poor':0, 'Fair':1, 'Typical':2, 'Good':3, 'Excellent':4}, inplace=True)
    dataset.Kitchen_Qual= dataset.Kitchen_Qual.apply(lambda i:0 if i not in [0,1,2,3,4] else i)
    
    dataset.Lot_Shape.replace({'Regular':3, 'Slightly_Irregular':2, 'Moderately_Irregular':1, 'Irregular':0}, inplace=True)
    dataset.Lot_Shape= dataset.Lot_Shape.apply(lambda i:0 if i not in [0,1,2,3] else i)
    
    dataset.Land_Slope.replace({'Sev':0, 'Mod':1, 'Gtl':2}, inplace=True)
    dataset.Land_Slope= dataset.Land_Slope.apply(lambda i:0 if i not in [0,1,2] else i)
    
    dataset.Land_Contour.replace({'Low':0, 'HLS':1, 'Bnk':2, 'Lvl':3}, inplace=True)
    dataset.Land_Contour= dataset.Land_Contour.apply(lambda i:0 if i not in [0,1,2,3] else i)
    
    dataset.Utilities.replace({'NoSeWa':0, 'NoSewr':1, 'AllPub':2}, inplace=True)
    dataset.Utilities= dataset.Utilities.apply(lambda i:0 if i not in [0,1,2] else i)
    
    dataset.Overall_Cond.replace({'Very_Poor':0, 'Poor':1,'Below_Average':2, 'Average':3, 'Above_Average':4,'Fair':5, 'Good':6, 'Very_Good':7,'Excellent':8}, inplace=True)
    dataset.Overall_Cond= dataset.Overall_Cond.apply(lambda i:0 if i not in [0,1,2,3,4,5,6,7,8] else i)
    
    dataset.Overall_Qual.replace({'Very_Poor':0, 'Poor':1,'Below_Average':2, 'Average':3, 'Above_Average':4,'Fair':5, 'Good':6, 'Very_Good':7,'Excellent':8}, inplace=True)
    dataset.Overall_Qual= dataset.Overall_Qual.apply(lambda i:0 if i not in [0,1,2,3,4,5,6,7,8] else i)
    
    dataset.Paved_Drive.replace({'No_Pavement':0, 'Dirt_Gravel':1, 'Paved':2}, inplace=True)
    dataset.Paved_Drive= dataset.Paved_Drive.apply(lambda i:0 if i not in [0,1,2] else i)
    
    dataset.Pool_QC.replace({'No_Pool':0, 'Fair':1, 'Typical':2, 'Good':3,'Excellent':4}, inplace=True)
    dataset.Pool_QC= dataset.Pool_QC.apply(lambda i:0 if i not in [0,1,2,3,4] else i)
    
    dataset.Street.replace({'Pave':1, 'Grvl':0}, inplace=True)
    dataset.Street= dataset.Street.apply(lambda i:0 if i not in [0,1] else i)
    
    def year_fixer(yr):
        if yr<1950:
            return 0
        if yr>=1950 and yr<1960:
            return 1
        if yr>=1960 and yr<1970:
            return 2
        if yr>=1970 and yr<1980:
            return 3
        if yr>=1980 and yr<1990:
            return 4
        if yr>=1990 and yr<2000:
            return 5
        if yr>=2000 and yr<2010:
            return 6
        if yr>=2010 :
            return 7
        
        
    dataset.Year_Built = dataset.Year_Built.apply(year_fixer)
    
    
    
    return dataset
    
    
train_data=create_numerical_from_categorical(train_data)
test_data=create_numerical_from_categorical(test_data)
    
numerical_features,categorical_features=get_features_columns(train_data)


In [309]:
def split_and_fix_features_and_label(dataset):
    all_data_copy = dataset.copy()

    all_data_copy.drop('PID', axis=1, inplace=True)
    all_data_copy.drop('Longitude', axis=1, inplace=True)
    all_data_copy.drop('Latitude', axis=1, inplace=True)
    all_response=None
    if 'Sale_Price' in all_data_copy:
        all_response = all_data_copy.Sale_Price
        all_data_copy.drop('Sale_Price', axis=1, inplace=True)

    return all_data_copy,all_response

train_feature_data,train_Y =split_and_fix_features_and_label(train_data)
test_feature_data,test_Y1 =split_and_fix_features_and_label(test_data)





#### Fixing Null data
From the plot below we see that Garage_Yr_Blt has null data. We will mark null values with 0

In [310]:
# missing_data = train_feature_data.isnull().sum() / train_feature_data.shape[0]
# missing_data[missing_data > 0].\
#     sort_values(ascending=True).\
#     plot(kind='barh', figsize=(10,6))
# plt.title('Percentage of missing values')
# plt.show()

In [311]:
def fill_na(featureset):
    featureset.Garage_Yr_Blt.fillna(0, inplace=True)
    return featureset

train_feature_data=fill_na(train_feature_data)
test_feature_data=fill_na(test_feature_data)

In [312]:
def log_skewed_data(featureset,label):
    features = numerical_features
    featureset.loc[:,features] = np.log1p(featureset.loc[:,features])
    label= np.log1p(label)
    return featureset,label
    
train_feature_data,train_Y=log_skewed_data(train_feature_data,train_Y)
# test_feature_data,test_Y=log_skewed_data(test_feature_data,test_Y)
test_feature_data.loc[:,numerical_features] = np.log1p(test_feature_data.loc[:,numerical_features])


#### Categorical data transformation
We will convert all categorical data by one hot encoding

In [313]:
def onehotrod_featureset(featureset):
    return pd.get_dummies(featureset)
    
train_feature_data=onehotrod_featureset(train_feature_data)
test_feature_data=onehotrod_featureset(test_feature_data)

train_feature_data, test_feature_data = train_feature_data.align(test_feature_data, join='inner', axis=1) 


In [314]:
from sklearn.metrics import mean_squared_error
def root_mean_sqr_error(actual_data,prediction_data):
#    return np.sqrt(np.mean(np.power((np.log(prediction_data)-np.log(actual_data)),2)))
    return np.sqrt(mean_squared_error(np.log(actual_data), prediction_data))


#### Standardize numerical features
Before creating the model we will center numerical data to mean 0 and scale it to unit variance

In [315]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_feature_data.loc[:,numerical_features]  = scaler.fit_transform(train_feature_data.loc[:,numerical_features] )
# test_feature_data.loc[:,numerical_features] = scaler.fit_transform(test_feature_data.loc[:,numerical_features])

In [316]:
output=pd.DataFrame()
output['PID'] =test_data_orig.PID

In [317]:
def xgb_predict(model,outputfile,params):
    clf = GridSearchCV(model,params, verbose=1,n_jobs=10,cv=3)

    model = clf.fit(train_feature_data,train_Y)

 
    train_pred = model.predict(train_feature_data)
    rmse_train=root_mean_sqr_error(train_Y,train_pred)
    print (f'RMSE for training data is :{rmse_train}')

    test_pred = model.predict(test_feature_data)
    rmse_test=root_mean_sqr_error(test_sale,test_pred)
    
#     print(test_y)
#     print(test_y -np.expm1(test_pred))
#     print(test_pred)
#     print(test_pred)

    output['Sale_Price'] = np.expm1(test_pred)
    output.to_csv(outputfile,index=False)
    print (f'RMSE for testing data is :{rmse_test}')
    return model

#### Model for prediction
For prediction I have used two models from XGBoost package. For the first model I have used a range of values mainly for different sampling rates to find best sampling rate for a conservative model. Also I have used a depth of 8, which may actually result in overfitting
For second model I have used parameters which are conservative. Also I have used a range of lower depths so as to avoid overfitting

In [318]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor()
# xgb_model=xgb_predict(xgb_model,'mysubmission1.txt',{'min_child_weight':[i/10.0 for i in range(0,10)], 'gamma':[i/10.0 for i in range(0,10)],'subsample':[i/10.0 for i in range(5,10)],'colsample_bytree':[i/10.0 for i in range(5,10)], 'max_depth': [8]})


xgb_model2 = xgb.XGBRegressor()
xgb_model2=xgb_predict(xgb_model2,'mysubmission2.txt',{'colsample_bytree':[0.2],'gamma':[0.0],'learning_rate':[0.01],'max_depth':[2,4,6],'min_child_weight':[1.5],'n_estimators':[7200],'reg_alpha':[0.9],'reg_lambda':[0.6],'subsample':[0.2]})




Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:   12.0s finished


RMSE for training data is :9.546114366060301
RMSE for testing data is :0.12726218123546013
