In [14]:
import pandas as pd
pd.set_option('display.max_columns', 300)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from uszipcode import SearchEngine
import pickle

In [32]:
# Load the feature file; the first CSV column is used as the row index.
df = pd.read_csv(
    'kc_house_data_test_features.csv',
    index_col=0,
)
# Print the schema summary (column names, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4323 entries, 0 to 4322
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             4323 non-null   int64  
 1   date           4323 non-null   object 
 2   bedrooms       4323 non-null   int64  
 3   bathrooms      4323 non-null   float64
 4   sqft_living    4323 non-null   int64  
 5   sqft_lot       4323 non-null   int64  
 6   floors         4323 non-null   float64
 7   waterfront     4323 non-null   int64  
 8   view           4323 non-null   int64  
 9   condition      4323 non-null   int64  
 10  grade          4323 non-null   int64  
 11  sqft_above     4323 non-null   int64  
 12  sqft_basement  4323 non-null   int64  
 13  yr_built       4323 non-null   int64  
 14  yr_renovated   4323 non-null   int64  
 15  zipcode        4323 non-null   int64  
 16  lat            4323 non-null   float64
 17  long           4323 non-null   float64
 18  sqft_liv

In [33]:
# Map every row's zipcode to its major city.
# Each distinct zipcode is resolved once and the result reused, instead of
# querying the zipcode database again for every row of the frame.
search = SearchEngine(simple_zipcode=True)
city_by_zip = {
    zipcode: search.by_zipcode(zipcode).major_city
    # .tolist() yields plain Python ints, the same type row iteration passes.
    for zipcode in df['zipcode'].unique().tolist()
}
main_city = [city_by_zip[zipcode] for zipcode in df['zipcode']]
df['main_city'] = main_city

In [34]:
# One-hot encode the city names and append the indicator columns to df.
# drop_first=True omits the first city level, which acts as the baseline.
# NOTE(review): the dropped level depends on which cities occur in this file;
# presumably it matches the level dropped when the model was fit - confirm.
cities = pd.get_dummies(df['main_city'], drop_first=True)
df = pd.concat([df, cities], axis='columns')

In [35]:
# Replace spaces in column names with underscores (e.g. 'Black Diamond').
df = df.rename(columns=lambda name: name.replace(' ', '_'))


In [36]:
# The date strings start with YYYYMM: characters 0-3 are the year and
# characters 4-5 the month. Slice them out and convert to integers.
df['year'] = df['date'].str[0:4].astype('int64')
df['month'] = df['date'].str[4:6].astype('int64')


In [37]:
def make_dummies(df,col_dummies):
    """Return a copy of ``df`` with one-hot indicator columns appended.

    For every column named in ``col_dummies`` the indicator columns are
    produced with ``pd.get_dummies`` using the prefix ``'<column>_'`` (which,
    combined with the default separator, yields names like ``'year__2015'``)
    and with the first level dropped. The source columns themselves are kept,
    and the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    col_dummies : iterable of str
        Names of the columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the indicator columns added on the right, in the
        order the columns were listed.
    """
    result = df.copy()
    for name in col_dummies:
        indicator_cols = pd.get_dummies(
            result[name],
            prefix='{}_'.format(name),
            drop_first=True,
        )
        result = pd.concat([result, indicator_cols], axis=1)
    return result

In [38]:
# One-hot encode the sale year/month and the ordinal view/condition ratings.
new_df = make_dummies(
    df,
    col_dummies=['year', 'month', 'view', 'condition'],
)

In [59]:
# Indicator columns grade__3 ... grade__13, in a fixed order.
# NOTE(review): presumably this is the grade column layout the saved model
# was fit on - confirm against the training notebook.
grade_cols = ['grade__{}'.format(level) for level in range(3, 14)]

# Encode every grade level found in this file, then align to the fixed
# layout: levels outside the list (e.g. grade 1) are dropped and levels
# missing from this file (e.g. grade 3) are added as all-zero columns.
# Unlike drop_first=True followed by a hard-coded `grade__3 = 0`, this does
# not depend on which grade happens to be the lowest one present.
dummies = (
    pd.get_dummies(new_df['grade'], prefix='grade_')
    .reindex(columns=grade_cols, fill_value=0)
)

# Sanity check: show the rows flagged as grade 4.
dummies[dummies['grade__4']==1]

Unnamed: 0,grade__3,grade__4,grade__5,grade__6,grade__7,grade__8,grade__9,grade__10,grade__11,grade__12,grade__13
104,0,1,0,0,0,0,0,0,0,0,0
762,0,1,0,0,0,0,0,0,0,0,0
876,0,1,0,0,0,0,0,0,0,0,0
1089,0,1,0,0,0,0,0,0,0,0,0
2054,0,1,0,0,0,0,0,0,0,0,0


In [61]:
# Append the grade indicator columns to the feature frame.
# Re-running this cell on its own would append the same columns again.
new_df = pd.concat(
    [new_df, dummies],
    axis='columns',
)

In [63]:
# Numeric columns whose extreme values are clamped by ext_values.
extreme_cols = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'sqft_above',
    'sqft_basement',
]

In [64]:
def ext_values(df, extreme_cols, n_std=5):
    """Return a copy of ``df`` with extreme values clamped per column.

    For each column in ``extreme_cols`` the mean and standard deviation are
    computed from ``df`` itself, and every value outside
    ``mean +/- n_std * std`` is replaced by the nearest bound. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the numeric columns to clamp.
    extreme_cols : iterable of str
        Names of the columns to clamp.
    n_std : float, default 5
        Half-width of the allowed band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns clamped.
    """
    new_df = df.copy()
    for col in extreme_cols:
        mean = new_df[col].mean()
        spread = n_std * new_df[col].std()
        # Vectorised clamp; a NaN bound (e.g. std of a single row) is ignored
        # by clip, so such a column is left unchanged.
        new_df[col] = new_df[col].clip(lower=mean - spread, upper=mean + spread)
    return new_df

In [65]:
# Clamp the size/count columns listed in extreme_cols.
new_df = ext_values(
    new_df,
    extreme_cols=extreme_cols,
)

In [66]:
# Show the distinct grade values present in this file.
new_df['grade'].unique()

array([ 8,  7,  6,  9, 10,  5,  4, 11, 12, 13,  1], dtype=int64)

In [67]:
# Drop the identifier/raw columns, including the categorical columns that
# have been replaced by their indicator columns.
inter_df = new_df.drop(
    columns=[
        'date',
        'id',
        'main_city',
        'year',
        'month',
        'view',
        'condition',
        'grade',
    ]
)


In [68]:
# Show the final feature columns, in the order they are passed to the model.
inter_df.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'Bellevue',
       'Black_Diamond', 'Bothell', 'Carnation', 'Duvall', 'Enumclaw',
       'Fall_City', 'Federal_Way', 'Issaquah', 'Kenmore', 'Kent', 'Kirkland',
       'Maple_Valley', 'Medina', 'Mercer_Island', 'North_Bend', 'Redmond',
       'Renton', 'Sammamish', 'Seattle', 'Snoqualmie', 'Vashon', 'Woodinville',
       'year__2015', 'month__2', 'month__3', 'month__4', 'month__5',
       'month__6', 'month__7', 'month__8', 'month__9', 'month__10',
       'month__11', 'month__12', 'view__1', 'view__2', 'view__3', 'view__4',
       'condition__2', 'condition__3', 'condition__4', 'condition__5',
       'grade__3', 'grade__4', 'grade__5', 'grade__6', 'grade__7', 'grade__8',
       'grade__9', 'grade__10', 'grade__11', 'grade__12', 'grade__13'],
      dtype='object')

In [69]:
# Load the saved ridge model and predict for every row of inter_df.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load a model file that comes from a trusted source.
with open('ridge_model', 'rb') as model_file:  # closes the handle after loading
    ridge = pickle.load(model_file)
y_pred_all = ridge.predict(inter_df)

# No RMSE is computed here: this notebook defines no target to compare against.
y_pred_all

array([623368.12108632, 630771.51962779, 354283.01372368, ...,
       213133.57188136, 412765.46481233, 205949.12360501])

In [72]:
# Wrap the predictions in a single-column frame named 'price'.
pred_df = pd.DataFrame({'price': y_pred_all})
pred_df

Unnamed: 0,price
0,623368.121086
1,630771.519628
2,354283.013724
3,264540.409648
4,498348.415558
...,...
4318,526927.062303
4319,472274.940065
4320,213133.571881
4321,412765.464812


In [73]:
# Write the predictions without a header row; the row index is still
# written as the first column.
pred_df.to_csv(
    'housing_preds_GAZA.csv',
    header=None,
)