In [155]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [156]:
df = pd.read_csv('AmesHousing.csv')

In [157]:
df.shape

In [158]:
df.head()

In [159]:
df.info()

In [160]:
df.describe().T

In [161]:
list(df.columns)

In [162]:
sns.boxplot(data=df, x='SalePrice')

In [163]:
# The columns you mentioned are part of the Ames Housing Dataset, which is often used for housing price prediction and regression analysis. Here's the full meaning of each column:

# 1. `Order`: An identifier for each record, typically representing the order in which the data was collected or entered.

# 2. `PID`: Parcel identification number, a unique identifier for each property.

# 3. `MS SubClass`: The type of dwelling, represented as a numeric code.

# 4. `MS Zoning`: The zoning classification of the property, indicating how it can be used (e.g., residential, commercial, industrial).

# 5. `Lot Frontage`: Linear feet of street connected to the property.

# 6. `Lot Area`: Lot size in square feet.

# 7. `Street`: Type of road access to the property (e.g., paved or gravel).

# 8. `Alley`: Type of alley access to the property (e.g., paved, gravel, no alley access).

# 9. `Lot Shape`: General shape of the lot (e.g., regular, irregular).

# 10. `Land Contour`: Flatness of the property (e.g., level, banked, hillside).

# 11. `Utilities`: Type of utilities available (e.g., all public utilities, electricity only).

# 12. `Lot Config`: Lot configuration (e.g., inside lot, corner lot).

# 13. `Land Slope`: Slope of the property (e.g., gentle slope, moderate slope).

# 14. `Neighborhood`: Physical locations within the city of Ames, Iowa.

# 15. `Condition 1`: Proximity to various conditions (e.g., near a railroad, near park).

# 16. `Condition 2`: Proximity to other conditions (if more than one is present).

# 17. `Bldg Type`: Type of dwelling (e.g., single-family, townhouse).

# 18. `House Style`: Style of the dwelling (e.g., ranch, two-story).

# 19. `Overall Qual`: Overall material and finish quality of the house.

# 20. `Overall Cond`: Overall condition rating of the house.

# 21. `Year Built`: Year the house was originally built.

# 22. `Year Remod/Add`: Year of the most recent remodeling or addition.

# 23. `Roof Style`: Type of roof design.

# 24. `Roof Matl`: Roof material.

# 25. `Exterior 1st`: Exterior covering on the house.

# 26. `Exterior 2nd`: Secondary exterior covering (if applicable).

# 27. `Mas Vnr Type`: Masonry veneer type.

# 28. `Mas Vnr Area`: Masonry veneer area in square feet.

# 29. `Exter Qual`: Exterior material quality.

# 30. `Exter Cond`: Exterior material condition.

# 31. `Foundation`: Type of foundation.

# 32. `Bsmt Qual`: Height of the basement.

# 33. `Bsmt Cond`: General condition of the basement.

# 34. `Bsmt Exposure`: Walkout or garden level basement walls exposure.

# 35. `BsmtFin Type 1`: Quality of basement finished area.

# 36. `BsmtFin SF 1`: Type 1 finished square feet.

# 37. `BsmtFin Type 2`: Quality of the second finished area (if present).

# 38. `BsmtFin SF 2`: Type 2 finished square feet.

# 39. `Bsmt Unf SF`: Unfinished basement square feet.

# 40. `Total Bsmt SF`: Total square feet of basement area.

# 41. `Heating`: Type of heating.

# 42. `Heating QC`: Heating quality and condition.

# 43. `Central Air`: Central air conditioning.

# 44. `Electrical`: Electrical system.

# 45. `1st Flr SF`: First-floor square feet.

# 46. `2nd Flr SF`: Second-floor square feet.

# 47. `Low Qual Fin SF`: Low-quality finished square feet (e.g., low-quality basement finish).

# 48. `Gr Liv Area`: Above-grade (ground) living area square feet.

# 49. `Bsmt Full Bath`: Basement full bathrooms.

# 50. `Bsmt Half Bath`: Basement half bathrooms.

# 51. `Full Bath`: Full bathrooms above grade.

# 52. `Half Bath`: Half bathrooms above grade.

# 53. `Bedroom AbvGr`: Bedrooms above grade.

# 54. `Kitchen AbvGr`: Kitchens above grade.

# 55. `Kitchen Qual`: Kitchen quality.

# 56. `TotRms AbvGrd`: Total rooms above grade (does not include bathrooms).

# 57. `Functional`: Home functionality rating.

# 58. `Fireplaces`: Number of fireplaces.

# 59. `Fireplace Qu`: Fireplace quality.

# 60. `Garage Type`: Garage location and type.

# 61. `Garage Yr Blt`: Year garage was built.

# 62. `Garage Finish`: Interior finish of the garage.

# 63. `Garage Cars`: Size of garage in car capacity.

# 64. `Garage Area`: Size of garage in square feet.

# 65. `Garage Qual`: Garage quality.

# 66. `Garage Cond`: Garage condition.

# 67. `Paved Drive`: Paved driveway.

# 68. `Wood Deck SF`: Wood deck area in square feet.

# 69. `Open Porch SF`: Open porch area in square feet.

# 70. `Enclosed Porch`: Enclosed porch area in square feet.

# 71. `3Ssn Porch`: Three-season porch area in square feet.

# 72. `Screen Porch`: Screen porch area in square feet.

# 73. `Pool Area`: Pool area in square feet.

# 74. `Pool QC`: Pool quality.

# 75. `Fence`: Fence quality.

# 76. `Misc Feature`: Miscellaneous feature not covered in other categories.

# 77. `Misc Val`: Value of miscellaneous feature.

# 78. `Mo Sold`: Month Sold.

# 79. `Yr Sold`: Year Sold.

# 80. `Sale Type`: Type of sale.

# 81. `Sale Condition`: Condition of sale (e.g., normal, abnormal).

# These columns provide detailed information about various aspects of the properties in the dataset, and they can be used to analyze and predict housing prices.

In [164]:
# distribution of saleprice
sns.kdeplot(df['SalePrice'])

In [165]:
# sns.heatmap(df.corr())

In [166]:
df_num = df.select_dtypes(include='number')

In [167]:
# df.corr()['SalePrice'].sort_values()

# Data cleaning

In [168]:
# Outlier detection

In [169]:
# overall quality is highly correlated with the target variable
sns.scatterplot(data=df, x='Overall Qual', y='SalePrice')

In [170]:
sns.scatterplot(data=df, x='Gr Liv Area', y='SalePrice') # we can spot three data points that are deviating in the plot

In [171]:
# we remove the outliers from our DataFrame
df2 = df[~(df['Gr Liv Area']>= 4000) & (df['SalePrice']<=300000)]

In [172]:
df2.shape

In [173]:
sns.scatterplot(data=df2, x= 'Gr Liv Area', y='SalePrice')

In [174]:
df2[['Order','PID']]

In [175]:
# we drop the 'Order' and 'PID' columns
df3 = df2.drop(columns=['Order','PID'], axis=1)

In [176]:
df3.shape

In [177]:
df3.isnull().mean().mul(100)

In [178]:
def percent_missing(df):
    '''returns the percentage of missing values in each column of a dataframe'''
    percent_nan = df.isnull().mean().mul(100)
    percent_nan = percent_nan[percent_nan > 0].sort_values()
    return percent_nan

In [179]:
percent_nan = percent_missing(df3)

In [180]:
percent_nan

In [181]:
sns.barplot(x=percent_nan, y=percent_nan.index)

In [182]:
percent_nan[percent_nan<1]

In [183]:
df4 = df3.dropna(axis=0,subset=['Electrical','Garage Area'])

In [184]:
df4.shape

In [185]:
num_basement_cols = ['BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF','Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath']
df5 = df4.copy()
df5[num_basement_cols] = df5[num_basement_cols].fillna(0)

In [186]:
str_basement_cols = ['Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2']
df5[str_basement_cols]= df5[str_basement_cols].fillna('None')

In [187]:
percent_nan = percent_missing(df5)

In [188]:
percent_nan

In [189]:
percent_nan[percent_nan<1]

In [190]:
df5['Mas Vnr Type'].unique()

In [191]:
df5['Mas Vnr Type'] = df5['Mas Vnr Type'].fillna('None')
df5['Mas Vnr Area'] = df5['Mas Vnr Area'].fillna(0)

In [192]:
percent_nan = percent_missing(df5)
percent_nan

In [193]:
df5['Garage Type'].unique()

In [194]:
df5['Garage Yr Blt'].unique()

In [195]:
df5['Garage Finish'].unique()

In [196]:
df5['Garage Qual'].unique()

In [197]:
df5['Garage Cond'].unique()

In [198]:
garage_str =['Garage Type','Garage Finish', 'Garage Qual', 'Garage Cond']
df6 = df5.copy()
df6[garage_str] = df6[garage_str].fillna('None')

In [199]:
df6['Garage Yr Blt'] = df6['Garage Yr Blt'].fillna(0)

In [200]:
percent_nan = percent_missing(df6)

In [201]:
percent_nan

In [202]:
# 'Fence', 'Alley', 'Misc Feature' and 'Pool QC' has over 70% of it missing, it will be better to drop it
df7 = df6.copy()
df7 = df7.drop(columns=['Fence','Alley','Misc Feature','Pool QC'], axis=1)

In [203]:
percent_nan = percent_missing(df7)

In [204]:
percent_nan

In [205]:
df7['Fireplace Qu'].unique()

In [206]:
# seems some houses do not have a fireplace
df8 = df7.copy()
df8['Fireplace Qu'] = df8['Fireplace Qu'].fillna('None')

In [207]:
percent_nan = percent_missing(df8)

In [208]:
percent_nan

In [209]:
df8.columns

In [210]:
df8['Lot Frontage'] = df8.groupby('Neighborhood')['Lot Frontage'].transform(lambda value: value.fillna(value.mean()))

In [211]:
percent_nan = percent_missing(df8)
percent_nan

In [212]:
df8['Lot Frontage'] = df8['Lot Frontage'].fillna(0)

In [213]:
X = df8.copy()
y = X.pop("SalePrice")

# Label encoding for categoricals
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()

In [214]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y):
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y)
mi_scores[::3]  # show a few features with their MI scores

In [215]:
mi_scores.tail(41)

In [216]:
X.head()

In [217]:
# dropping featues with mutual_info less than 1

In [218]:
X = X.drop(columns = ['Garage Cond','Garage Qual','Wood Deck SF','Central Air','Mas Vnr Type','House Style','Lot Shape',
'Paved Drive','Sale Type','Bsmt Exposure','Half Bath','Enclosed Porch','Bsmt Cond','Electrical','Sale Condition','Bedroom AbvGr',
'Bldg Type','Condition 1','Yr Sold','BsmtFin Type 2','Lot Config','Bsmt Full Bath','Exter Cond','Screen Porch','BsmtFin SF 2','Heating',
'Misc Val','Roof Matl','Land Contour','Functional','Kitchen AbvGr','Street','Utilities','Land Slope','Low Qual Fin SF',
'Pool Area','Condition 2','Bsmt Half Bath','Roof Style','3Ssn Porch','Mo Sold'], axis=1)

In [219]:
mi_scores.head()

In [220]:
X.shape

In [221]:
features = ['Neighborhood','Overall Qual','Gr Liv Area','Year Built','Garage Area']
X = df8[features]

In [222]:
X.head()

In [223]:
X['Neighborhood'].unique()

In [224]:
X_str = X.select_dtypes(include='object')

In [225]:
X_num = X.select_dtypes(include='number')

In [226]:
dummy_feature = pd.get_dummies(X_str, drop_first=True)

In [227]:
dummy_feature

In [228]:
dummy_feature.shape

In [229]:
X_final = pd.concat([X_num, dummy_feature], axis=1)

In [230]:
X_final.head()

In [231]:
X_final.shape

In [232]:
y.shape

#  linear model

In [233]:
from sklearn.model_selection import train_test_split

In [234]:
from sklearn.ensemble import RandomForestRegressor

In [235]:
model = RandomForestRegressor(random_state =1)

In [236]:
X_train,X_test,y_train,y_test = train_test_split(X_final,y,train_size=0.7,test_size=0.3,random_state=1)

In [237]:
model.fit(X_train,y_train)

In [238]:
from sklearn.metrics import mean_absolute_error

In [239]:
train_preds = model.predict(X_train)

In [240]:
print(mean_absolute_error(y_train, train_preds))

In [241]:
test_preds = model.predict(X_test)

In [242]:
print('MAE:', mean_absolute_error(y_test,test_preds))

In [243]:
# the mean absolute error of the model prediction on our train data which was used for modelling is smaller(5,031) compared to
# the mean absolute error on test data (13,661), this implies that the model is overfitting because it performs better on the
# train data than test data. However we can reduce overfitting by tuning some parameters to bulild a better model

In [244]:
X_final.columns

In [245]:
X_final.head()

In [250]:
import numpy as np

def predict_SalePrice(Neighborhood, Overall_Qual, Gr_Liv_Area, Year_Built, Garage_Area):
    loc_index = np.where(X_final.columns == Neighborhood)[0]

    if len(loc_index) > 0:
        loc_index = loc_index[0]
    else:
        loc_index = -1

    x = np.zeros(len(X_final.columns))
    x[0] = Overall_Qual
    x[1] = Gr_Liv_Area
    x[2] = Year_Built
    x[3] = Garage_Area

    if loc_index >= 0:
        x[loc_index] = 1

    return model.predict([x])[0]

# Assuming X_final and model are defined elsewhere in your code
result = predict_SalePrice('Neighborhood_StoneBr', 6, 1656, 1960, 528)
print(result)


In [247]:
predict_SalePrice('Neighborhood_BrDale',6,1656,1960,528)

In [251]:
import pickle

# Check if lr_clf is a valid model object before saving
if hasattr(model, 'predict'):
    with open('Ames_house_prices_model.pickle', 'wb') as f:
        pickle.dump(model, f)
    print("Model saved successfully.")
else:
    print("lr_clf is not a valid model object.")

In [253]:
import json
columns = {
    'data_columns': [col.lower() for col in X_final.columns]
}
with open ('columns.json','w') as f:
    f.write(json.dumps(columns))

In [None]:
import json

columns = {
    'data_columns': [col.lower() for col in X_final.columns]
}

with open('columns.json', 'w') as f:
    json.dump(columns, f, indent=4)  # Use json.dump with indentation for better readability