In [2]:
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [3]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [4]:
def solve_missing_value(df):
    column_to_fill_with_na = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
                          "GarageType", "GarageYrBlt", "GarageFinish", "GarageQual",
                          "GarageCond", "BsmtExposure", "BsmtFinType2", "BsmtQual",
                          "BsmtQual", "BsmtCond", "BsmtFinType1"]
    columns_to_fill_with_mode = ["Electrical", "MasVnrType", "GarageYrBlt"]
    columns_to_fill_with_median = ["LotFrontage", "MasVnrArea"]
    
    df[column_to_fill_with_na] = df[column_to_fill_with_na].fillna(
    "not available")
    for col in columns_to_fill_with_mode:
        df[col] = df[col].mode()[0]
    for col in columns_to_fill_with_median:
        df[col] = df[col].mode()[0]
    return df

In [9]:
train_df = pd.read_csv("../data/train.csv").drop(["Id"], axis = 1)
train_df = solve_missing_value(train_df)


test_df = pd.read_csv("../data/test.csv")
test_index = test_df.pop("Id")
test_df = solve_missing_value(test_df)

# Missing Value Analysis

In [12]:
temp = pd.DataFrame(test_df.isna().sum())

In [15]:
for column in temp[temp[0] > 0].index:
    print(test_df[column].value_counts())
    

RL         1114
RM          242
FV           74
C (all)      15
RH           10
Name: MSZoning, dtype: int64
AllPub    1457
Name: Utilities, dtype: int64
VinylSd    510
MetalSd    230
HdBoard    220
Wd Sdng    205
Plywood    113
CemntBd     65
BrkFace     37
WdShing     30
AsbShng     24
Stucco      18
BrkComm      4
CBlock       1
AsphShn      1
Name: Exterior1st, dtype: int64
VinylSd    510
MetalSd    233
HdBoard    199
Wd Sdng    194
Plywood    128
CmentBd     66
Wd Shng     43
BrkFace     22
Stucco      21
AsbShng     18
Brk Cmn     15
ImStucc      5
CBlock       2
Stone        1
AsphShn      1
Name: Exterior2nd, dtype: int64
0.0       462
24.0       15
276.0       6
602.0       6
300.0       5
         ... 
278.0       1
210.0       1
580.0       1
1328.0      1
771.0       1
Name: BsmtFinSF1, Length: 669, dtype: int64
0.0      1278
162.0       3
294.0       3
483.0       3
144.0       2
         ... 
308.0       1
167.0       1
186.0       1
250.0       1
750.0       1
Name: Bsmt

In [None]:
BsmtUnfSF, BsmtFinSF2

# Categorical and Numeric Columns Identification

In [6]:
columns = train_df.columns
unique_values = train_df.nunique().values
total_values = train_df.shape[0] - train_df.isna().sum().values
unique_df = pd.DataFrame(zip(columns, unique_values, total_values),
                         columns=["column_name", "unique_values",
                                  "total_values"])
columns_to_drop = unique_df[unique_df["unique_values"]==1]["column_name"].values.tolist()
columns_to_drop += ["Utilities", "Street", "Condition2"]
unique_df = unique_df[unique_df["unique_values"]>1]
train_df = train_df.drop(columns_to_drop, axis = 1)

In [7]:
numeric_columns = ["LotArea", "BsmtFinSF1", "BsmtFinSF2", "1stFlrSF", "2ndFlrSF",
                   "LowQualFinSF", "GrLivArea", "GarageArea", "WoodDeckSF",
                   "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "MiscVal",
                   "YearBuilt", "YearRemodAdd", "GarageYrBlt", "MoSold", "SalePrice",
                  "TotalBsmtSF","BsmtUnfSF"]
categorical_columns = unique_df[~unique_df["column_name"].isin(numeric_columns)]["column_name"].values

In [8]:
numeric_columns = list(set(numeric_columns) - set(columns_to_drop))
categorical_columns = list(set(categorical_columns) - set(columns_to_drop))

In [9]:
testing_df = train_df.copy()
train_x = testing_df[testing_df.columns[:-1]]
train_y = testing_df["SalePrice"].values

In [10]:
encoder = OneHotEncoder(handle_unknown='ignore')
temp = encoder.fit_transform(train_x[categorical_columns].values.tolist()).toarray()

In [11]:
train_x = train_x.drop(categorical_columns, axis = 1)
temp_df = pd.DataFrame(temp)
train_x = pd.merge(train_x, temp_df , left_on=train_x.index, right_on = temp_df.index )

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [13]:
X,x,Y,y = train_test_split(train_x, train_y)

In [14]:
model = LinearRegression()
model.fit(X,Y)

  linalg.lstsq(X, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
model.score(x,y)

0.726557104814476