In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from   sklearn.linear_model import LinearRegression
from   sklearn.model_selection import train_test_split


%matplotlib inline
plt.rcParams['figure.figsize'] = (12.0, 8.0)
plt.style.use('seaborn-poster')

In [None]:
# Forward slashes resolve on every OS. A backslash-separated literal would depend on
# "\d" and "\k" being unrecognized escape sequences (a warning on recent Pythons)
# and would only resolve on Windows.
king = pd.read_excel("../data/king_county_home_sales.xlsx", index_col="Unnamed: 0")
king.sample(10)

Here is a first attempt at a simple linear regression model, using a home's `sqft_living` to model its sale price.

In [None]:
# Baseline: price as a function of sqft_living alone. This is just a pedagogical tool
# and will serve as a basis for comparison with the models that follow.
X, y = king[['sqft_living']], king[['price']]

# Generate train/test sets (30% of the rows are held out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42
)

# Init, fit, score
model = LinearRegression()
model.fit(X_train, y_train)

print(f"Training score: {model.score(X_train, y_train)}")

# Score on the held-out split only: scoring on the full (X, y) would mix the training
# rows into what is reported as the test score.
print(f"Test score: {model.score(X_test, y_test)}")


In [None]:
# Plot the fitted line over the raw observations
fig, ax = plt.subplots()
ax.scatter(king.sqft_living, king.price, alpha=0.3)

# Sort by x so the fitted line is drawn left to right instead of in row order
sqft_sorted = king[['sqft_living']].sort_values('sqft_living')
ax.plot(sqft_sorted['sqft_living'], model.predict(sqft_sorted), c='darkorange')

ax.set_title("Sales Price ~ sqft_living")
ax.set_xlabel("Square Footage (sqft_living)")
ax.set_ylabel("Sales Price")
fig.tight_layout()
plt.show()

Now it is time for a multiple linear regression:

In [None]:
# Subset of king's columns used as predictors for the multiple-regression model.
# (A bare `king.columns` here would display nothing: only the last expression of a
# cell is rendered, and this cell ends with an assignment.)
features = king[['bedrooms', 'bathrooms', 'sqft_living',
                 'sqft_lot', 'floors', 'waterfront', 'sqft_above',
                 'sqft_basement', 'lat', 'long', 'sqft_living15',
                 'sqft_lot15', 'view_ord', 'condition_ord', 'grade_ord', 'age', "renovated"]]

In [None]:
# Multiple-regression model (all features ~ price). Every column is used except yr_built,
# which is redundant with age, and zipcode, which is an arbitrary number that is not
# relevant to our dataframe yet.
y = king[['price']]
X = features

# Generate train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

# Init and fit in one step (fit returns the estimator itself), then score
multi_model = LinearRegression().fit(X_train, y_train)

print(f"Training score: {multi_model.score(X_train, y_train)}")
multi_model_score = multi_model.score(X_test, y_test)
print(f"Test score: {multi_model.score(X_test, y_test)}")

An $R^2$ score of 0.68 isn't bad, but it isn't great either. Maybe we have some collinearity issues going on and need to be more selective about our features, so it's time to do some feature selection.

In [None]:
def identify_correlated(df, threshold):
    """
    Identify columns that are highly correlated with another column.

    Only numeric (and boolean) columns are considered; any other columns are
    ignored rather than passed to ``DataFrame.corr``, which raises on
    non-numeric data in recent pandas versions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        A column is flagged when its absolute correlation with at least one
        column that appears *after* it in ``df`` is strictly greater than this.

    Returns
    -------
    list
        Labels of the flagged columns, in their original column order. Of each
        correlated pair only the earlier column is reported, so the last
        column is never flagged.
    """
    # Keep only the columns a correlation can be computed for
    numeric = df.select_dtypes(include=["number", "bool"])

    # Compute correlation matrix with absolute values
    matrix = numeric.corr().abs()

    # Hide the diagonal and the upper triangle so each pair is examined once
    mask = np.triu(np.ones(matrix.shape, dtype=bool))
    reduced_matrix = matrix.mask(mask)

    # Column-wise check; masked (NaN) cells compare as False
    exceeds = (reduced_matrix > threshold).any(axis=0)

    return exceeds[exceeds].index.tolist()

# Thanks to Towards Data Science for this function: https://towardsdatascience.com/how-to-use-pairwise-correlation-for-robust-feature-selection-20a60ef7d10

In [None]:
# Columns of king whose absolute correlation with another column exceeds 0.7;
# these are the candidates we proceed to drop.
to_drop = identify_correlated(df=king, threshold=.7)

to_drop

In [None]:
# Build feature/target arrays
X, y = king[['bedrooms','sqft_lot', 'floors', 'waterfront',
             'sqft_basement', 'sqft_living15',
             'sqft_lot15', 'view_ord', 'condition_ord', 'grade_ord', 'age', "renovated"]] , king[['price']]

# Generate train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42
)


# Init, fit, score
multi_model_red = LinearRegression()
multi_model_red.fit(X_train, y_train)

print(f"Training score: {multi_model_red.score(X_train, y_train)}")
multi_model_score_red = multi_model_red.score(X_test, y_test)
print(f"Test score: {multi_model_red.score(X_test, y_test)}")

print(f"\nOur r^2 score was {multi_model_score:.04f} and our new reduced score is {multi_model_score_red:.04f} this is a large decrease in r^2 after removing features that are collinear, so lets go back to our original model with all the features.")

So how should we proceed from here? One step is to calculate the interactions between different features.

In [None]:
# Display the available column labels before choosing inputs for the interaction model
king.columns

In [None]:
from sklearn.preprocessing import PolynomialFeatures

y = king['price']
X = king[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
          'sqft_above', 'sqft_basement','lat', 'long', 'zipcode',
          'sqft_living15', 'sqft_lot15', 'renovated', 'view_ord', 'condition_ord', 'grade_ord', 'age']]

# zipcode is left out of the interaction terms; it is dummied separately later on
X_no_zip = X.drop(columns='zipcode')

# degree=2 with interaction_only=True keeps the original columns and adds every
# pairwise product (no squared terms, no bias column)
linear_withinteraction = PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False)

X_transformed = linear_withinteraction.fit_transform(X_no_zip)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back for older installs. Passing the input columns yields real names
# (e.g. "bedrooms bathrooms") instead of generic "x0 x1" tokens.
if hasattr(linear_withinteraction, "get_feature_names_out"):
    raw_names = linear_withinteraction.get_feature_names_out(list(X_no_zip.columns))
else:
    raw_names = linear_withinteraction.get_feature_names(list(X_no_zip.columns))

feat_names = pd.Series(raw_names)

feat_names

In [None]:
# The transformer was fit on X without zipcode, so the generic tokens x0..xN index into
# that reduced column list, not into X.columns (which still contains zipcode).
columns = X.drop(columns='zipcode').columns
xs = [f"x{i}" for i in range(len(columns))]

map_dict = dict(zip(xs, columns))

# Translate token by token (interaction names look like "x1 x10"). A plain substring
# replace of "x1" would also rewrite the front of "x10".."x16". Tokens that are not
# generic placeholders (e.g. labels that are already real column names) pass through.
feat_names = feat_names.map(
    lambda name: " ".join(map_dict.get(token, token) for token in name.split(" "))
)


X_trans_df = pd.DataFrame(X_transformed)
X_trans_df.columns = feat_names

X_trans_df.head()

In [None]:
# X_trans_standard = X_trans_df.apply(lambda x: (x - x.mean())/x.std() )
# y_standard = (y - y.mean())/y.std()

In [None]:
# Generate train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_trans_df, y, test_size=.3, random_state=42
)


# Init, fit, score
stand_model = LinearRegression()
stand_model.fit(X_train, y_train)

print(f"Training score: {stand_model.score(X_train, y_train)}")

print(f"Testing score: {stand_model.score(X_test, y_test)}")

stand_model_score  = stand_model.score(X_test, y_test)

Now one last model except now with zip code as a dummied feature instead of as an integer value!

In [None]:
# One indicator column per distinct zipcode value; the first level is dropped
dummy_zip = pd.get_dummies(king.loc[:, 'zipcode'], drop_first=True)

One more model with all of our features alongside dummied zip code data.

In [None]:
# Subset of king's columns used as the base predictors for the zip-code model.
# (A bare `king.columns` here would display nothing: only the last expression of a
# cell is rendered, and this cell ends with an assignment.)
features = king[['bedrooms', 'bathrooms', 'sqft_living',
                 'sqft_lot', 'floors', 'waterfront', 'sqft_above',
                 'sqft_basement', 'sqft_living15', 'lat', 'long',
                 'sqft_lot15', 'view_ord', 'condition_ord', 'grade_ord', 'age', "renovated"]]

In [None]:
# Attach the zip-code dummies. Dropping any dummy columns that are already present
# keeps this cell safe to re-run: a second plain join would raise on the overlapping
# column names.
features = features.drop(columns=dummy_zip.columns, errors='ignore').join(dummy_zip)

# The dummy columns are labelled by zipcode value, which may not be a string, and
# recent scikit-learn rejects frames whose column labels mix str and non-str types.
# Give the model an all-string view of the labels.
X, y = features.rename(columns=str), king[['price']]

# Generate train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42
)


# Init, fit, score
# NOTE: this rebinds multi_model / multi_model_score, which the earlier all-features
# cell also assigns.
multi_model = LinearRegression()
multi_model.fit(X_train, y_train)

print(f"Training score: {multi_model.score(X_train, y_train)}")
multi_model_score = multi_model.score(X_test, y_test)
print(f"Test score: {multi_model_score}")