In [3]:
# 4C

import numpy as np
from numpy.linalg import inv
import random

# build a 3 x 3 matrix of random integers
def random_matrix():
    """Return a 3 x 3 float array whose entries are random integers in [-10, 10].

    Entries are drawn one at a time with ``random.randint``, filling the
    matrix row by row.
    """
    rows = [[random.randint(-10, 10) for _ in range(3)] for _ in range(3)]
    return np.array(rows, dtype=float)

# print a random invertible matrix, its inverse, and their product
def report_matrix(label):
    """Draw a random 3 x 3 matrix and print it, its inverse, and their product.

    Parameters
    ----------
    label : str
        Name used in the printed headings (e.g. "A" gives "Matrix A:").

    The product of the matrix and its inverse is rounded to one decimal so
    that floating-point noise does not obscure the identity matrix.
    """
    # A random integer matrix is occasionally singular, and inv() raises
    # LinAlgError for a singular matrix.  Redraw until the matrix has full
    # rank so the report never aborts part-way through.
    matrix = random_matrix()
    while np.linalg.matrix_rank(matrix) < matrix.shape[0]:
        matrix = random_matrix()

    inv_matrix = inv(matrix)
    # Rounding tiny negative noise leaves -0.0 entries that print as "-0.";
    # adding 0.0 turns them into plain zeros without changing any other value.
    identity = np.round(np.matmul(matrix, inv_matrix), 1) + 0.0

    print("Matrix "+label+":")
    print(matrix)
    print("Matrix "+label+"^-1:")
    print(inv_matrix)
    print("Product "+label+label+"^-1:")
    print(identity)
    print()

# one report per matrix label
for label in ("A", "B", "C"):
    report_matrix(label)

Matrix A:
[[-3.  7.  6.]
 [-9.  6. -4.]
 [ 7. -6.  8.]]
Matrix A^-1:
[[ 0.07792208 -0.2987013  -0.20779221]
 [ 0.14285714 -0.21428571 -0.21428571]
 [ 0.03896104  0.10064935  0.1461039 ]]
Product AA^-1:
[[ 1.  0. -0.]
 [ 0.  1. -0.]
 [ 0. -0.  1.]]

Matrix B:
[[ -2.   0.  -7.]
 [-10.   8.   6.]
 [ -7.   0.  -3.]]
Matrix B^-1:
[[ 0.06976744 -0.         -0.1627907 ]
 [ 0.20930233  0.125      -0.23837209]
 [-0.1627907  -0.          0.04651163]]
Product BB^-1:
[[ 1.  0. -0.]
 [-0.  1. -0.]
 [ 0.  0.  1.]]

Matrix C:
[[  5. -10.   6.]
 [  6.   1.  -8.]
 [ -6.  -8.   1.]]
Matrix C^-1:
[[ 0.06382979  0.03850051 -0.07497467]
 [-0.04255319 -0.04154002 -0.07700101]
 [ 0.04255319 -0.10131712 -0.06585613]]
Product CC^-1:
[[ 1.  0. -0.]
 [ 0.  1.  0.]
 [-0.  0.  1.]]



In [4]:
# 5A

import pandas as pd

# load the house data CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="kc_house_data.csv")

# print the mean, min, max, and variance of a single feature
def print_stats(df, feature):
    """Print summary statistics for one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column to summarise.
    feature : str
        Name of a numeric column in ``df``.

    Prints the column name, its mean, minimum, maximum and variance (mean and
    variance rounded to two decimals), followed by a blank line.  The variance
    is the population variance: the sum of squared deviations from the mean
    divided by the number of values.  An empty column prints "no data"
    in place of the statistics.
    """
    values = df[feature].to_numpy()
    count = values.size

    print(feature)
    if count == 0:
        # nothing to average; guards the divisions by the count below
        print("no data")
        print()
        return

    # whole-array arithmetic replaces the per-row Python loops
    mean = float(values.sum()) / count

    # numerator of the variance equation, then divide by the number of values
    variance = float(((values - mean) ** 2).sum()) / count

    # print all stats for feature
    print("mean:", round(mean, 2))
    print("min:", values.min())
    print("max:", values.max())
    print("variance:", round(variance, 2))
    print()

# print stats for every column except the identifier, date, zipcode and response
for name in df.columns:
    if name not in ("id", "date", "zipcode", "price"):
        print_stats(df, name)


bedrooms
mean: 3.37
min: 0
max: 33
variance: 0.86

bathrooms
mean: 2.11
min: 0.0
max: 8.0
variance: 0.59

sqft_living
mean: 2079.9
min: 290
max: 13540
variance: 843494.65

sqft_lot
mean: 15106.97
min: 520
max: 1651359
variance: 1715579393.3

floors
mean: 1.49
min: 1.0
max: 3.5
variance: 0.29

waterfront
mean: 0.01
min: 0
max: 1
variance: 0.01

view
mean: 0.23
min: 0
max: 4
variance: 0.59

condition
mean: 3.41
min: 1
max: 5
variance: 0.42

grade
mean: 7.66
min: 1
max: 13
variance: 1.38

sqft_above
mean: 1788.39
min: 290
max: 9410
variance: 685702.94

sqft_basement
mean: 291.51
min: 0
max: 4820
variance: 195863.61

yr_built
mean: 1971.01
min: 1900
max: 2015
variance: 862.76

yr_renovated
mean: 84.4
min: 0
max: 2015
variance: 161338.75

lat
mean: 47.56
min: 47.1559
max: 47.7776
variance: 0.02

long
mean: -122.21
min: -122.519
max: -121.315
variance: 0.02

sqft_living15
mean: 1986.55
min: 399
max: 6210
variance: 469739.5

sqft_lot15
mean: 12768.46
min: 651
max: 871200
variance: 745483731.3

long has the lowest average with -122.21 (if going by magnitude, waterfront has the lowest average with 0.01).

sqft_lot has the highest average with 15106.97

waterfront has the lowest variance with 0.01

sqft_lot has the highest variance with 1715579393.3

In [8]:
# 5B

# calculate pearson correlation coefficient
def corr_coeff(df, feature, response="price"):
    """Return the Pearson correlation between two columns, rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both columns.
    feature : str
        Name of the numeric feature column.
    response : str, optional
        Name of the numeric response column; defaults to "price".

    Returns
    -------
    float
        The correlation coefficient, or NaN when it is undefined (the frame
        is empty, or either column is constant so its deviations sum to zero).
    """
    # Work on positional arrays: this does not depend on the index labels
    # being unique and avoids a per-row label lookup.
    x = df[feature].to_numpy(dtype=float)
    y = df[response].to_numpy(dtype=float)
    if x.size == 0:
        return float("nan")

    # deviations of each value from its column mean
    dx = x - x.mean()
    dy = y - y.mean()

    # Sums of products/squares of deviations.  The 1/n factors of the
    # covariance and the standard deviations cancel in the ratio, so they
    # are left out.
    cov = float((dx * dy).sum())
    sd_x = float((dx ** 2).sum()) ** 0.5
    sd_y = float((dy ** 2).sum()) ** 0.5

    denom = sd_x * sd_y
    if denom == 0:
        # a constant column has no spread, so the correlation is undefined
        return float("nan")

    return round(cov / denom, 2)

# feature names: every column except the identifier, date, zipcode and response
features = [
    name for name in df.columns
    if name not in ("id", "date", "zipcode", "price")
]

# correlation of each feature with price, in the same order as `features`
coeffs = [corr_coeff(df, feature) for feature in features]

# tabulate the correlations; the bare last expression displays the table
corr_table = pd.DataFrame({'Feature': features, 'Correlation': coeffs})
corr_table

Unnamed: 0,Feature,Correlation
0,bedrooms,0.31
1,bathrooms,0.53
2,sqft_living,0.7
3,sqft_lot,0.09
4,floors,0.26
5,waterfront,0.27
6,view,0.4
7,condition,0.04
8,grade,0.67
9,sqft_above,0.61


All features appear to be positively correlated with the response "price". The feature with the highest positive correlation was living-area square footage (sqft_living), with a coefficient of 0.70.

5C<br>
There do not appear to be any features with a negative correlation with price.