In [323]:
import requests
import json
from urllib.request import Request, urlopen
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
import pingouin as pg
#Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
#Load Census Data by Zipcode

#Get the url for ASC 5-year in 2019
api_key = 'd146f6b6c0c401d2455e10e7496a50cfb9144ff4'
massachusetts_code = '25' #code in census
#replace specific variable and zipcode with {} to fill in later
url = "https://api.census.gov/data/2019/acs/acs5?key={}&get=NAME,{}&for=zip%20code%20tabulation%20area:*&in=state:{}"

#Get median income, variable: B06011_001E
median_income_variable = 'B06011_001E'
median_income_response = requests.get(url.format(api_key,median_income_variable,massachusetts_code))
median_income_json = json.loads(median_income_response.text)
median_income = {}
for i in median_income_json:
    median_income[i[-1]] = i[1]

#Get median home value, variable: B25077_001E
median_value_variable = 'B25077_001E'
median_value_response = requests.get(url.format(api_key,median_value_variable,massachusetts_code))
median_value_json = json.loads(median_value_response.text)
median_value = {}
for i in median_value_json:
    median_value[i[-1]] = i[1]

In [None]:
#create DataFrame
df = pd.DataFrame({'median_value':pd.Series(median_value), 'median_income':pd.Series(median_income)})
#remove the value -666666666 which indicates no value for such zipcode
df = df[df['median_value'] != '-666666666']
df = df[df['median_income'] != '-666666666']
#remove the first row with old column name
df = df[1:]

In [None]:
#Get zipcode to search for colleges
zipcode_list = list(df.index.values)

In [None]:
#Load average college size by zipcode from College Scorecard

#set up api key
new_college_api_key = 'sOvF1OBVPgno951MkFexY4r8e5cq4FwVp8sGD3HE'
college_size = {}

#replace zipcode and api key with {} to fill in later
url = 'https://api.data.gov/ed/collegescorecard/v1/schools.json?_zip={}&distance=3&_fields=school.zip,id,school.name,2019.student.size&api_key={}'

#iterate through list of zipcode
for zipcode in zipcode_list:
    #Get the JSON
    college_response = requests.get(url.format(zipcode, new_college_api_key))
    college_json = json.loads(college_response.text)
    college_sizes = []
    #when the zipcode is not supported by college scorecard
    if 'errors' in college_json.keys():
        college_size[zipcode] = 0
        continue
    #when the zipcode is supported, iterate through results of api call
    for college in college_json['results']:
        if college['2019.student.size'] != None:
            college_sizes.append(college['2019.student.size'])
    
    #add to dictionary
    if len(college_sizes) > 0:
        college_size[zipcode] = sum(college_sizes) / len(college_sizes)
    else:
        college_size[zipcode] = 0

In [None]:
#Add the new college size column into df
df['college_size'] = pd.Series(college_size, index = df.index)
#Remove all columns value 0.0
df = df[df['college_size'] != 0.0]

In [321]:
#perform predictive modelling using both median_income and college_size
#copy the df
df_predictive = df.copy()

#creating the categorical variable
#calculating average value
average_value = pd.to_numeric(df_predictive['median_value']).sum() / len(df_predictive['median_value'])
#assigning categorical variable
func = lambda x: 'High' if int(x) > average_value else 'Low'
df_predictive['class_label'] = df_predictive['median_value'].apply(func, convert_dtype = True)

#set x and y
x = df_predictive[['median_income', 'college_size']]
y = df_predictive['class_label']
#split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)


#Using KNN classifier with 4 neighbors
neigh = KNeighborsClassifier(n_neighbors=4)
y_pred = neigh.fit(x_train, y_train).predict(x_test)

#Do 5 fold cross validation
scores = cross_val_score(neigh.fit(x_train, y_train), x, y, cv = 5)

#Get accuracy score and confusion matrix
acc = metrics.accuracy_score(y_test, y_pred)
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
f_scores = metrics.f1_score(y_test, y_pred, pos_label='High')

#Print the accuracy metrics
print('Scores:', scores)
print("Accuracy:", acc)
print("Confusion Matrix:", conf_matrix)
print('F_score:', f_scores)

Scores: [0.92307692 0.71794872 0.76315789 0.78947368 0.78947368]
Accuracy: 0.8205128205128205
Confusion Matrix: [[14  5]
 [ 2 18]]
F_score: 0.7999999999999999


I built a predictive model using median income and college size as predictors. I first created a categorical variable for home values, with home values above average value considered 'High', and others 'Low'. I used KNN nearest neighbors model type because it has consistently been the most accurate model type I have used. I tested out the model with 5 neighbors and 3 neighbors before settling with 4 neighbors which return the highest accuracy metrics.

The model performs well but not as well as I have hoped. Overall, it has the accuracy score of 82%, with the best out of 5-fold valuations being 92% accurate. The confusion matrix also shows an accuracy of 82%. Finally, I was a bit worried about the low accuracy score so I decided to use F-score, which examines the statistical significance of the model I chose. It results in an F-score of 0.8 when used to predict 'High' home value. The closer the F-score is to 1, the better. Therefore, I can conclude that this model is statistically significance and has averagely good accuracy.

In [331]:
#Perform statistical association analysis: pairwise_corr
#copy df
df_pairwise = df.copy()
#change all columns into numeric
df_pairwise = df_pairwise.astype(int)
#do the pairwise correlation
pg.pairwise_corr(df_pairwise, columns = ['median_value'])

Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,BF10,power
0,median_value,median_income,pearson,two-sided,192,0.71212,"[0.63, 0.78]",5.224763000000001e-31,6.965e+27,1.0
1,median_value,college_size,pearson,two-sided,192,0.147672,"[0.01, 0.28]",0.04094719,0.718,0.535786


The p-values (p-unc), which are extremely small, shows that both median income and college size are statistically significance in correlation with median home value. Moreover, we can be 95% confidence that the correlation between median value and median income is between 63% to 78%. However, we can be 95% confidence that the correlation between median value and college size is between 1% and 28%. This is a really wide gap which may indicate that we might not be so sure about the effects of college size on median home value. There is needs for a larger sample size to determine this effects. To further analyze the effects of the predictors, I want to use a linear regression.

In [342]:
#Perform statistical association analysis: linear_regression
#copy df
df_linear = df.copy()
#change all columns into numeric
df_linear = df_linear.astype(int)
#do the linear regression correlation
pg.linear_regression(df_linear[['median_income', 'college_size']], df_linear['median_value'])

Unnamed: 0,names,coef,se,T,pval,r2,adj_r2,CI[2.5%],CI[97.5%]
0,Intercept,-73077.839135,39631.409732,-1.843937,0.06675779,0.537684,0.532792,-151254.56295,5098.88468
1,median_income,11.722057,0.807177,14.522284,1.385565e-32,0.537684,0.532792,10.129823,13.31429
2,college_size,25.314087,7.160762,3.535111,0.0005125619,0.537684,0.532792,11.188803,39.439371


The linear regression analysis' p-values are as expected to be very low, indicating that the predictors are statistically significance. Moreover, we can see the particular effects of each predictor on median value. With every 1 unit increase in median income, the home value for the same area is increased by 11 units. Similarly, with every 1 unit increase in college size, the home value for the same area is increased by 25 units.