### Introduction
In this project, I am going to identifies which zip codes are the most similar to zip code 11385 based on feature attributes.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

### 1. Input data

In [2]:
df_input = pd.read_csv('NYC_Demographics_Input.csv')
df_input = df_input.fillna(0)
df_input

Unnamed: 0,Name,Family Household,Nonfamily Household,Median Household Income
0,10001,4039,8958,92840.0
1,10002,16723,16736,36982.0
2,10003,7504,18539,118161.0
3,10004,819,958,190223.0
4,10005,1422,2941,189702.0
...,...,...,...,...
210,11691,14275,6437,50267.0
211,11692,5341,1965,46819.0
212,11693,3104,1713,60569.0
213,11694,5025,3472,84485.0


In [3]:
df_target = pd.read_csv('NYC_Demographics_Target.csv')
df_target

Unnamed: 0,Name,Family Household,Nonfamily Household,Median Household Income
0,11385,24696,10805,75340


In [4]:
frames = [df_target, df_input]

df = pd.concat(frames)
df

Unnamed: 0,Name,Family Household,Nonfamily Household,Median Household Income
0,11385,24696,10805,75340.0
0,10001,4039,8958,92840.0
1,10002,16723,16736,36982.0
2,10003,7504,18539,118161.0
3,10004,819,958,190223.0
...,...,...,...,...
210,11691,14275,6437,50267.0
211,11692,5341,1965,46819.0
212,11693,3104,1713,60569.0
213,11694,5025,3472,84485.0


### 2. Z-score Normalization

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_data = scaler.fit_transform(df)
normalized_data

array([[ 1.08275307e+00,  2.33242810e+00,  9.24903730e-01,
         1.85620377e-01],
       [-1.30821832e+00, -7.22997718e-01,  5.70948952e-01,
         5.75268175e-01],
       [-1.30649074e+00,  1.15312278e+00,  2.06150676e+00,
        -6.68443065e-01],
       [-1.30476316e+00, -2.10481356e-01,  2.40702948e+00,
         1.13905514e+00],
       [-1.30303558e+00, -1.19927555e+00, -9.62152415e-01,
         2.74355798e+00],
       [-1.30130800e+00, -1.11008439e+00, -5.82134914e-01,
         2.73195760e+00],
       [-1.29958042e+00, -1.23625364e+00, -9.23441606e-01,
         2.49465096e+00],
       [-1.29785284e+00, -1.09987844e+00, -8.94504317e-01,
         3.49702549e+00],
       [-1.29439768e+00,  2.09441861e-01,  2.57605390e+00,
        -7.31725579e-02],
       [-1.29267010e+00, -3.74220350e-01,  8.50739952e-01,
         1.46918702e+00],
       [-1.29094252e+00, -1.15521615e-01,  2.86772644e+00,
         1.58683839e+00],
       [-1.28921494e+00, -7.25660140e-01,  4.18022091e-01,
      

In [6]:
normalized_df = pd.DataFrame(normalized_data, columns=df.columns,index = df.Name)
normalized_df = normalized_df.drop(['Name'],axis = 1)
normalized_df

Unnamed: 0_level_0,Family Household,Nonfamily Household,Median Household Income
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11385,2.332428,0.924904,0.185620
10001,-0.722998,0.570949,0.575268
10002,1.153123,2.061507,-0.668443
10003,-0.210481,2.407029,1.139055
10004,-1.199276,-0.962152,2.743558
...,...,...,...
11691,0.791033,0.087830,-0.372645
11692,-0.530416,-0.769173,-0.449416
11693,-0.861296,-0.817466,-0.143265
11694,-0.577156,-0.480375,0.389239


In [7]:
df_target = normalized_df.iloc[:1]
df_target

Unnamed: 0_level_0,Family Household,Nonfamily Household,Median Household Income
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11385,2.332428,0.924904,0.18562


In [8]:
df_input = normalized_df.iloc[1:]
df_input

Unnamed: 0_level_0,Family Household,Nonfamily Household,Median Household Income
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10001,-0.722998,0.570949,0.575268
10002,1.153123,2.061507,-0.668443
10003,-0.210481,2.407029,1.139055
10004,-1.199276,-0.962152,2.743558
10005,-1.110084,-0.582135,2.731958
...,...,...,...
11691,0.791033,0.087830,-0.372645
11692,-0.530416,-0.769173,-0.449416
11693,-0.861296,-0.817466,-0.143265
11694,-0.577156,-0.480375,0.389239


### 3. Cosine similarity

In [9]:
cosine_sim = cosine_similarity(df_target, df_input)

In [10]:
df_input1 = pd.read_csv('NYC_Demographics_Input.csv')

In [11]:
sim = cosine_similarity(df_target,df_input).reshape(-1,1)
df_sim = df_input1.copy()
df_sim['Similarity'] = sim
df_sim

Unnamed: 0,Name,Family Household,Nonfamily Household,Median Household Income,Similarity
0,10001,4039,8958,92840.0,-0.384789
1,10002,16723,16736,36982.0,0.724083
2,10003,7504,18539,118161.0,0.289664
3,10004,819,958,190223.0,-0.401612
4,10005,1422,2941,189702.0,-0.346514
...,...,...,...,...,...
210,11691,14275,6437,50267.0,0.839907
211,11692,5341,1965,46819.0,-0.778973
212,11693,3104,1713,60569.0,-0.927650
213,11694,5025,3472,84485.0,-0.807433


In [12]:
df_sim["Rank"] = df_sim["Similarity"].rank(ascending=False)
df_sim

Unnamed: 0,Name,Family Household,Nonfamily Household,Median Household Income,Similarity,Rank
0,10001,4039,8958,92840.0,-0.384789,129.0
1,10002,16723,16736,36982.0,0.724083,55.0
2,10003,7504,18539,118161.0,0.289664,82.0
3,10004,819,958,190223.0,-0.401612,131.0
4,10005,1422,2941,189702.0,-0.346514,126.0
...,...,...,...,...,...,...
210,11691,14275,6437,50267.0,0.839907,40.0
211,11692,5341,1965,46819.0,-0.778973,195.0
212,11693,3104,1713,60569.0,-0.927650,212.0
213,11694,5025,3472,84485.0,-0.807433,201.0


In [13]:
df_sim.to_csv("Similarity.csv")