# Explore Dimensionality Reduction Approaches
1. PCA on numerical features
2. MCA on categorical features
3. FAMD on the mixed-data (numerical + categorical) dataframe

In [1]:
# import libraries

import os
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib
from matplotlib import pyplot as plt
from scipy import stats
import math

## Import functions from scripts:
* reading clean merged data
* factor analysis of mixed data

In [2]:
# Move into the project folder so that relative paths such as
# ./data/compiled.csv resolve (and, presumably, so the local scripts
# imported below can be found).
# The location can be overridden with the FINALPROJECT_DIR environment
# variable; when it is unset the original hard-coded path is used.
project_dir = os.environ.get(
    'FINALPROJECT_DIR',
    '/Users/trevor.mattos/Desktop/nycdsa/finalproject/cleancode',
)
os.chdir(project_dir)

# import functions from scripts
# NOTE(review): star imports hide where names come from. This notebook
# uses mycompiler and FAMD_ -- presumably from data_compiler and
# factor_analysis_mixed_data respectively; confirm before making explicit.
from data_compiler import *
from factor_analysis_mixed_data import *

#### ***df=mycompiler()*** brings in the full dataframe, clean and merged

In [4]:
# use data compiler to read in clean merged dataframe
#df=mycompiler()

#### To save time, read the clean merged dataset as a csv

In [6]:
# Load the previously compiled (clean, merged) dataset from disk; the path
# is relative to the current working directory.
df = pd.read_csv('./data/compiled.csv')

In [7]:
# Recast the categorical columns as object in a single astype call,
# to ensure that reading the CSV doesn't affect our analysis.
categorical_cols = [
    'special_features',
    'transaction_type',
    'listing_status',
    'listing_special_features',
    'zip',
]
df = df.astype({name: object for name in categorical_cols})

#### Drop price (target) for dimensionality reduction and clustering

In [8]:
# Remove the price column (our target for supervised learning later on)
# so it does not take part in dimensionality reduction.
df = df.drop(columns=['price'])

## PCA with numerical features
#### Note that we exclude lat and long

In [9]:
# Collect the names of the float64/int64 columns, skipping the first two
# columns of the dataframe.
numeric_dtypes = ('float64', 'int64')
numers = [col for col in df.columns[2:] if df[col].dtype in numeric_dtypes]

In [12]:
# PCA estimator and the scaler used to standardise its input
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# keep three principal components
pca = PCA(n_components=3)


In [13]:
# Standardise the numerical features for PCA: fit the scaler and transform
# the same data in one step. The fitted scaler stays available as `scaler`.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df[numers])

In [14]:
# fit PCA to the scaled numerical data (the fitted estimator is echoed as
# the cell output)
pca.fit(scaled_df)

PCA(n_components=3)

In [15]:
# examine the share of variance attributed to each of the principal
# components, shown as a plain list
list(pca.explained_variance_ratio_)

[0.389790992449892, 0.10758050708470814, 0.08565034610605664]

In [19]:
# Column labels for the loading table: pc1, pc2, ... (one per fitted
# principal component).
n_fitted_components = pca.components_.shape[0]
c_list = ['pc%d' % (idx + 1) for idx in range(n_fitted_components)]
    

In [35]:
# Loading vectors: rows are labelled with the numerical feature names and
# columns with the component labels, to assess correlations between
# components and features.
feature_names = df[numers].columns
loadings = pd.DataFrame(pca.components_.T, index=feature_names, columns=c_list)

# display the first rows of the loading table
loadings.head()


Unnamed: 0,pc1,pc2,pc3
beds,-0.080432,0.437268,0.260569
baths_full,-0.095839,0.441846,0.322004
square_footage,-0.092703,0.407962,0.305637
year_built,-0.146979,0.133306,0.282036
grocer_dist,-0.006645,-0.336494,0.269408


## Multiple Correspondence Analysis (categorical features)

#### First try with all categorical data

In [21]:
# Names of every column whose dtype is object (treated as categorical).
cats = [col for col in df.columns if df[col].dtype == 'object']

In [22]:
# set up MCA with 10 components; random_state is fixed so the run is
# repeatable
from prince import MCA
mca=MCA(n_components=10, n_iter=3, random_state=101)

In [23]:
# fit MCA on every object-dtype column listed in `cats`
mca.fit(df[cats])

  uniques = Index(uniques)


MCA(n_components=10, n_iter=3, random_state=101)

In [24]:
# display the explained inertia reported for each MCA component
# NOTE(review): this attribute may be named differently in newer prince
# releases -- confirm against the installed version
mca.explained_inertia_

[0.00042349226787438584,
 0.0004189996973328196,
 0.00041752234555208627,
 0.00041676307610658073,
 0.0004160226267005604,
 0.0004153091777298361,
 0.00041302333947006414,
 0.0004129749408885789,
 0.0004115016680274987,
 0.0004100676319091386]

## Attempt MCA on subset of categorical features (non-geographic categorical features, with a meaningful distribution of values)

In [25]:
# Reduced set of categorical features used for the second MCA attempt.
newcats = ['special_features', 'listing_status', 'listing_special_features']

In [26]:
# set up a fresh MCA with 3 components for the reduced categorical subset;
# this rebinds the name `mca` used for the earlier 10-component model
from prince import MCA
mca=MCA(n_components=3, n_iter=3, random_state=101)

In [27]:
# fit MCA on the reduced categorical subset only
mca.fit(df[newcats])

  uniques = Index(uniques)


MCA(n_components=3, n_iter=3, random_state=101)

In [28]:
# display the explained inertia reported for each of the 3 MCA components
mca.explained_inertia_

[0.1539220441317993, 0.1538461538461586, 0.15384615384615427]

## Factor Analysis of Mixed Data with full feature set

#### Create a subset of the numerical columns that keeps only one crime-rate feature

In [29]:
# Keep the first nine numerical feature names, in their original order.
newnumers = numers[:9]

In [30]:
# Columns fed to FAMD: the numerical subset followed by the categorical
# subset.
famdlist = [*newnumers, *newcats]

In [31]:
# Obtain the FAMD components, the variance explained and the processed
# dataframe. FAMD_ is brought in by the star imports near the top of the
# notebook (presumably from the factor_analysis_mixed_data script -- confirm).
(
    famd_components,
    famd_variance_explained,
    processed_dataframe,
) = FAMD_(df[famdlist], n_components=12)

  uniques = Index(uniques)


In [32]:
# preview the first five rows of the processed dataframe (head's default)
processed_dataframe.head()

Unnamed: 0,beds,baths_full,square_footage,year_built,grocer_dist,bank_dist,school_dist,walkscore,violent_crime_total_rate,special_features_1,...,special_features_20,special_features_512,special_features_514,listing_status_1,listing_special_features_1,listing_special_features_2,listing_special_features_3,listing_special_features_20,listing_special_features_512,listing_special_features_514
0,-0.668369,-0.496465,-0.510508,1.149556,1.725367,2.203153,0.362622,-0.625285,0.893674,-0.178712,...,-0.009329,-0.084988,-0.03613,0.018444,-0.178712,1.804267,-0.064631,-0.009329,-0.084988,-0.03613
1,-0.668369,-0.496465,-0.615014,-1.090088,2.530822,-0.222516,-0.213489,0.666836,0.580385,-0.178712,...,-0.009329,-0.084988,-0.03613,0.018444,-0.178712,-0.444657,-0.064631,-0.009329,-0.084988,-0.03613
2,-0.668369,-0.496465,-0.710272,-0.883352,2.572102,-0.077376,-0.424137,0.666836,0.580385,-0.178712,...,-0.009329,-0.084988,-0.03613,0.018444,-0.178712,-0.444657,-0.064631,-0.009329,-0.084988,-0.03613
3,0.327338,0.107501,-0.311669,-1.262369,2.580749,-0.791301,-0.390427,0.666836,0.580385,-0.178712,...,-0.009329,-0.084988,-0.03613,0.018444,-0.178712,-0.444657,-0.064631,-0.009329,-0.084988,-0.03613
4,0.327338,0.541624,0.683451,-0.02195,2.53134,-0.588489,-0.075493,0.666836,0.580385,-0.178712,...,-0.009329,11.68131,-0.03613,0.018444,-0.178712,-0.444657,-0.064631,-0.009329,11.68131,-0.03613


In [33]:
# take a quick peek at the components
famd_components

array([[ 1.0974554 , -2.60640459, -0.94922635, ...,  0.59877746,
        -0.34890884, -0.30210349],
       [-1.78543842, -0.84618129,  0.12736023, ...,  1.99213333,
         1.07024087, -0.08522108],
       [-1.74033724, -0.87297357,  0.09650077, ...,  2.00087299,
         1.07382994, -0.33230471],
       ...,
       [-0.81049378, -1.38974663, -0.01472136, ...,  0.03083026,
         0.11521436, -0.37521974],
       [-1.40605626, -0.69795204, -0.11733366, ...,  0.56914397,
         0.41851429,  0.46298957],
       [-2.67881749, -1.08948455,  0.10006522, ...,  0.35298805,
         0.01869401,  1.46309122]])

In [34]:
# check how much of the overall variance is attributed to each of the
# components returned by FAMD_
famd_variance_explained

array([0.14955372, 0.11500777, 0.09863883, 0.09757036, 0.0974401 ,
       0.09734954, 0.08849408, 0.0717146 , 0.05090371, 0.04105563,
       0.02569437, 0.01909849])