In [5]:
import pandas as pd
import numpy as np
real_estate_df = pd.read_csv("realestate_data.csv",index_col=0)
real_estate_df

Unnamed: 0,人口（千人）,学年中央値,雇用者（千人）,健康サービス従業員（百人）,家の価格中央値（万ドル）
1,5.935,14.2,2.265,2.27,2.91
2,1.523,13.1,0.597,0.75,2.62
3,2.599,12.7,1.237,1.11,1.72
4,4.009,15.2,1.649,0.81,3.02
5,4.687,14.7,2.312,2.5,2.22
6,8.044,15.6,3.641,4.51,2.36
7,2.766,13.3,1.244,1.03,1.97
8,6.538,17.0,2.618,2.39,1.85
9,6.451,12.9,3.147,5.52,2.01
10,3.314,12.2,1.606,2.18,1.82


In [6]:
from sklearn.decomposition import PCA #主成分分析器

# 行列の標準化
df = real_estate_df.apply(lambda x: (x-x.mean())/x.std(ddof=0), axis=0)
display(df)

#主成分分析の実行
pca = PCA()
pca.fit(df)
# データを主成分空間に写像
feature = pca.transform(df)

# PCA の固有ベクトル
eig_vec = pd.DataFrame(pca.components_, columns=df.columns, index=["u{}".format(x + 1) for x in range(len(df.columns))])
display(eig_vec)

# PCA の固有値
eig = pd.DataFrame(pca.explained_variance_, index=["u{}".format(x + 1) for x in range(len(df.columns))], columns = ["固有値"])
display(eig)

# 寄与率
ev = pd.DataFrame(pca.explained_variance_ratio_, index=["u{}".format(x + 1) for x in range(len(df.columns))], columns = ["寄与率"])
display(ev)

# 主成分得点
print("主成分得点")
cor = pd.DataFrame(feature, columns=["u{}".format(x + 1) for x in range(len(df.columns))], index=[x+1 for x in range(len(df.index))])
display(cor)


Unnamed: 0,人口（千人）,学年中央値,雇用者（千人）,健康サービス従業員（百人）,家の価格中央値（万ドル）
1,0.80587,0.144964,0.362672,0.07289,0.665895
2,-1.400165,-0.713671,-1.571798,-1.051095,0.242144
3,-0.862157,-1.025902,-0.829556,-0.784888,-1.072947
4,-0.157145,0.925542,-0.351737,-1.006728,0.826629
5,0.18186,0.535253,0.41718,0.242967,-0.342341
6,1.860387,1.237773,1.958494,1.72929,-0.137771
7,-0.778655,-0.557555,-0.821437,-0.844046,-0.707644
8,1.107375,2.330581,0.772065,0.161626,-0.88299
9,1.063874,-0.869786,1.385575,2.476149,-0.649196
10,-0.504651,-1.416191,-0.401606,0.006338,-0.926826


Unnamed: 0,人口（千人）,学年中央値,雇用者（千人）,健康サービス従業員（百人）,家の価格中央値（万ドル）
u1,0.558359,0.313283,0.568258,0.486625,-0.174266
u2,0.131393,0.628873,0.004262,-0.309561,0.701006
u3,-0.007946,0.549031,-0.11728,-0.454924,-0.691225
u4,0.550553,-0.452654,0.268116,-0.647982,0.015107
u5,0.606465,-0.006565,-0.769041,0.201326,-0.014203


Unnamed: 0,固有値
u1,3.261888
u2,1.390456
u3,0.616491
u4,0.102737
u5,0.013044


Unnamed: 0,寄与率
u1,0.605779
u2,0.258228
u3,0.114491
u4,0.01908
u5,0.002422


主成分得点


Unnamed: 0,u1,u2,u3,u4,u5
1,0.620897,0.642828,-0.462791,0.438122,0.214087
2,-2.452249,-0.144357,0.114432,-0.184495,0.14926
3,-1.469159,-1.27115,0.639602,0.259682,-0.020949
4,-0.631616,1.451016,0.437249,0.065056,-0.0453
5,0.684187,0.047083,0.369602,-0.192918,-0.160273
6,3.40499,0.399292,-0.256364,-0.133567,-0.035921
7,-1.363644,-0.691222,0.669528,0.139684,0.003275
8,2.019702,0.945417,1.71703,-0.356344,0.107615
9,2.426986,-1.622902,-1.326212,-0.263381,0.093081
10,-0.789062,-1.610295,-0.08866,0.237421,0.026536


In [4]:
#標準化
std_data_df = (real_estate_df-real_estate_df.mean())/real_estate_df.std(ddof = 0) #ddof:標準偏差を計算する際にデータの個数で割り算を行う際、本来のデータの個数Nではなく”N-ddof”で割るようにする
display(std_data_df)

#分散共分散行列
cov_vec = np.cov(std_data_df.T, bias = 0) #bias:Falseだと不偏共分散,Trueにすると(データ数)で割る

# 固有値と固有ベクトル
eig_val, eig_vec =np.linalg.eig(cov_vec)

# 主成分得点
pca_cor = np.dot(std_data_df, eig_vec)

# 固有値ベクトルのマトリックス表示
eig_vec = pd.DataFrame(eig_vec, index = df.columns, columns = ["u{}".format(x + 1) for x in range(len(std_data_df.columns))])
display(eig_vec)

# 固有値
eig = pd.DataFrame(eig_val, index=["u{}".format(x + 1) for x in range(len(std_data_df.columns))], columns=['固有値']).T
display(eig)

# 寄与率
ev = eig_val / eig_val.sum()
ev = pd.DataFrame(ev, index=["u{}".format(x + 1) for x in range(len(std_data_df.columns))], columns=['寄与率']).T
display(ev)

# 主成分得点
print('主成分得点')
cor = pd.DataFrame(pca_cor, columns=["u{}".format(x + 1) for x in range(len(std_data_df.columns))], index=[x+1 for x in range(len(df.index))])
display(cor)

Unnamed: 0,人口（千人）,学年中央値,雇用者（千人）,健康サービス従業員（百人）,家の価格中央値（万ドル）
1,0.80587,0.144964,0.362672,0.07289,0.665895
2,-1.400165,-0.713671,-1.571798,-1.051095,0.242144
3,-0.862157,-1.025902,-0.829556,-0.784888,-1.072947
4,-0.157145,0.925542,-0.351737,-1.006728,0.826629
5,0.18186,0.535253,0.41718,0.242967,-0.342341
6,1.860387,1.237773,1.958494,1.72929,-0.137771
7,-0.778655,-0.557555,-0.821437,-0.844046,-0.707644
8,1.107375,2.330581,0.772065,0.161626,-0.88299
9,1.063874,-0.869786,1.385575,2.476149,-0.649196
10,-0.504651,-1.416191,-0.401606,0.006338,-0.926826


Unnamed: 0,u1,u2,u3,u4,u5
人口（千人）,0.558359,0.131393,-0.606465,-0.550553,0.007946
学年中央値,0.313283,0.628873,0.006565,0.452654,-0.549031
雇用者（千人）,0.568258,0.004262,0.769041,-0.268116,0.11728
健康サービス従業員（百人）,0.486625,-0.309561,-0.201326,0.647982,0.454924
家の価格中央値（万ドル）,-0.174266,0.701006,0.014203,-0.015107,0.691225


Unnamed: 0,u1,u2,u3,u4,u5
固有値,3.261888,1.390456,0.013044,0.102737,0.616491


Unnamed: 0,u1,u2,u3,u4,u5
寄与率,0.605779,0.258228,0.002422,0.01908,0.114491


主成分得点


Unnamed: 0,u1,u2,u3,u4,u5
1,0.620897,0.642828,-0.214087,-0.438122,0.462791
2,-2.452249,-0.144357,-0.14926,0.184495,-0.114432
3,-1.469159,-1.27115,0.020949,-0.259682,-0.639602
4,-0.631616,1.451016,0.0453,-0.065056,-0.437249
5,0.684187,0.047083,0.160273,0.192918,-0.369602
6,3.40499,0.399292,0.035921,0.133567,0.256364
7,-1.363644,-0.691222,-0.003275,-0.139684,-0.669528
8,2.019702,0.945417,-0.107615,0.356344,-1.71703
9,2.426986,-1.622902,-0.093081,0.263381,1.326212
10,-0.789062,-1.610295,-0.026536,-0.237421,0.08866


In [17]:
def power_method(A, eig_vec, n, eig):
    MAX_CAL=100
    eps=0.001 #小さな正の実数
    u = np.zeros(A.shape[0])
    np.random.seed(0)
    x_old = np.random.rand(A.shape[0])*10 #初期ベクトル
    x_old = np.floor(x_old)
    for k in range(MAX_CAL):
        u = x_old/np.linalg.norm(x_old)
        x_new = np.dot(A,u)
        con_judg_value = max(abs(x_new - x_old))
        if con_judg_value < eps :
            eig = np.dot(u,x_new)
            eig_vec = x_new
            return eig,eig_vec
        x_old=x_new
    return -1, -1
    

#標準化
std_data_df = (real_estate_df-real_estate_df.mean())
display(std_data_df)
#A行列(分散共分散行列)
A = np.cov(std_data_df.T, bias = 0) #bias:Falseだと不偏共分散,Trueにすると(データ数)で割る

eig_vec = np.zeros((len(A),len(A)))
eig = np.zeros(len(A))
for n in range(len(A)):
    #eig:固有値,eig_vec:固有ベクトル
    eig[n], eig_vec[n] = power_method(A, eig_vec[n], n, eig[n])
    lamda = eig[n]
    eig_vec[n] = eig_vec[n]/np.linalg.norm(eig_vec[n])
    A = A- lamda*np.dot(eig_vec[n].reshape(-1,1),eig_vec[n].reshape(1,-1))
            
# 固有値ベクトルのマトリックス表示
eig_vec_df = pd.DataFrame(eig_vec, columns=df.columns, index=["u{}".format(x + 1) for x in range(len(df.columns))])
display(eig_vec_df)

# 固有値
eig_df = pd.DataFrame(eig, index=["u{}".format(x + 1) for x in range(len(std_data_df.columns))], columns=['固有値']).T
display(eig_df)

# 寄与率
ev = eig / eig.sum()
ev_df = pd.DataFrame(ev, index=["u{}".format(x + 1) for x in range(len(std_data_df.columns))], columns=['寄与率']).T
display(ev_df)    

# 主成分得点
pca_cor = np.dot(std_data_df, eig_vec.T)
print('主成分得点')
cor_df = pd.DataFrame(pca_cor, columns=["u{}".format(x + 1) for x in range(len(std_data_df.columns))], index=[x+1 for x in range(len(real_estate_df.index))])
display(cor_df)

Unnamed: 0,人口（千人）,学年中央値,雇用者（千人）,健康サービス従業員（百人）,家の価格中央値（万ドル）
1,1.611714,0.185714,0.312714,0.098571,0.455714
2,-2.800286,-0.914286,-1.355286,-1.421429,0.165714
3,-1.724286,-1.314286,-0.715286,-1.061429,-0.734286
4,-0.314286,1.185714,-0.303286,-1.361429,0.565714
5,0.363714,0.685714,0.359714,0.328571,-0.234286
6,3.720714,1.585714,1.688714,2.338571,-0.094286
7,-1.557286,-0.714286,-0.708286,-1.141429,-0.484286
8,2.214714,2.985714,0.665714,0.218571,-0.604286
9,2.127714,-1.114286,1.194714,3.348571,-0.444286
10,-1.009286,-1.814286,-0.346286,0.008571,-0.634286


Unnamed: 0,人口（千人）,学年中央値,雇用者（千人）,健康サービス従業員（百人）,家の価格中央値（万ドル）
u1,0.78121,0.305668,0.334446,0.425993,-0.054348
u2,0.070793,0.763825,-0.082939,-0.579469,0.262482
u3,0.000993,-0.159621,0.014642,0.223865,0.961348
u4,-0.541786,0.546082,-0.050941,0.634164,-0.059181
u5,-0.300228,-0.011104,0.937421,-0.174263,0.024781


Unnamed: 0,u1,u2,u3,u4,u5
固有値,6.931074,1.785144,0.389639,0.229533,0.014155


Unnamed: 0,u1,u2,u3,u4,u5
寄与率,0.741327,0.190934,0.041675,0.02455,0.001514


主成分得点


Unnamed: 0,u1,u2,u3,u4,u5
1,1.437664,0.292512,0.436703,-0.752179,-0.198684
2,-3.534874,0.082982,-0.035586,0.175695,-0.167787
3,-2.400243,-0.644298,-0.745919,-0.37674,0.028521
4,-0.595226,1.845978,0.045054,-0.063626,0.04815
5,0.766745,0.267787,-0.255501,0.38131,0.157329
6,4.957485,-0.04533,0.208192,0.252692,0.038502
7,-2.131706,-0.062783,-0.618996,-0.205454,-0.001582
8,2.991391,2.096866,-0.996634,0.571004,-0.127082
9,3.171773,-2.856591,0.519986,0.32772,-0.101018
10,-1.420724,-1.599982,-0.324325,-0.383318,-0.018666
