In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA, VectorAssembler, StandardScaler

In [2]:
# Obtain the notebook's Spark session: reuse an existing one if there is one,
# otherwise start a new session named "news" with Hive support enabled.
spark = SparkSession.builder.appName("news").enableHiveSupport().getOrCreate()

In [3]:
# Load the CSV (first line is a header, column types are inferred, rows that
# fail to parse are dropped) and discard the "Area" column.
df = spark.read.csv(
    '/home/worker/data/Data7602.csv',
    header=True,
    inferSchema=True,
    mode="DROPMALFORMED",
    encoding='UTF-8',
).drop("Area")

# Self-union three times. union() keeps duplicates, so every pass doubles the
# row count and the resulting frame holds 2**3 = 8 copies of each CSV row.
for doubling_pass in range(3):
    df = df.union(df)

print("==== 生データ ====")
df.show(truncate=False)

==== 生データ ====
+--------+----+---------+--------+
|anzsic06|year|geo_count|ec_count|
+--------+----+---------+--------+
|A       |2000|96       |130     |
|A       |2000|198      |110     |
|A       |2000|42       |25      |
|A       |2000|66       |40      |
|A       |2000|63       |40      |
|A       |2000|21       |12      |
|A       |2000|45       |60      |
|A       |2000|36       |60      |
|A       |2000|78       |18      |
|A       |2000|42       |9       |
|A       |2000|39       |35      |
|A       |2000|105      |20      |
|A       |2000|99       |30      |
|A       |2000|42       |12      |
|A       |2000|57       |9       |
|A       |2000|54       |15      |
|A       |2000|81       |25      |
|A       |2000|63       |50      |
|A       |2000|75       |50      |
|A       |2000|123      |30      |
+--------+----+---------+--------+
only showing top 20 rows



In [4]:
print("==== 元のデータフレーム行数 ====")
# Report the frame's dimensions as a (row count, column count) tuple.
# count() is a Spark action, so this line runs a job over the whole frame.
row_total = df.count()
column_total = len(df.columns)
print((row_total, column_total))

==== 元のデータフレーム行数 ====
(39071968, 4)


In [5]:
# Pack every column except the string-valued "anzsic06" column into a single
# vector column named "変量".
# The excluded column is chosen by name rather than with the positional slice
# df.columns[1:], so the feature set stays correct even if the CSV's column
# order changes. VectorAssembler cannot consume a string column.
excluded_col = "anzsic06"
feature_cols = [name for name in df.columns if name != excluded_col]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="変量")
feature_vectors = assembler.transform(df)
feature_vectors.show()

+--------+----+---------+--------+--------------------+
|anzsic06|year|geo_count|ec_count|                変量|
+--------+----+---------+--------+--------------------+
|       A|2000|       96|     130| [2000.0,96.0,130.0]|
|       A|2000|      198|     110|[2000.0,198.0,110.0]|
|       A|2000|       42|      25|  [2000.0,42.0,25.0]|
|       A|2000|       66|      40|  [2000.0,66.0,40.0]|
|       A|2000|       63|      40|  [2000.0,63.0,40.0]|
|       A|2000|       21|      12|  [2000.0,21.0,12.0]|
|       A|2000|       45|      60|  [2000.0,45.0,60.0]|
|       A|2000|       36|      60|  [2000.0,36.0,60.0]|
|       A|2000|       78|      18|  [2000.0,78.0,18.0]|
|       A|2000|       42|       9|   [2000.0,42.0,9.0]|
|       A|2000|       39|      35|  [2000.0,39.0,35.0]|
|       A|2000|      105|      20| [2000.0,105.0,20.0]|
|       A|2000|       99|      30|  [2000.0,99.0,30.0]|
|       A|2000|       42|      12|  [2000.0,42.0,12.0]|
|       A|2000|       57|       9|   [2000.0,57.0,

In [6]:
# Fit a scaler on the assembled vectors, then apply it to the same frame.
# withMean=True centres each vector component and withStd=True scales it to
# unit standard deviation; the result is written to "標準化変量".
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="変量",
    outputCol="標準化変量",
)
scalerModel = scaler.fit(feature_vectors)
std_feature_vectors = scalerModel.transform(feature_vectors)

In [7]:
print("==== 標準化されたデータ ====")
# Display only the standardized vector column, without truncating cell text.
standardized_column = std_feature_vectors.select("標準化変量")
standardized_column.show(truncate=False)

==== 標準化されたデータ ====
+-----------------------------------------------------------------+
|標準化変量                                                       |
+-----------------------------------------------------------------+
|[-1.6836394436497946,0.0315689322007757,-0.006186700502461097]   |
|[-1.6836394436497946,0.09283059431567955,-0.00939729984735488]   |
|[-1.6836394436497946,-8.637124482910466E-4,-0.02304234706315346] |
|[-1.6836394436497946,0.01355079628462751,-0.02063439755448312]   |
|[-1.6836394436497946,0.01174898269301269,-0.02063439755448312]   |
|[-1.6836394436497946,-0.013476407589594784,-0.025129236637334417]|
|[-1.6836394436497946,9.381011433237729E-4,-0.017423798209589336] |
|[-1.6836394436497946,-0.004467339631520685,-0.017423798209589336]|
|[-1.6836394436497946,0.02075805065108679,-0.024166056833866283]  |
|[-1.6836394436497946,-8.637124482910466E-4,-0.025610826539068484]|
|[-1.6836394436497946,-0.002665526039905866,-0.021437047390706565]|
|[-1.6836394436497946,0.036974372

In [8]:
# Fit a PCA model on the standardized vectors, keeping the top two principal
# components (k=2). Transforming with it writes projections to "主成分得点".
pca = PCA(
    inputCol="標準化変量",
    outputCol="主成分得点",
    k=2,
)
pcaModel = pca.fit(std_feature_vectors)

In [8]:
print("==== 固有ベクトル ====")
# pcaModel.pc is the principal-components matrix; each column is one component.
principal_axes = pcaModel.pc
print(principal_axes)

==== 固有ベクトル ====
DenseMatrix([[-0.00244874,  0.99999695],
             [-0.70710493, -0.00151372],
             [-0.70710439, -0.00194932]])


In [9]:
print("==== 寄与率 ====")
# Proportion of variance explained by each of the retained components.
variance_ratios = pcaModel.explainedVariance
print(variance_ratios)

==== 寄与率 ====
[0.6458571404833907,0.3333314889844293]


In [10]:
# Project the standardized vectors onto the fitted components, then keep only
# the resulting score column.
projected = pcaModel.transform(std_feature_vectors)
pca_score = projected.select("主成分得点")

print("==== 主成分得点 ====")
pca_score.show(truncate=False)

==== 主成分得点 ====
+-------------------------------------------+
|主成分得点                                 |
+-------------------------------------------+
|[-0.013825114875613533,-1.6836700425729052]|
|[-0.05487330913853759,-1.6837565169718303] |
|[0.02102686973362607,-1.683588091671154]   |
|[0.009131627896970279,-1.6836146050328742] |
|[0.01040569916663741,-1.6836118775952351]  |
|[0.031421017410038894,-1.683564931595154]  |
|[0.015779897878881847,-1.6836017714502085] |
|[0.019602111687883235,-1.6835935891372908] |
|[0.006532594614635932,-1.6836186304545417] |
|[0.022843052858232737,-1.6835830848865072] |
|[0.02116582655041403,-1.6835884934739191]  |
|[-0.005161069702944072,-1.6836438032413756]|
|[-0.003748041616488975,-1.6836414776065016]|
|[0.022502518522368987,-1.6835840236586286] |
|[0.01647269650989709,-1.6835967220747032]  |
|[0.01706569910783672,-1.683595872181307]   |
|[0.004463943227953387,-1.683623548360464]  |
|[0.009270584713758242,-1.6836150068356395] |
|[0.004174299635089723,