In [60]:
import pandas
import numpy as np
from sklearn.decomposition import PCA

# Load the closing-price table.
data = pandas.read_csv('close_prices.csv')

# Ticker columns used as features, in the order they are fed to PCA
# (this order defines the feature indices of pca.components_).
tickers = [
    'AXP', 'BA', 'CAT', 'CSCO', 'CVX', 'DD', 'DIS', 'GE', 'GS', 'HD',
    'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE',
    'PFE', 'PG', 'T', 'TRV', 'UNH', 'UTX', 'V', 'VZ', 'WMT', 'XOM',
]
X_train = np.array(data[tickers])

# Fit a 10-component PCA on the price matrix and show the share of variance
# explained by each component.
pca = PCA(n_components=10).fit(X_train)
print('Components:', pca.explained_variance_ratio_, sep='\n')
print()
# In the recorded run the first 4 components are enough to explain 90% of the
# variance: 0.73897118 + 0.11007169 + 0.04995088 + 0.0287492 ~ 0.928,
# while the first 3 alone give only ~ 0.899.

# Load the Dow Jones index values to compare against the first component.
# NOTE(review): assumes djia_index.csv has one row per row of
# close_prices.csv, in the same order -- confirm.
data_dj = pandas.read_csv('djia_index.csv')
DJ = np.array(data_dj['^DJI'])

# Project the training data onto the fitted components and keep the scores
# along the first one (column 0). The earlier assignment of
# explained_variance_ratio_[0] to this name was overwritten before any use,
# so it has been dropped.
first_component = pca.transform(X_train)[:, 0]

# Pearson correlation between the index and the first-component scores.
# np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is
# the coefficient (~0.91 in the recorded run).
print('Pirson k.:')
print(np.corrcoef(DJ, first_component))
print()

# Find the feature (company) with the largest weight in the first component.
# np.argmax returns the first index of the maximum, which matches the
# original strict-'>' scan without needing a sentinel start value.
# NOTE(review): this picks the largest signed weight; if the intent is the
# largest contribution regardless of sign, compare np.abs(...) -- confirm.
first_component_weights = pca.components_[0]
index = int(np.argmax(first_component_weights))
weight = first_component_weights[index]

print('Max weight in 1-st component: ' + '\t' + str(weight))
print('Max index in 1-st component: ' + '\t' + str(index))
# In the recorded run index = 26, which is column 'V' in the feature list.

Components:
[ 0.73897118  0.11007169  0.04995088  0.0287492   0.02215448  0.01931577
  0.00674853  0.00614091  0.00320594  0.00305611]

Pirson k.:
[[ 1.          0.90965222]
 [ 0.90965222  1.        ]]

Max weight in 1-st component: 	0.579683945747
Max index in 1-st component: 	26
