In [1]:
%matplotlib inline
import os
import time
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn import preprocessing
from sklearn.cluster import KMeans

In [None]:
# Fetch the Wikipedia S&P 100 page and parse it with the lxml parser.
url = 'https://en.wikipedia.org/wiki/S%26P_100'
r = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# leave the symbol list silently empty further down.
r.raise_for_status()
soup = BeautifulSoup(r.text, "lxml")

In [None]:
# Collect ticker symbols and company names from the first table whose class is
# 'wikitable sortable'. Its <td> cells are treated as alternating
# symbol, name, symbol, name, ...
symbols, names = [], []
i = 0  # running count of <td> cells consumed
for table in soup.find_all('table', {'class': 'wikitable sortable'})[:1]:
    cell_texts = [td.string for td in table.find_all('td')]
    symbols.extend(cell_texts[0::2])  # even positions -> symbols
    names.extend(cell_texts[1::2])    # odd positions  -> names
    i += len(cell_texts)

In [None]:
# Map each ticker symbol to its company name (pairs are matched by position).
data = dict(zip(symbols, names))

In [None]:
# Display the symbol -> company-name mapping built in the previous cell.
data

In [None]:
# Download one historical-price CSV per symbol into stocks/<symbol>.csv.
# example url 'http://www.google.com/finance/historical?q=AAPL&output=csv'

base_url = 'http://www.google.com/finance/historical?'
user_agent = {'User-agent': 'Mozilla/5.0'}

for i, symbol in enumerate(symbols):
    url = base_url + 'q=' + symbol + '&output=csv'
    r = requests.get(url, headers=user_agent)
    if r.ok:
        # File name is the lower-cased symbol; the loading cell derives the
        # ticker back from it.
        with open('stocks/' + symbol.lower() + '.csv', 'w') as f:
            f.write(r.text)
    else:
        # Single pre-formatted argument so the output is identical on
        # Python 2 and Python 3 (the bare `print` statement is Python 2 only).
        print("Error at: %s" % symbol)
    # Pause on every tenth iteration (i = 0, 10, 20, ...) to throttle requests.
    if i % 10 == 0:
        time.sleep(5)

In [2]:
# glob returns matches in arbitrary, filesystem-dependent order; sort so the
# concatenated frame has the same row order on every run and machine.
csv_files = sorted(glob('stocks/*.csv'))

In [3]:
# Load every downloaded CSV and tag its rows with the ticker taken from the
# file name, then stack everything into a single frame.
frames = []
for csv_file in csv_files:
    tmp_df = pd.read_csv(csv_file)
    # basename copes with any OS path separator; splitext strips only the final
    # ".csv", so a dotted ticker (file "brk.b.csv") keeps its full symbol
    # instead of being cut at the first dot.
    file_stem = os.path.splitext(os.path.basename(csv_file))[0]
    tmp_df['Stock'] = file_stem.upper()
    frames.append(tmp_df)
df = pd.concat(frames)

In [4]:
# Inspect the concatenated price data (columns: Date, Open, High, Low, Close, Volume, Stock).
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Stock
0,6-Apr-17,144.29,144.52,143.45,143.66,21149034,AAPL
1,5-Apr-17,144.22,145.46,143.81,144.02,27717854,AAPL
2,4-Apr-17,143.25,144.89,143.17,144.77,19891354,AAPL
3,3-Apr-17,143.71,144.12,143.05,143.70,19985714,AAPL
4,31-Mar-17,143.72,144.27,143.01,143.66,19661651,AAPL
5,30-Mar-17,144.19,144.50,143.50,143.93,21207252,AAPL
6,29-Mar-17,143.68,144.49,143.19,144.12,29189955,AAPL
7,28-Mar-17,140.91,144.04,140.62,143.80,33374805,AAPL
8,27-Mar-17,139.39,141.22,138.62,140.88,23575094,AAPL
9,24-Mar-17,141.50,141.74,140.35,140.64,22395563,AAPL


In [5]:
# Raw 2-D array view of the frame. `.values` is the equivalent of the
# deprecated (and later removed) `DataFrame.as_matrix()`.
stocks = df.values

In [6]:
# Normalize the four price columns (positions 1-4: Open, High, Low, Close)
# and write the result back into the same slots of `stocks`.
price_cols = slice(1, 5)
stocks[:, price_cols] = preprocessing.normalize(stocks[:, price_cols])



In [7]:
# Rebuild the frame from the normalized array, renaming the four price
# columns to make the normalization explicit.
columns = [
    'Date',
    'OpenNormalized',
    'HighNormalized',
    'LowNormalized',
    'CloseNormalized',
    'Volume',
    'Stock',
]
df = pd.DataFrame(data=stocks, columns=columns)

In [8]:
# Preview the first rows to check the renamed, normalized price columns.
df.head()

Unnamed: 0,Date,OpenNormalized,HighNormalized,LowNormalized,CloseNormalized,Volume,Stock
0,6-Apr-17,0.501074,0.501873,0.498157,0.498886,21149034,AAPL
1,5-Apr-17,0.49945,0.503744,0.49803,0.498757,27717854,AAPL
2,4-Apr-17,0.497319,0.503012,0.497041,0.502596,19891354,AAPL
3,3-Apr-17,0.500224,0.501652,0.497927,0.50019,19985714,AAPL
4,31-Mar-17,0.500189,0.502103,0.497718,0.49998,19661651,AAPL


In [9]:
def dist(s1, s2):
    """Return the sum of squared element-wise differences between s1 and s2.

    Both arguments must support subtraction with each other, and the result
    must support ``** 2`` and ``.sum()`` (e.g. NumPy arrays of matching or
    broadcastable shape). The total is taken over all elements.
    """
    delta = s1 - s2
    squared = delta ** 2
    return squared.sum()

In [10]:
# Unique tickers in sorted order. A bare list(set(...)) has no guaranteed
# order, which would make the row/column order of the distance matrix (and
# anything derived from it) vary between runs.
stock_names = sorted(set(df['Stock']))

In [11]:
# Array view of the normalized frame; `.values` replaces the deprecated
# `DataFrame.as_matrix()`.
# NOTE(review): `stocks` is not read by any later cell visible here.
stocks = df.values

In [12]:
# Build a symmetric n x n matrix of pairwise distances between stocks, using
# the four normalized price columns (positions 1-4) of each stock's rows.
n = len(stock_names)

# Slice each stock's price block once up front instead of re-filtering `df`
# twice for every (i, j) pair inside the loops.
price_blocks = [
    df[df['Stock'] == stock_name].values[:, 1:5]
    for stock_name in stock_names
]

distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i):
        # Rows are compared by position, not by date.
        # NOTE(review): assumes every stock has the same number of rows in the
        # same date order - confirm against the downloaded data.
        dist_result = dist(price_blocks[i], price_blocks[j])
        # Fill both triangles; the diagonal stays 0.
        distance_matrix[i, j] = dist_result
        distance_matrix[j, i] = dist_result

In [13]:
# Cluster the stocks, using each row of the distance matrix as that stock's
# feature vector. n_clusters=8 spells out scikit-learn's default (the value
# reported by clusters.n_clusters); random_state pins the centroid
# initialization so cluster assignments are reproducible across runs.
clusters = KMeans(n_clusters=8, random_state=0).fit(distance_matrix)

In [14]:
# Number of clusters the KMeans estimator was configured with.
clusters.n_clusters

8

In [15]:
# Assign each stock to a cluster, then group ticker names by cluster label.
predicted = clusters.predict(distance_matrix)

cluster_groups = {}
for label, ticker in zip(predicted, stock_names):
    cluster_groups.setdefault(label, []).append(ticker)

# Alphabetize the tickers inside every cluster for readable output.
for label in cluster_groups:
    cluster_groups[label].sort()

In [16]:
# Display the tickers in each cluster, keyed by cluster label.
cluster_groups

{0: ['AAPL',
  'ACN',
  'AIG',
  'ALL',
  'AXP',
  'BA',
  'BLK',
  'BRK',
  'CL',
  'CMCSA',
  'COST',
  'CSCO',
  'CVX',
  'DHR',
  'DIS',
  'FDX',
  'GD',
  'GE',
  'GOOG',
  'GOOGL',
  'HD',
  'HON',
  'IBM',
  'INTC',
  'JNJ',
  'JPM',
  'KO',
  'LOW',
  'MA',
  'MCD',
  'MDT',
  'MMM',
  'MO',
  'MRK',
  'MSFT',
  'NEE',
  'ORCL',
  'PEP',
  'PFE',
  'PG',
  'PM',
  'RTN',
  'SBUX',
  'SO',
  'T',
  'TXN',
  'UNH',
  'UPS',
  'USB',
  'UTX',
  'V',
  'VZ',
  'WMT',
  'XOM'],
 1: ['COP', 'HAL', 'KMI'],
 2: ['ABBV', 'BMY', 'DUK', 'EXC', 'FOX', 'MON', 'PYPL', 'TWX'],
 3: ['ABT',
  'AMGN',
  'AMZN',
  'CVS',
  'FB',
  'GILD',
  'KHC',
  'LLY',
  'MDLZ',
  'NKE',
  'PCLN',
  'SPG',
  'TGT'],
 4: ['CELG', 'FOXA', 'GM', 'MET', 'MS', 'QCOM'],
 5: ['AGN', 'BIIB'],
 6: ['WBA'],
 7: ['BK',
  'C',
  'CAT',
  'COF',
  'DD',
  'DOW',
  'EMR',
  'F',
  'GS',
  'OXY',
  'SLB',
  'UNP',
  'WFC']}

## Answer to Question 6

**Question**: How many clusters do you observe?

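**Answer**: 8 clusters. Note that this is the `n_clusters` value `KMeans` was fitted with (8 is scikit-learn's default), not a number inferred from the data.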

**Question**: What are the stocks within each cluster?

**Answer**
0. ['AAPL',
  'ACN',
  'AIG',
  'ALL',
  'AXP',
  'BA',
  'BLK',
  'BRK',
  'CL',
  'CMCSA',
  'COST',
  'CSCO',
  'CVX',
  'DHR',
  'DIS',
  'FDX',
  'GD',
  'GE',
  'GOOG',
  'GOOGL',
  'HD',
  'HON',
  'IBM',
  'INTC',
  'JNJ',
  'JPM',
  'KO',
  'LOW',
  'MA',
  'MCD',
  'MDT',
  'MMM',
  'MO',
  'MRK',
  'MSFT',
  'NEE',
  'ORCL',
  'PEP',
  'PFE',
  'PG',
  'PM',
  'RTN',
  'SBUX',
  'SO',
  'T',
  'TXN',
  'UNH',
  'UPS',
  'USB',
  'UTX',
  'V',
  'VZ',
  'WMT',
  'XOM']
1. ['COP', 'HAL', 'KMI']
2. ['ABBV', 'BMY', 'DUK', 'EXC', 'FOX', 'MON', 'PYPL', 'TWX']
3. ['ABT',
  'AMGN',
  'AMZN',
  'CVS',
  'FB',
  'GILD',
  'KHC',
  'LLY',
  'MDLZ',
  'NKE',
  'PCLN',
  'SPG',
  'TGT']
4. ['CELG', 'FOXA', 'GM', 'MET', 'MS', 'QCOM']
5. ['AGN', 'BIIB']
6. ['WBA']
7. ['BK',
  'C',
  'CAT',
  'COF',
  'DD',
  'DOW',
  'EMR',
  'F',
  'GS',
  'OXY',
  'SLB',
  'UNP',
  'WFC']

**Question**: Do you observe any relationship between your cluster structure and their business sectors?

**Answer** (items 1–8 below correspond, in order, to clusters 0–7 listed above)

1. Technology/Computer/Electronic/Energy

2. Energy

3. Bank/Pharmaceutical/Media

4. E-commerce/Social Network/Food Processing/Travel/Health Care/Accessories

5. Financial Services/Automotive/Mass Media/Telecommunications/Biotechnology

6. Biotechnology/Pharmaceutical

7. Pharmaceutical/Retail

8. Financial Services/Electric/Energy/Automotive/Chemical/Equipment