In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from datetime import datetime

In [15]:
# load dataset
data_1 = pd.read_csv("prices-split-adjusted.csv", header=0)

In [16]:
data_1.head()

Unnamed: 0,date,symbol,open,close,low,high,volume
0,2016-01-05,WLTW,123.43,125.839996,122.309998,126.25,2163600.0
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0
2,2016-01-07,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0
3,2016-01-08,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0


In [18]:
data_2=pd.pivot_table(data_1, values = 'close', index=['date'], columns = 'symbol').reset_index()
data_2.head()

symbol,date,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACN,ADBE,...,XLNX,XOM,XRAY,XRX,XYL,YHOO,YUM,ZBH,ZION,ZTS
0,2010-01-04,22.389128,4.77,40.380001,30.572857,,26.629999,26.129884,42.07,37.09,...,25.379999,69.150002,35.330002,8.63,,17.1,25.226457,60.02,13.33,
1,2010-01-05,22.145923,5.31,40.139999,30.625713,,26.440001,25.918773,42.330002,37.700001,...,25.059999,69.419998,34.91,8.64,,17.23,25.140187,61.919998,13.8,
2,2010-01-06,22.06724,5.09,40.490002,30.138571,,26.190001,26.062713,42.779999,37.619999,...,24.889999,70.019997,35.139999,8.56,,17.17,24.960462,61.900002,15.0,
3,2010-01-07,22.038626,5.24,40.48,30.082857,,25.77,26.278623,42.740002,36.889999,...,24.639999,69.800003,35.599998,8.6,,16.700001,24.953272,63.32,16.68,
4,2010-01-08,22.031474,5.14,40.639999,30.282858,,26.049999,26.412967,42.57,36.689999,...,25.0,69.519997,35.599998,8.57,,16.700001,24.960462,61.990002,16.41,


# What Stocks have null values?

In [19]:
null_columns=data_2.columns[data_2.isnull().any()]
data_2[null_columns].isnull().sum()

symbol
ABBV     754
ALLE     976
CFG     1258
CHTR       1
COTY     866
CSRA    1478
DLPH     754
EVHC     909
FB       754
FBHS     754
FTV     1636
GM       222
HCA      754
HPE     1458
KHC     1384
KMI      754
KORS     754
LYB       79
MNK      868
MPC      754
NAVI    1258
NLSN     754
NWS      870
NWSA     870
PSX      754
PYPL    1384
QRVO    1258
SYF     1258
TDG      754
TRIP     754
WLTW    1511
WRK     1377
XYL      754
ZTS      775
dtype: int64

# Drop Stocks w/NaN

In [20]:
NoNaN = data_2.dropna(axis=1, thresh=1762, how="any") 

# Confirm Stocks with NaN are gone

In [21]:
null_columns=NoNaN.columns[NoNaN.isnull().any()]
NoNaN[null_columns].isnull().sum()

Series([], dtype: float64)

In [22]:
NoNaN.head()

symbol,date,A,AAL,AAP,AAPL,ABC,ABT,ACN,ADBE,ADI,...,XEL,XL,XLNX,XOM,XRAY,XRX,YHOO,YUM,ZBH,ZION
0,2010-01-04,22.389128,4.77,40.380001,30.572857,26.629999,26.129884,42.07,37.09,31.67,...,21.08,18.719999,25.379999,69.150002,35.330002,8.63,17.1,25.226457,60.02,13.33
1,2010-01-05,22.145923,5.31,40.139999,30.625713,26.440001,25.918773,42.330002,37.700001,31.620001,...,20.83,18.59,25.059999,69.419998,34.91,8.64,17.23,25.140187,61.919998,13.8
2,2010-01-06,22.06724,5.09,40.490002,30.138571,26.190001,26.062713,42.779999,37.619999,31.559999,...,20.870001,18.4,24.889999,70.019997,35.139999,8.56,17.17,24.960462,61.900002,15.0
3,2010-01-07,22.038626,5.24,40.48,30.082857,25.77,26.278623,42.740002,36.889999,31.309999,...,20.780001,18.4,24.639999,69.800003,35.599998,8.6,16.700001,24.953272,63.32,16.68
4,2010-01-08,22.031474,5.14,40.639999,30.282858,26.049999,26.412967,42.57,36.689999,31.49,...,20.790001,18.200001,25.0,69.519997,35.599998,8.57,16.700001,24.960462,61.990002,16.41


# Order the data chronologically

In [23]:
import warnings
warnings.filterwarnings("ignore")


NoNaN.sort_values('date', inplace=True)
NoNaN

NoNaN['date']=pd.to_datetime(NoNaN['date'])
NoNaN.sort_values(by='date')


symbol,date,A,AAL,AAP,AAPL,ABC,ABT,ACN,ADBE,ADI,...,XEL,XL,XLNX,XOM,XRAY,XRX,YHOO,YUM,ZBH,ZION
0,2010-01-04,22.389128,4.770000,40.380001,30.572857,26.629999,26.129884,42.070000,37.090000,31.670000,...,21.080000,18.719999,25.379999,69.150002,35.330002,8.63,17.100000,25.226457,60.020000,13.330000
1,2010-01-05,22.145923,5.310000,40.139999,30.625713,26.440001,25.918773,42.330002,37.700001,31.620001,...,20.830000,18.590000,25.059999,69.419998,34.910000,8.64,17.230000,25.140187,61.919998,13.800000
2,2010-01-06,22.067240,5.090000,40.490002,30.138571,26.190001,26.062713,42.779999,37.619999,31.559999,...,20.870001,18.400000,24.889999,70.019997,35.139999,8.56,17.170000,24.960462,61.900002,15.000000
3,2010-01-07,22.038626,5.240000,40.480000,30.082857,25.770000,26.278623,42.740002,36.889999,31.309999,...,20.780001,18.400000,24.639999,69.800003,35.599998,8.60,16.700001,24.953272,63.320000,16.680000
4,2010-01-08,22.031474,5.140000,40.639999,30.282858,26.049999,26.412967,42.570000,36.689999,31.490000,...,20.790001,18.200001,25.000000,69.519997,35.599998,8.57,16.700001,24.960462,61.990002,16.410000
5,2010-01-11,22.045780,5.040000,40.240002,30.015715,26.340000,26.547311,42.529999,36.209999,31.309999,...,20.990000,18.280001,24.740000,70.300003,36.060001,8.78,16.740000,25.391804,63.360001,16.510000
6,2010-01-12,21.781117,5.080000,39.540001,29.674286,26.520000,26.470542,42.259998,35.660000,30.010000,...,21.240000,17.360001,24.049999,69.949997,35.799999,8.85,16.680000,25.873473,62.139999,16.049999
7,2010-01-13,21.952789,5.480000,40.090000,30.092857,27.040001,26.528119,42.740002,36.279999,30.000000,...,21.570000,17.510000,24.260000,69.669998,36.299999,8.95,16.900000,25.801583,62.549999,16.350000
8,2010-01-14,22.281832,5.590000,39.560001,29.918571,27.209999,26.556907,43.119999,35.900002,29.969999,...,21.610001,17.459999,24.110001,69.680000,36.570000,8.89,17.120001,26.081955,62.730000,16.709999
9,2010-01-15,21.766810,5.500000,39.310001,29.418571,26.920000,26.624078,42.810001,35.869999,28.959999,...,21.410000,17.090000,23.520000,69.110001,35.950001,8.84,16.820000,25.643422,61.750000,16.260000


In [24]:
NoNaN.to_csv('/Users/mattleli/Documents/Data Mining/NoNaN.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/mattleli/Documents/Data Mining/NoNaN.csv'

# Full discosure here, I opened the NoNaN csv with excel and just deleted the symbol column

In [25]:
data = pd.read_csv('cleanerNoNaN.csv', header = 0, index_col='date', parse_dates=True)

FileNotFoundError: [Errno 2] File b'cleanerNoNaN.csv' does not exist: b'cleanerNoNaN.csv'

In [26]:
data.head()

NameError: name 'data' is not defined

# make a new dataframe with log returns

In [15]:
log_returns = np.log(data).diff()
log_returns.head()

daily_diffs = data.diff(periods=1, axis=0)
daily_diffs

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABC,ABT,ACN,ADBE,ADI,ADM,...,XEL,XL,XLNX,XOM,XRAY,XRX,YHOO,YUM,ZBH,ZION
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04,,,,,,,,,,,...,,,,,,,,,,
2010-01-05,-0.243205,0.540000,-0.240002,0.052856,-0.189998,-0.211111,0.260002,0.610001,-0.049999,0.170000,...,-0.250000,-0.129999,-0.320000,0.269996,-0.420002,0.01,0.130000,-0.086270,1.899998,0.470000
2010-01-06,-0.078684,-0.220000,0.350003,-0.487143,-0.250000,0.143940,0.449997,-0.080002,-0.060002,-0.080000,...,0.040001,-0.190000,-0.170000,0.599999,0.229999,-0.08,-0.060000,-0.179725,-0.019996,1.200000
2010-01-07,-0.028614,0.150000,-0.010002,-0.055714,-0.420001,0.215910,-0.039997,-0.730000,-0.250000,-0.329999,...,-0.090000,0.000000,-0.250000,-0.219994,0.459999,0.04,-0.469999,-0.007189,1.419998,1.680000
2010-01-08,-0.007152,-0.100000,0.159999,0.200001,0.279999,0.134344,-0.170002,-0.200000,0.180001,-0.390000,...,0.010000,-0.199999,0.360001,-0.280006,0.000000,-0.03,0.000000,0.007189,-1.329998,-0.270000
2010-01-11,0.014307,-0.100000,-0.399997,-0.267143,0.290001,0.134344,-0.040001,-0.480000,-0.180001,0.020001,...,0.199999,0.080000,-0.260000,0.780006,0.460003,0.21,0.039999,0.431343,1.369999,0.100000
2010-01-12,-0.264664,0.040000,-0.700001,-0.341429,0.180000,-0.076768,-0.270001,-0.549999,-1.299999,-0.330000,...,0.250000,-0.920000,-0.690001,-0.350006,-0.260002,0.07,-0.060000,0.481669,-1.220002,-0.460001
2010-01-13,0.171672,0.400000,0.549999,0.418572,0.520001,0.057577,0.480004,0.619999,-0.010000,0.129999,...,0.330000,0.149999,0.210001,-0.279999,0.500000,0.10,0.220000,-0.071890,0.410000,0.300001
2010-01-14,0.329043,0.110000,-0.529999,-0.174286,0.169998,0.028788,0.379997,-0.379997,-0.030001,0.080000,...,0.040001,-0.050001,-0.149999,0.010002,0.270001,-0.06,0.220001,0.280372,0.180001,0.359999
2010-01-15,-0.515022,-0.090000,-0.250000,-0.500000,-0.289999,0.067171,-0.309998,-0.030003,-1.010000,-0.199999,...,-0.200001,-0.369999,-0.590001,-0.569999,-0.619999,-0.05,-0.300001,-0.438533,-0.980000,-0.449999


In [31]:
######################################################################################
##note, DO NOT change this cell to your path unless you want to save a new copy of DMProject_LogReturns 
######################################################################################
log_returns.to_csv('/Users/mattleli/Documents/Data Mining/DMProject_LogReturns.csv')

# Do we get NaN's when converting to return data?

In [36]:
null_columns=log_returns.columns[log_returns.isnull().any()]
log_returns[null_columns].isnull().sum()

A       1
AAL     1
AAP     1
AAPL    1
ABC     1
ABT     1
ACN     1
ADBE    1
ADI     1
ADM     1
ADP     1
ADS     1
ADSK    1
AEE     1
AEP     1
AES     1
AET     1
AFL     1
AGN     1
AIG     1
AIV     1
AIZ     1
AJG     1
AKAM    1
ALB     1
ALK     1
ALL     1
ALXN    1
AMAT    1
AME     1
       ..
VRSK    1
VRSN    1
VRTX    1
VTR     1
VZ      1
WAT     1
WBA     1
WDC     1
WEC     1
WFC     1
WFM     1
WHR     1
WM      1
WMB     1
WMT     1
WU      1
WY      1
WYN     1
WYNN    1
XEC     1
XEL     1
XL      1
XLNX    1
XOM     1
XRAY    1
XRX     1
YHOO    1
YUM     1
ZBH     1
ZION    1
Length: 467, dtype: int64

# Perfect, our only NaN's are the first day in our data

Now, create the daily differences:

In [14]:
daily_diffs = data.diff(periods=1, axis=0)
daily_diffs

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABC,ABT,ACN,ADBE,ADI,ADM,...,XEL,XL,XLNX,XOM,XRAY,XRX,YHOO,YUM,ZBH,ZION
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04,,,,,,,,,,,...,,,,,,,,,,
2010-01-05,-0.243205,0.540000,-0.240002,0.052856,-0.189998,-0.211111,0.260002,0.610001,-0.049999,0.170000,...,-0.250000,-0.129999,-0.320000,0.269996,-0.420002,0.01,0.130000,-0.086270,1.899998,0.470000
2010-01-06,-0.078684,-0.220000,0.350003,-0.487143,-0.250000,0.143940,0.449997,-0.080002,-0.060002,-0.080000,...,0.040001,-0.190000,-0.170000,0.599999,0.229999,-0.08,-0.060000,-0.179725,-0.019996,1.200000
2010-01-07,-0.028614,0.150000,-0.010002,-0.055714,-0.420001,0.215910,-0.039997,-0.730000,-0.250000,-0.329999,...,-0.090000,0.000000,-0.250000,-0.219994,0.459999,0.04,-0.469999,-0.007189,1.419998,1.680000
2010-01-08,-0.007152,-0.100000,0.159999,0.200001,0.279999,0.134344,-0.170002,-0.200000,0.180001,-0.390000,...,0.010000,-0.199999,0.360001,-0.280006,0.000000,-0.03,0.000000,0.007189,-1.329998,-0.270000
2010-01-11,0.014307,-0.100000,-0.399997,-0.267143,0.290001,0.134344,-0.040001,-0.480000,-0.180001,0.020001,...,0.199999,0.080000,-0.260000,0.780006,0.460003,0.21,0.039999,0.431343,1.369999,0.100000
2010-01-12,-0.264664,0.040000,-0.700001,-0.341429,0.180000,-0.076768,-0.270001,-0.549999,-1.299999,-0.330000,...,0.250000,-0.920000,-0.690001,-0.350006,-0.260002,0.07,-0.060000,0.481669,-1.220002,-0.460001
2010-01-13,0.171672,0.400000,0.549999,0.418572,0.520001,0.057577,0.480004,0.619999,-0.010000,0.129999,...,0.330000,0.149999,0.210001,-0.279999,0.500000,0.10,0.220000,-0.071890,0.410000,0.300001
2010-01-14,0.329043,0.110000,-0.529999,-0.174286,0.169998,0.028788,0.379997,-0.379997,-0.030001,0.080000,...,0.040001,-0.050001,-0.149999,0.010002,0.270001,-0.06,0.220001,0.280372,0.180001,0.359999
2010-01-15,-0.515022,-0.090000,-0.250000,-0.500000,-0.289999,0.067171,-0.309998,-0.030003,-1.010000,-0.199999,...,-0.200001,-0.369999,-0.590001,-0.569999,-0.619999,-0.05,-0.300001,-0.438533,-0.980000,-0.449999


In [15]:
daily_diffs.to_csv('/Users/mattleli/Documents/Data Mining/DMProject_DailyDiffs.csv')