In [1]:
import pandas as pd
import numpy as np
import os
from os.path import isfile, isdir, join, splitext
import glob
import shutil
from collections import Counter
import datetime
from process_raw_prices import get_filename_without_ext

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
def my_read_csv(p):
    """Load the CSV at path ``p`` and parse its 'date' column.

    The 'date' column must hold ``YYYY-MM-DD`` strings; it is replaced, in
    its original column position, by the parsed datetime values.

    Returns the resulting DataFrame.
    """
    return pd.read_csv(p).assign(
        date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d')
    )

In [3]:
# Glob pattern matching every raw per-stock CSV file.
raw_files_path_pattern = "../../dataset/nyse-daily/*.csv"
# glob returns matches in arbitrary order; sort them so the listing is stable.
nyse_csv_paths = glob.glob(raw_files_path_pattern)
nyse_csv_paths.sort()
nyse_csv_paths

['../../dataset/nyse-daily/AAN.csv',
 '../../dataset/nyse-daily/AER.csv',
 '../../dataset/nyse-daily/AL.csv',
 '../../dataset/nyse-daily/AMBR.csv',
 '../../dataset/nyse-daily/AMN.csv',
 '../../dataset/nyse-daily/ANET.csv',
 '../../dataset/nyse-daily/APY.csv',
 '../../dataset/nyse-daily/AQ.csv',
 '../../dataset/nyse-daily/AQUA.csv',
 '../../dataset/nyse-daily/ASGN.csv',
 '../../dataset/nyse-daily/ASX.csv',
 '../../dataset/nyse-daily/ATEN.csv',
 '../../dataset/nyse-daily/ATHM.csv',
 '../../dataset/nyse-daily/ATU.csv',
 '../../dataset/nyse-daily/AUO.csv',
 '../../dataset/nyse-daily/AVLR.csv',
 '../../dataset/nyse-daily/AYR.csv',
 '../../dataset/nyse-daily/AYX.csv',
 '../../dataset/nyse-daily/BB.csv',
 '../../dataset/nyse-daily/BHE.csv',
 '../../dataset/nyse-daily/BITA.csv',
 '../../dataset/nyse-daily/BKI.csv',
 '../../dataset/nyse-daily/BOX.csv',
 '../../dataset/nyse-daily/CACI.csv',
 '../../dataset/nyse-daily/CAI.csv',
 '../../dataset/nyse-daily/CARS.csv',
 '../../dataset/nyse-daily/CDAY

In [4]:
# Row count of every raw CSV, in the same order as nyse_csv_paths.
lengths = [pd.read_csv(csv_path).shape[0] for csv_path in nyse_csv_paths]
lengths

[2446,
 2516,
 1940,
 1207,
 2517,
 1154,
 174,
 293,
 294,
 2517,
 180,
 1207,
 1275,
 2516,
 2516,
 140,
 2514,
 449,
 309,
 2514,
 2044,
 324,
 995,
 2514,
 2515,
 411,
 175,
 381,
 425,
 2517,
 2516,
 1174,
 2516,
 1212,
 67,
 2516,
 2516,
 1419,
 111,
 1915,
 2514,
 2073,
 455,
 73,
 1415,
 2517,
 1940,
 1738,
 62,
 2515,
 1444,
 805,
 2514,
 894,
 834,
 948,
 415,
 2515,
 1125,
 238,
 1748,
 2517,
 1202,
 228,
 809,
 2515,
 1067,
 164,
 2514,
 2517,
 790,
 1994,
 2514,
 2047,
 2514,
 2514,
 2514,
 2174,
 2309,
 284,
 2514,
 2516,
 1326,
 1113,
 2517,
 624,
 1392,
 2516,
 889,
 348,
 1361,
 1460,
 2515,
 1966,
 2210,
 1022,
 2514,
 1639,
 1992,
 1968,
 182,
 2514,
 874,
 2517,
 239,
 1625,
 1190,
 2516,
 57,
 2517,
 2516,
 155,
 817,
 179,
 2517,
 1208,
 64,
 1931,
 2516,
 2516,
 1327,
 2446,
 1199,
 1703,
 1336,
 283,
 2514,
 285,
 2087,
 913,
 174,
 2503,
 465,
 2514,
 289,
 2515,
 2514,
 2515,
 786,
 1943,
 2514,
 1567,
 2514,
 313,
 52,
 2514,
 2516,
 2516,
 2516,
 1391,
 2516

In [5]:
# Largest row count among the raw CSV files.
max(lengths)

2517

In [6]:
# Tally how many raw files share each row count.
Counter(lengths)

Counter({52: 1,
         57: 1,
         62: 1,
         64: 1,
         67: 1,
         73: 1,
         111: 1,
         140: 1,
         155: 1,
         164: 1,
         174: 2,
         175: 1,
         179: 1,
         180: 1,
         182: 1,
         185: 1,
         228: 1,
         238: 1,
         239: 1,
         278: 1,
         283: 1,
         284: 1,
         285: 1,
         289: 1,
         293: 1,
         294: 1,
         309: 1,
         313: 1,
         324: 1,
         348: 1,
         381: 1,
         411: 1,
         415: 1,
         425: 1,
         435: 1,
         449: 1,
         455: 1,
         465: 1,
         624: 1,
         638: 1,
         726: 1,
         786: 1,
         790: 1,
         805: 1,
         809: 1,
         817: 1,
         834: 1,
         874: 1,
         889: 1,
         894: 1,
         913: 1,
         948: 1,
         995: 1,
         1022: 2,
         1067: 1,
         1113: 1,
         1125: 1,
         1154: 1,
         1169: 

In [7]:
# Keep only the rows dated on or after start_date and write one trimmed CSV
# per stock into dest_folder.
start_date = datetime.date(2015, 1, 1)
dest_folder = "../../dataset/nyse-daily-trimmed/"
# Start from an empty destination so files from a previous run cannot linger.
shutil.rmtree(dest_folder, ignore_errors=True)
os.makedirs(dest_folder, exist_ok=True)

# my_read_csv yields a datetime64 'date' column. Ordering comparisons between
# datetime64 data and a plain datetime.date are deprecated / rejected by
# pandas, so convert the cutoff to a Timestamp once, up front.
start_timestamp = pd.Timestamp(start_date)

for p in nyse_csv_paths:
    filename = get_filename_without_ext(p)
    df = my_read_csv(p)
    df = df[df['date'] >= start_timestamp]
    df.to_csv(path_or_buf=join(dest_folder, filename + '.csv'), index=False)

In [8]:
# Glob pattern matching every trimmed per-stock CSV file.
trimmed_files_path_pattern = "../../dataset/nyse-daily-trimmed/*.csv"
# Sort the matches so downstream cells see a stable file order.
nyse_trimmed_csv_paths = glob.glob(trimmed_files_path_pattern)
nyse_trimmed_csv_paths.sort()

In [9]:
# Bucket the trimmed CSV paths by the number of rows each file holds:
# {row count: [paths with that row count, in sorted-path order]}.
lengths_to_p = {}
for p in nyse_trimmed_csv_paths:
    row_count = pd.read_csv(p).shape[0]
    lengths_to_p.setdefault(row_count, []).append(p)

lengths_to_p

{52: ['../../dataset/nyse-daily-trimmed/SWI.csv'],
 57: ['../../dataset/nyse-daily-trimmed/PLAN.csv'],
 62: ['../../dataset/nyse-daily-trimmed/ESTC.csv'],
 64: ['../../dataset/nyse-daily-trimmed/RAMP.csv'],
 67: ['../../dataset/nyse-daily-trimmed/CTK.csv'],
 73: ['../../dataset/nyse-daily-trimmed/EB.csv'],
 111: ['../../dataset/nyse-daily-trimmed/DAVA.csv'],
 140: ['../../dataset/nyse-daily-trimmed/AVLR.csv'],
 155: ['../../dataset/nyse-daily-trimmed/PRSP.csv'],
 164: ['../../dataset/nyse-daily-trimmed/HUYA.csv'],
 174: ['../../dataset/nyse-daily-trimmed/APY.csv',
  '../../dataset/nyse-daily-trimmed/SMAR.csv'],
 175: ['../../dataset/nyse-daily-trimmed/CDAY.csv'],
 179: ['../../dataset/nyse-daily-trimmed/PVTL.csv'],
 180: ['../../dataset/nyse-daily-trimmed/ASX.csv'],
 182: ['../../dataset/nyse-daily-trimmed/NVT.csv'],
 185: ['../../dataset/nyse-daily-trimmed/ZUO.csv'],
 228: ['../../dataset/nyse-daily-trimmed/HMI.csv'],
 238: ['../../dataset/nyse-daily-trimmed/GTES.csv'],
 239: ['../../

In [10]:
# lengths_to_p is keyed by row count, so max() over the dict yields the
# largest row count found among the trimmed files.
max_length = max(lengths_to_p)

In [11]:
# find intersection of the max length group of stocks
#
# Seed the intersection from a member of the max-length group itself. Seeding
# from an arbitrary file (e.g. the first trimmed path) could shrink the result
# with dates that file lacks, even if the whole group agrees on its dates.
max_length_paths = lengths_to_p[max_length]
intersection = my_read_csv(max_length_paths[0])['date'].values
for p in max_length_paths[1:]:
    df_temp = my_read_csv(p)
    # assume_unique=True: presumes each file lists a trading date at most
    # once -- TODO confirm against the raw data.
    intersection = np.intersect1d(
        intersection,
        df_temp['date'].values,
        assume_unique=True
    )

In [12]:
# Every file in the max-length group has max_length rows, so the group shares
# all of its trading dates only when the intersection still has that many.
if len(intersection) == max_length:
    print('All stocks with max length have common trading date.')

    # save those stocks in a result folder (recreated empty on each run)
    result_folder = "../../dataset/nyse-daily-trimmed-same-length/"
    shutil.rmtree(result_folder, ignore_errors=True)
    os.makedirs(result_folder, exist_ok=True)
    for p in lengths_to_p[max_length]:
        shutil.copy(p, result_folder)
    print('The processed dataset was placed in: \n'+result_folder)
else:
    # Report the failure instead of finishing silently, so a result folder
    # left over from an earlier run is not mistaken for fresh output.
    print('Stocks with max length do NOT share all trading dates '
          '({} common dates vs. {} rows); nothing was written.'.format(
              len(intersection), max_length))

All stocks with max length have common trading date.
The processed dataset was placed in: 
../../dataset/nyse-daily-trimmed-same-length/
