In [1]:
import pandas as pd
import numpy as np

# Part1: Read processed data

In [2]:
df_featured = pd.read_csv("data/df_featured.csv",index_col="Name")

In [3]:
df_featured.head()

Unnamed: 0_level_0,date,open,high,low,close,volume,MACD_12_26,MACDsign_12_26,MACDdiff_12_26,ROC_5,Momentum_3,MA_5,EMA_3
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAL,2013-02-08,15.07,15.12,14.63,14.75,8407500,,,,,,,
AAL,2013-02-11,14.89,15.01,14.26,14.46,8882000,,,,,,,14.556667
AAL,2013-02-12,14.45,14.51,14.1,14.27,8126000,,,,,,,14.392857
AAL,2013-02-13,14.3,14.94,14.25,14.66,10259500,,,,,-0.09,,14.535333
AAL,2013-02-14,14.94,14.96,13.16,13.99,31879900,,,,-0.051525,-0.47,14.426,14.253871


# Part 2: Generate fake customer data

## 2.1 Generate customer position data

In [4]:
import random

#### 2.1.1 Generate fake data: number of stocks that customer is holding

In [5]:
### randome generate number of stocks that customer is holding
num_holding = random.sample(list(range(3,10)),1)
num_holding

[4]

In [6]:
### make it into function
def sample(target,num):
    return random.sample(target,num)

In [7]:
### Generate fake data for 1000 customers
num_customer = 1000
num_stocks = []
for i in range(num_customer):
    num_stocks += sample(list(range(3,10)),1)

In [8]:
num_stocks[:10]

[8, 7, 8, 7, 5, 3, 5, 5, 3, 6]

#### 2.1.2 Generate fake data: stocks that customer is holding

In [9]:
### Extract stock list
stocksList = df_featured.index.unique().values


In [10]:
stocksList.shape

(505,)

In [11]:
### sample stocks
stocks = []
for customer,num_stock in zip(range(num_customer),num_stocks):
    sampled_stocks = sample(list(stocksList),num_stock) ### Sample num_stock of stocks from stocks list 
    stocks.append(sampled_stocks)

In [12]:
stocks

[['MOS', 'MAA', 'AMAT', 'SNA', 'KSU', 'GPN', 'RHI', 'ETR'],
 ['CNC', 'EQR', 'ADM', 'MMC', 'KHC', 'MAA', 'DHI'],
 ['GLW', 'PM', 'INTC', 'MAC', 'UA', 'DG', 'AVGO', 'CMI'],
 ['MLM', 'BLK', 'INTU', 'WDC', 'JBHT', 'MAA', 'BHF'],
 ['AEE', 'AMGN', 'RE', 'MHK', 'CL'],
 ['WBA', 'TPR', 'CINF'],
 ['MAA', 'CB', 'AWK', 'TIF', 'PNC'],
 ['CSX', 'M', 'FIS', 'ED', 'ATVI'],
 ['PHM', 'SO', 'DHI'],
 ['LH', 'XEL', 'TIF', 'NFLX', 'MCD', 'LLY'],
 ['AMGN', 'CINF', 'CMG', 'MMC', 'MAR', 'UPS', 'FOX', 'PCLN', 'AAP'],
 ['MO', 'ILMN', 'ZTS', 'MAT', 'SLB', 'MS'],
 ['COST', 'DRI', 'HES'],
 ['ZBH', 'FLR', 'AXP'],
 ['XEL', 'MCK', 'KORS', 'SRE'],
 ['M', 'ADSK', 'FBHS'],
 ['TSN', 'PSA', 'EXR'],
 ['AAPL', 'UA', 'EA', 'HST', 'DIS'],
 ['MCHP', 'MTB', 'MNST', 'COST'],
 ['ARE', 'PWR', 'CMG', 'JEC', 'ETFC', 'PM', 'FOX', 'EBAY', 'CB'],
 ['HIG', 'NWS', 'AMGN', 'LKQ', 'EW', 'NEE', 'CLX'],
 ['ORLY', 'TWX', 'SRE', 'F', 'V', 'UAA', 'REG'],
 ['SRCL', 'NRG', 'NSC', 'WM', 'PCG', 'STI'],
 ['PG', 'AMT', 'AMD', 'VRSK', 'FAST', 'JNPR', 'S

#### 2.1.3 Generate fake stock position data

In [13]:
### sample position for one user, eg user holding 3 stocks
eg_positions = []
for i in range(3):
    eg_positions.append(random.uniform(0,1))
[x/sum(eg_positions) for x in eg_positions]

[0.30383973069588055, 0.15613308996919298, 0.5400271793349264]

In [14]:
### build function to sample positions, input is number of stock holded, output is corresponding position
def sample_position(count):
    positions = []
    for i in range(count):
        positions.append(random.uniform(0,1))
    sum_val = sum(positions)
    return [x/sum_val for x in positions]

In [15]:
sample_position(3)

[0.47329540900060213, 0.44901451000022025, 0.07769008099917761]

In [16]:
### loop over every customers
positions = []
for i in num_stocks:
    sampled_positions = sample_position(i)
    positions.append(sampled_positions)

# Part 3: Save data

In [17]:
from collections import defaultdict

customer_dic = defaultdict(dict)
for customer,stock, position in zip(list(range(num_customer)),stocks,positions):
    customer_dic[customer]["stocks"] = stock
    customer_dic[customer]['positions'] = position

In [18]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [19]:
save_obj(customer_dic,"data/customer_data")

#  Part 4: Modulize 

In [20]:
class data_generator:
    from collections import defaultdict
    import pickle
    import random
    
    def __init__(self,stocksList=None,numHoldingList=None,customeSize=1000):
        self.customeSize = customeSize
        self.stocksList = stocksList
        self.numHoldingList = numHoldingList ### numHoldingList format: [3,4,5,6,7,8,9,10]
    
    def generate_num_stocks(self):
        self.num_stocks = []
        for i in range(num_customer):
            self.num_stocks += sample(self.numHoldingList,1)
            
    def generate_stocks(self):
        self.stocks = []
        for customer,num_stock in zip(range(self.customeSize),self.num_stocks):
            sampled_stocks = sample(list(self.stocksList),num_stock) ### Sample num_stock of stocks from stocks list 
            self.stocks.append(sampled_stocks)
            
    def generate_positions(self):
        self.positions = []
        for i in self.num_stocks:
            sampled_positions = self.sample_position(i)
            self.positions.append(sampled_positions)
            
    def sample_position(self,count):
        positions = []
        for i in range(count):
            positions.append(random.uniform(0,1))
        sum_val = sum(positions)
        return [x/sum_val for x in positions]
    
    def save_as_dict(self):

        self.customer_dic = defaultdict(dict)
        for customer,stock, position in zip(list(range(self.customeSize)),self.stocks,self.positions):
            self.customer_dic[customer]["stocks"] = stock
            self.customer_dic[customer]['positions'] = position
        return self.customer_dic
    
    def generate_data(self):
        self.generate_num_stocks()
        self.generate_stocks()
        self.generate_positions()
        customerDic = self.save_as_dict()
        return customer_dic

In [21]:
generator = data_generator(customeSize=1000,stocksList=stocksList,numHoldingList=[3,4,5,6,7,8,9,10]) 
customer_data = generator.generate_data()

In [22]:
customer_data

defaultdict(dict,
            {0: {'stocks': ['MOS',
               'MAA',
               'AMAT',
               'SNA',
               'KSU',
               'GPN',
               'RHI',
               'ETR'],
              'positions': [0.047695681158954874,
               0.23052107203572847,
               0.07297433012303643,
               0.04829579452243989,
               0.20374682906034589,
               0.03215473031572561,
               0.07179900329030696,
               0.2928125594934619]},
             1: {'stocks': ['CNC', 'EQR', 'ADM', 'MMC', 'KHC', 'MAA', 'DHI'],
              'positions': [0.21431997264819516,
               0.09382397891121017,
               0.06157383134363061,
               0.12668594057588903,
               0.1451204820580555,
               0.150490801311365,
               0.2079849931516544]},
             2: {'stocks': ['GLW',
               'PM',
               'INTC',
               'MAC',
               'UA',
               'DG',
    