# <center> IEOR 169 Final Project: Data Generation and Exploration </center>
# <center> Team: Chris Landgrebe, Calvin Suster, Wyatt Walsh </center>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Generation" data-toc-modified-id="Data-Generation-1">Data Generation</a></span><ul class="toc-item"><li><span><a href="#Step-1:-Imports-and-Environment-Considerations" data-toc-modified-id="Step-1:-Imports-and-Environment-Considerations-1.1">Step 1: Imports and Environment Considerations</a></span></li><li><span><a href="#Step-2:-Define-Generator-Function" data-toc-modified-id="Step-2:-Define-Generator-Function-1.2">Step 2: Define Generator Function</a></span></li><li><span><a href="#Step-3:-Define-Set-of-Person/Object-Combinations-and-Apply-Generator" data-toc-modified-id="Step-3:-Define-Set-of-Person/Object-Combinations-and-Apply-Generator-1.3">Step 3: Define Set of Person/Object Combinations and Apply Generator</a></span></li><li><span><a href="#Step-1:-Load-in-Data-and-View" data-toc-modified-id="Step-1:-Load-in-Data-and-View-1.4">Step 1: Load in Data and View</a></span></li></ul></li></ul></div>

## Data Generation

### Step 1: Imports and Environment Considerations

In [2]:
# import necessary libraries
import numpy as np
import pandas as pd
import os
from IPython.display import display

### Step 2: Define Generator Function

In [5]:
def synthesize(sizes, out_dir='./data/generated/', rng=None):
    '''Generate one random AMPL ``.dat`` file for a (people, items) size pair.

    A ``numPeople x numItems`` matrix of Uniform[0,1] draws is built, each row
    is divided by its own sum (so every row sums to 1), and the result is
    written as ``<out_dir>/<fileNum>.dat`` containing the sets ``P`` and ``I``
    and the parameter table ``v``.

    Parameters
    ----------
    sizes : sequence of three integers
        ``(fileNum, numPeople, numItems)``, in that order. Typically one row of
        the sizes dataframe, as passed by ``DataFrame.apply(..., axis=1)``; a
        plain list or tuple works as well.
    out_dir : str, optional
        Directory the file is written to; created if it does not exist.
    rng : optional
        Object exposing ``uniform(low, high, size)`` (for example a
        ``numpy.random.Generator``) used for reproducible draws. When ``None``
        the global ``np.random`` state is used.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``numPeople`` or ``numItems`` is smaller than 1.
    '''
    # unpack by iteration rather than sizes[0] / sizes[1] / sizes[2]: integer
    # keys on a label-indexed Series rely on a deprecated positional fallback
    file_num, n_people, n_items = (int(v) for v in sizes)
    if n_people < 1 or n_items < 1:
        raise ValueError('numPeople and numItems must both be at least 1')

    # create r.v. matrix of size numPeople by numItems then normalize
    ## normalization in this case takes the form of dividing rows by their respective sums
    sampler = np.random if rng is None else rng
    grid = sampler.uniform(0, 1, (n_people, n_items))
    normed = grid / grid.sum(axis=1, keepdims=True)

    # first column holds the person ids, starting from 1 as the set P does
    table = np.column_stack((np.arange(1, n_people + 1), normed))

    # generate the sets for people and objects given their size
    people = ' '.join(str(i) for i in range(1, n_people + 1))
    items = ' '.join(str(i) for i in range(1, n_items + 1))
    # since np.savetxt will be used, bundle all other information in the header
    header = 'data; \nset P := %s; \n' % people
    header += 'set I := %s;' % items
    header += '\nparam v : '
    header += items + ':='

    # integer format for the person id, float format for every item value
    types = ['%i'] + ['%f'] * n_items

    # the file number is used for file naming; make sure the target folder exists
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    name = os.path.join(out_dir, str(file_num) + '.dat')

    # save values space separated under header
    np.savetxt(name, table, fmt=types, header=header, comments='')

    # add final semicolon; the context manager closes the file even on error
    with open(name, 'a') as file:
        file.write('\n ;')

### Step 3: Define Set of Person/Object Combinations and Apply Generator

In [6]:
# define range of values to combine
## steps of 5 up to 95, steps of 50 from 100 to 200, steps of 250 from 250 to 1000,
## steps of 1000 from 2000 to 5000, and finally 10000
values = [*range(5, 100,5), *range(100,250,50), *range(250, 1001, 250), *range(2000, 5001, 1000), 10000]

# create df from meshed value ranges.
## must take transpose of meshgrid and reshape so that every row holds one pair of values
sizes = pd.DataFrame(np.array(np.meshgrid(values,values)).T.reshape(-1,2))

# remove any rows where there are more people than objects since min p will always be 1
sizes = sizes.loc[sizes[0] <= sizes[1],:].reset_index(drop=True)
# also remove any rows with more than 5 objects per person
sizes = sizes.loc[sizes[1] <= 5 * sizes[0]].reset_index(drop=True).reset_index()
columns = ['fileNum', 'numPeople', 'numItems']
sizes.columns = columns

# add 1 to index so that file numbers start at 1 and display resultant df
sizes['fileNum'] = sizes['fileNum'] + 1
display(sizes)

# apply generator function to remaining rows
# !!! this call rewrites the .dat file of every row with fresh random values;
# !!! comment it out when the generated data already exists and must be kept
sizes.apply(synthesize, axis = 1);  # trailing ';' hides the Series of None results

Unnamed: 0,fileNum,numPeople,numItems
0,1,5,5
1,2,5,10
2,3,5,15
3,4,5,20
4,5,5,25
...,...,...,...
257,258,4000,5000
258,259,4000,10000
259,260,5000,5000
260,261,5000,10000


0      None
1      None
2      None
3      None
4      None
       ... 
257    None
258    None
259    None
260    None
261    None
Length: 262, dtype: object

### Step 1: Load in Data and View

In [7]:



# Scratch check: add up fifteen hand-copied values; the recorded cell output is 1.0.
# NOTE(review): presumably these are the entries of one row of a generated
# (row-normalized) value matrix, confirming that the row sums to 1 - verify the source.
0.062616 + 0.046739+ 0.105262 +0.088495 +0.083414 +0.129013+ 0.034772+ 0.040384 +0.120307+ 0.038414+ 0.103403 +0.065507+ 0.008113+ 0.000052 +0.073509



1.0

In [None]:
# read the first file from the output folder and show it as the cell result
df_1 = pd.read_csv('./data/output/1.txt')
df_1